def test_val_split(self):
    # Using default 0.2 value...
    train, val, _ = data_split()
    self.assertEqual(len(list(train.unbatch())), 31500)
    self.assertEqual(len(list(val.unbatch())), 7875)
    # Using custom value...
    train, val, _ = data_split(split=0.4)
    self.assertEqual(len(list(train.unbatch())), 23625)
    self.assertEqual(len(list(val.unbatch())), 15750)
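# For context, a minimal sketch of the data_split helper these assertions imply.
# This is an assumption, not the project's actual implementation: the numbers
# above imply a pool of 39,375 examples shared between train and val
# (39,375 * 0.2 = 7,875; 39,375 * 0.4 = 15,750).
import tensorflow as tf

TOTAL_EXAMPLES = 39375

def data_split(split=0.2, batch_size=32):
    # tf.data.Dataset.range is a stand-in for the real data loader
    dataset = tf.data.Dataset.range(TOTAL_EXAMPLES)
    n_val = int(TOTAL_EXAMPLES * split)
    val = dataset.take(n_val).batch(batch_size)
    train = dataset.skip(n_val).batch(batch_size)
    return train, val, None  # third value (test) is discarded by the test above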
def run_dmrg(config):
    np.random.seed(config['random_seed'])
    log_directory = config['log_directory']
    data_path = config['data_path']
    test_fraction = config['test_fraction']
    max_sweeps = config['max_sweeps']
    patience = config['patience']
    num_sites = config['num_sites']
    bond_dimension = config['bond_dimension']
    ix_to_char = config.get('ix_to_char')

    logger, save_name = get_logger()
    logger.info(config)
    tf_logger = Logger('tensorboard/{}'.format(save_name))

    text = process_text(data_path, lower=True, remove_punctuation=False)
    char_to_ix, ix_to_char = encodings(text, ix_to_char)
    site_dimension = len(char_to_ix)
    numeric = prepare_numeric(text, char_to_ix)
    logger.info("Data has {} characters, {} unique.".format(
        len(text), len(char_to_ix)))

    train_batch, cv_batch, test_batch = data_split(numeric, num_sites,
                                                   test_fraction)
    mps = random_gauged_mps(num_sites, site_dimension, bond_dimension)
    context = {'config': config, 'step': 0}
    stats_history = [mps_stats(mps, train_batch, cv_batch, test_batch)]

    # Sweep until the budget is exhausted or validation stops improving.
    sweep, cv_bumps = 1, 0
    while sweep <= max_sweeps and cv_bumps <= patience:
        dmrg_sweep(mps, train_batch, context)
        stats = mps_stats(mps, train_batch, cv_batch, test_batch)
        stats_history.append(stats)
        log_sweep(logger, tf_logger, stats, sweep)

        save_path = '{}/{}/mps-after-step-{}.pickle'.format(
            log_directory, save_name, sweep)
        data_to_save = {
            'config': config,
            'mps': mps,
            'ix_to_char': ix_to_char,
            'save_name': save_name,
            'sweep': sweep
        }
        save_object(data_to_save, save_path)
        logger.info("saved mps to: {}".format(save_path))

        if config['generate_samples']:
            samples_per_sweep = config['samples_per_sweep']
            samples_txt = list(
                generate_samples(mps, ix_to_char, samples_per_sweep))
            for phrase in samples_txt:
                logger.info("sample phrase: {}".format(phrase))

        sweep += 1
        cv_bumps = update_cv_bumps(cv_bumps, stats_history)

    return stats
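# A hypothetical config for run_dmrg, listing every key the function reads;
# the paths and values below are placeholders, not from the original project.
example_config = {
    'random_seed': 42,
    'log_directory': 'logs',
    'data_path': 'data/corpus.txt',
    'test_fraction': 0.1,
    'max_sweeps': 20,
    'patience': 3,
    'num_sites': 16,
    'bond_dimension': 8,
    'ix_to_char': None,        # optional; rebuilt from the text when None
    'generate_samples': True,
    'samples_per_sweep': 5,
}
# stats = run_dmrg(example_config)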
def train_nn(n_epochs=2, log_interval=10):
    df = pd.read_csv('./data/df_super.csv')
    train_df, val_df, test_df = data.data_split(data.create_nn_dataset(df))
    model = LSTM(output_size=64)
    model, history = train_loop(model, train_df, val_df,
                                n_epochs=n_epochs, log_interval=log_interval)
    # torch.save writes a pickle, despite the .hdf5 extension used here
    torch.save({'model_state_dict': model.state_dict()}, 'nn.hdf5')
def main(dataset, alpha, learning_rate=0.001, v_prior=1.0, batch_size=32,
         epochs=500, K=100, hidden_size=100, offset=-10, init_scale=0.1,
         seed=0):
    data = data_split(
        dataset,
        '/home/rohan/Desktop/Projects/AML_project/VRbound/BayesianNN/data/',
        seed, 0.1)
    input_size, output_size = data.shape()

    if dataset in ('protein', 'year'):
        hidden_size = 100
        K = 10
    else:
        hidden_size = 50
        K = 100

    model = Model(input_size, hidden_size, output_size, K, alpha, v_prior,
                  init_scale, offset, seed)
    model.initialise_q()

    start_time = time.time()
    output = model.fit_q(data, batch_size, epochs, learning_rate)
    running_time = time.time() - start_time

    params = [dataset, alpha, seed]
    path = ""
    for i in params:
        path += str(i) + "_"
    save_file = './Results/' + dataset + '/' + path + '.npy'
    np.save(save_file, output)
    print('Dataset: ' + dataset + ' Alpha: ' + str(alpha) + ' seed: ' +
          str(seed) + ' RMSE: ' + str(output[0]) + ' NLL: ' + str(-output[1]) +
          ' running_time: ' + str(running_time))
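# Side note on the membership test above: in Python, ('protein' or 'year')
# evaluates to just 'protein', so an equality test against that expression
# would never match 'year'; `dataset in ('protein', 'year')` checks both.
# A quick demonstration (hypothetical dataset names):
for name in ('protein', 'year', 'boston'):
    print(name, name == ('protein' or 'year'), name in ('protein', 'year'))
# protein True True
# year False True
# boston False False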
# Experiment setup
# experiment_parameters = experiment_setup.parse_args_vae(experiment_parameters)
model_dir = join_path("./model", experiment_parameters["model_dir"])
experiment_setup.make_dir(model_dir)

# HyperParams
batch_size = 512
layers = utils.get_layer_sizes(experiment_parameters)
label_thresh = 1  # include only a subset of MNIST classes
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Data
x_train, y_train, N, x_test, y_test, N_test = data.load_data(
    experiment_parameters, root="./data")
train_loader, test_loader = data.data_split(x_train, x_test, batch_size)

# Fit mean network
# (note: this rebinds the name `model` from the module to the instance)
if experiment_parameters["dataset"] == "bodies":
    model = model.VAE_bodies(
        x_train,
        layers,
        num_components=experiment_parameters["num_components"],
        device=device)
else:
    model = model.VAE(
        x_train,
        layers,
        num_components=experiment_parameters["num_components"],
        device=device)
model.fit_mean(train_loader, num_epochs=5, num_cycles=1, max_kl=1)
def data_kmer(rbp_name, n_bases=10, kmer=6, pos_features=POS_FEATURES,
              valid_chr=[1, 3], test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data.data_split(rbp_name + "_extended",
                                                  valid_chr, test_chr)
    # merge train and valid
    dt_train = pd.concat([dt_train, dt_valid])
    del dt_valid

    seq_train = kmer_count(dt_train.seq.tolist(), kmer)
    seq_test = kmer_count(dt_test.seq.tolist(), kmer)

    # y (.values rather than the long-removed DataFrame.as_matrix())
    y_train = dt_train.binding_site.values.reshape(
        (-1, 1)).astype("float")[:, 0]
    y_test = dt_test.binding_site.values.reshape(
        (-1, 1)).astype("float")[:, 0]

    if n_bases is not None:
        # impute missing values (not part of the pipeline as the Imputer
        # lacks an inverse_transform method)
        imp = Imputer(strategy="median")
        imp.fit(dt_train[pos_features])
        dt_train[pos_features] = imp.transform(dt_train[pos_features])
        dt_test[pos_features] = imp.transform(dt_test[pos_features])

        preproc_pipeline = make_pipeline(
            FunctionTransformer(func=data.sign_log_func,
                                inverse_func=data.sign_log_func_inverse))

        # positions
        dtx_train = np.array(dt_train[pos_features])
        dtx_test = np.array(dt_test[pos_features])

        # transform pos features
        preproc_pipeline.fit(dtx_train)
        train_pos = preproc_pipeline.transform(dtx_train)
        test_pos = preproc_pipeline.transform(dtx_test)

        st = EncodeSplines(n_bases=n_bases)
        st.fit(train_pos)
        x_pos_bs_train = st.transform(train_pos)
        x_pos_bs_test = st.transform(test_pos)
        x_pos_bs_train = x_pos_bs_train.reshape((x_pos_bs_train.shape[0], -1))
        x_pos_bs_test = x_pos_bs_test.reshape((x_pos_bs_test.shape[0], -1))

        x_train = np.concatenate([seq_train, x_pos_bs_train], axis=1)
        x_test = np.concatenate([seq_test, x_pos_bs_test], axis=1)
    else:
        st = None
        preproc_pipeline = None
        (x_train, y_train), (x_test, y_test) = \
            (np.array(seq_train), np.array(y_train)), \
            (np.array(seq_test), np.array(y_test))

    # min-max scale everything
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    return (x_train, y_train, pos_features, preproc_pipeline), (x_test, y_test)
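# A hypothetical call to data_kmer; "UPF1" is a placeholder RBP name and must
# match a dataset that data.data_split knows about.
# (x_train, y_train, pos_features, pipeline), (x_test, y_test) = \
#     data_kmer("UPF1", n_bases=10, kmer=6)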
def main():
    log.info('Starting program..')
    log.info('Current dir ' + os.getcwd())
    log.info('Seed = {}'.format(seed))

    download_status = False
    downloading_cycles = 0

    try:
        df = pd.read_csv('/src/fragments3-lite-train.csv')
    except Exception:
        log.exception('Exception during reading CSV-file')
        return 1
    log.info('CSV-file with data structure was read')

    # READ DATAFRAME WITH TRAIN SAMPLES AND SPLIT IT BEFORE DOWNLOADING
    # AND RECEIVING IMAGE DATA
    log.info('Splitting data into train and val subsets...')
    try:
        splitted_data = data_split(df,
                                   samples_per_class=40,
                                   split_koeffs=[1],
                                   arrays_labels=['train', 'none'],
                                   seed=seed)
    except Exception:
        log.exception('Error during splitting')
        return 1

    # Retry the download a few times before giving up on a bad connection.
    while not download_status:
        downloading_cycles += 1
        try:
            data_downloading(ip='83.149.249.48',
                             splitted_data=splitted_data,
                             fragments_per_sample=50,
                             seed=seed)
            download_status = True
        except Exception:
            log.exception('Error during data downloading')
            if downloading_cycles > 5:
                log.info("Bad connection. Downloading stopped!")
                return 1
            time.sleep(600)

    try:
        X_, Y_, class_dict = get_image_data(df)
    except Exception:
        log.exception('Error during image data receiving')
        return 1
    log.info('Image data received')

    log.info('Starting data processing..')
    try:
        new_class_dict = {}
        Yn_ = {}
        for key in class_dict:
            new_class_dict[class_dict[key]] = key
        num_classes = len(new_class_dict)
        for key in Y_:
            Yn_[key] = np_utils.to_categorical(Y_[key], num_classes)
    except Exception:
        log.exception('Data processing failed!')
        return 1

    # T-CNN(2) implementation
    log.info('Data processing finished')
    try:
        model = load_model('/src/global_tcnn2_crio_3.h5')
    except Exception:
        log.exception('Exception during model loading')
        return 1

    log.info('Initializing model callbacks..')
    try:
        csv_logger = CSVLogger('/root/shared/results/tcnn_crio.log')
    except Exception:
        log.exception('Callbacks were not initialized')
        return 1
    log.info('Initialization completed')

    log.info('Fitting model..')
    try:
        model.fit(X_['train'], Yn_['train'],
                  batch_size=16,
                  epochs=10,
                  verbose=1,
                  shuffle=True,
                  callbacks=[csv_logger])
    except Exception:
        log.exception('Fitting failed!')
        return 1
    log.info('Success!')

    model.save("/root/shared/results/global_tcnn2_crio_3.h5")
    log.info('Model saved!')
    log.info('FINISH')