Example No. 1
    def test_val_split(self):
        # Using default 0.2 value...
        train, val, _ = data_split()
        self.assertEqual(len(list(train.unbatch())), 31500)
        self.assertEqual(len(list(val.unbatch())), 7875)

        # Using custom value...
        train, val, _ = data_split(split=0.4)
        self.assertEqual(len(list(train.unbatch())), 23625)
        self.assertEqual(len(list(val.unbatch())), 15750)
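The data_split implementation under test is not shown here. As a rough sketch, a version consistent with these assertions could look like the code below, assuming a tf.data pipeline and a hypothetical load_dataset() helper that yields a batched training set of 39,375 examples plus a separate test set.

def data_split(split=0.2, batch_size=32):
    # Sketch only: load_dataset() is a hypothetical placeholder, not part of the original code.
    full_train, test = load_dataset()
    examples = full_train.unbatch()
    n_val = int(sum(1 for _ in examples) * split)   # 7,875 at the default 0.2 split
    val = examples.take(n_val).batch(batch_size)
    train = examples.skip(n_val).batch(batch_size)
    return train, val, test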
Example No. 2
def run_dmrg(config):
    np.random.seed(config['random_seed'])
    log_directory = config['log_directory']
    data_path = config['data_path']
    test_fraction = config['test_fraction']
    max_sweeps = config['max_sweeps']
    patience = config['patience']
    num_sites = config['num_sites']
    bond_dimension = config['bond_dimension']
    ix_to_char = config.get('ix_to_char')
    logger, save_name = get_logger()
    logger.info(config)
    tf_logger = Logger('tensorboard/{}'.format(save_name))

    text = process_text(data_path, lower=True, remove_punctuation=False)
    char_to_ix, ix_to_char = encodings(text, ix_to_char)
    site_dimension = len(char_to_ix)
    numeric = prepare_numeric(text, char_to_ix)
    logger.info("Data has {} characters, {} unique.".format(
        len(text), len(char_to_ix)))
    train_batch, cv_batch, test_batch = data_split(numeric, num_sites,
                                                   test_fraction)

    mps = random_gauged_mps(num_sites, site_dimension, bond_dimension)

    context = {'config': config, 'step': 0}

    stats_history = [mps_stats(mps, train_batch, cv_batch, test_batch)]
    sweep, cv_bumps = 1, 0

    # Sweep until max_sweeps is reached or validation stops improving (early stopping)
    while sweep <= max_sweeps and cv_bumps <= patience:
        dmrg_sweep(mps, train_batch, context)
        stats = mps_stats(mps, train_batch, cv_batch, test_batch)
        stats_history.append(stats)
        log_sweep(logger, tf_logger, stats, sweep)
        save_path = '{}/{}/mps-after-step-{}.pickle'.format(
            log_directory, save_name, sweep)
        data_to_save = {
            'config': config,
            'mps': mps,
            'ix_to_char': ix_to_char,
            'save_name': save_name,
            'sweep': sweep
        }
        save_object(data_to_save, save_path)
        logger.info("saved mps to: {}".format(data_path))

        if config['generate_samples']:
            samples_per_sweep = config['samples_per_sweep']
            samples_txt = list(
                generate_samples(mps, ix_to_char, samples_per_sweep))

            for phrase in samples_txt:
                logger.info("sample phrase: {}".format(phrase))

        sweep += 1
        cv_bumps = update_cv_bumps(cv_bumps, stats_history)

    return stats_history[-1]  # most recent stats; also defined if no sweep ran
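run_dmrg takes all of its settings from a single config dict. The sketch below lists only the keys the function actually reads; the values are illustrative assumptions, not project defaults.

config = {
    'random_seed': 42,
    'log_directory': 'logs',
    'data_path': 'data/corpus.txt',
    'test_fraction': 0.1,
    'max_sweeps': 20,
    'patience': 3,
    'num_sites': 64,
    'bond_dimension': 16,
    'ix_to_char': None,        # optional; rebuilt from the text when absent
    'generate_samples': True,
    'samples_per_sweep': 5,
}
stats = run_dmrg(config)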
Example No. 3
def train_nn(n_epochs=2, log_interval=10):
    df = pd.read_csv('./data/df_super.csv')
    train_df, val_df, test_df = data.data_split(data.create_nn_dataset(df))

    model = LSTM(output_size=64)

    model, history = train_loop(model,
                                train_df,
                                val_df,
                                n_epochs=n_epochs,
                                log_interval=log_interval)

    torch.save({'model_state_dict': model.state_dict()}, 'nn.hdf5')
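Since torch.save simply pickles the dictionary, the .hdf5 extension is only a filename choice; the checkpoint can be restored with a plain torch.load. The loading sketch below reuses the same LSTM constructor arguments as train_nn and is an assumption, not code from the project.

import torch

checkpoint = torch.load('nn.hdf5')
model = LSTM(output_size=64)               # same constructor as in train_nn
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()                               # switch to inference mode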
Example No. 4
def main(dataset,
         alpha,
         learning_rate=0.001,
         v_prior=1.0,
         batch_size=32,
         epochs=500,
         K=100,
         hidden_size=100,
         offset=-10,
         init_scale=0.1,
         seed=0):

    data = data_split(
        dataset,
        '/home/rohan/Desktop/Projects/AML_project/VRbound/BayesianNN/data/',
        seed, 0.1)

    input_size, output_size = data.shape()

    if dataset in ('protein', 'year'):
        hidden_size = 100
        K = 10
    else:
        hidden_size = 50
        K = 100

    model = Model(input_size, hidden_size, output_size, K, alpha, v_prior,
                  init_scale, offset, seed)
    model.initialise_q()
    start_time = time.time()
    output = model.fit_q(data, batch_size, epochs, learning_rate)
    running_time = time.time() - start_time

    params = [dataset, alpha, seed]
    path = ""
    for i in params:
        path += str(i) + "_"
    save_file = './Results/' + dataset + '/' + path + '.npy'
    np.save(save_file, output)

    print('Dataset: {} Alpha: {} seed: {} RMSE: {} NLL: {} running_time: {}'.format(
        dataset, alpha, seed, output[0], -output[1], running_time))
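An illustrative invocation follows; the dataset name and alpha value are assumptions, and the call only works if the hard-coded data directory above exists.

# Hypothetical call for illustration; 'protein' and alpha=0.5 are assumptions.
main('protein', alpha=0.5, seed=0)
# writes its output array to ./Results/protein/protein_0.5_0_.npy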
Example No. 5
# Experiment setup
# experiment_parameters = experiment_setup.parse_args_vae(experiment_parameters)
model_dir = join_path("./model", experiment_parameters["model_dir"])
experiment_setup.make_dir(model_dir)

# HyperParams
batch_size = 512
layers = utils.get_layer_sizes(experiment_parameters)
label_thresh = 1  # include only a subset of MNIST classes
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Data
x_train, y_train, N, x_test, y_test, N_test = data.load_data(
    experiment_parameters, root="./data")
train_loader, test_loader = data.data_split(x_train, x_test, batch_size)

# Fit mean network
if experiment_parameters["dataset"] == "bodies":
    model = model.VAE_bodies(
        x_train,
        layers,
        num_components=experiment_parameters["num_components"],
        device=device)
else:
    model = model.VAE(x_train,
                      layers,
                      num_components=experiment_parameters["num_components"],
                      device=device)
model.fit_mean(train_loader, num_epochs=5, num_cycles=1, max_kl=1)
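This snippet reads only a few keys from experiment_parameters directly; a minimal sketch of that dict is shown below with illustrative values. utils.get_layer_sizes and data.load_data may require additional keys not listed here.

experiment_parameters = {
    "model_dir": "mnist_vae",     # illustrative value
    "dataset": "mnist",           # anything other than "bodies" selects model.VAE
    "num_components": 10,         # illustrative value
}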
Example No. 6
def data_kmer(rbp_name,
              n_bases=10,
              kmer=6,
              pos_features=POS_FEATURES,
              valid_chr=[1, 3],
              test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data.data_split(rbp_name + "_extended",
                                                  valid_chr, test_chr)

    # merge train and valid
    dt_train = pd.concat([dt_train, dt_valid])
    del dt_valid

    seq_train = kmer_count(dt_train.seq.tolist(), kmer)
    seq_test = kmer_count(dt_test.seq.tolist(), kmer)

    # y
    y_train = dt_train.binding_site.to_numpy().astype("float")
    y_test = dt_test.binding_site.to_numpy().astype("float")

    if n_bases is not None:
        # impute missing values (not part of the pipeline as the Imputer lacks inverse_transform method)
        imp = Imputer(strategy="median")
        imp.fit(dt_train[pos_features])
        dt_train[pos_features] = imp.transform(dt_train[pos_features])
        dt_test[pos_features] = imp.transform(dt_test[pos_features])

        preproc_pipeline = make_pipeline(
            FunctionTransformer(func=data.sign_log_func,
                                inverse_func=data.sign_log_func_inverse))

        # positions
        dtx_train = np.array(dt_train[pos_features])
        dtx_test = np.array(dt_test[pos_features])

        # transform pos features
        preproc_pipeline.fit(dtx_train)
        train_pos = preproc_pipeline.transform(dtx_train)
        test_pos = preproc_pipeline.transform(dtx_test)

        st = EncodeSplines(n_bases=n_bases)

        st.fit(train_pos)

        x_pos_bs_train = st.transform(train_pos)
        x_pos_bs_test = st.transform(test_pos)
        x_pos_bs_train = x_pos_bs_train.reshape((x_pos_bs_train.shape[0], -1))
        x_pos_bs_test = x_pos_bs_test.reshape((x_pos_bs_test.shape[0], -1))

        x_train = np.concatenate([seq_train, x_pos_bs_train], axis=1)
        x_test = np.concatenate([seq_test, x_pos_bs_test], axis=1)
    else:
        st = None
        preproc_pipeline = None
        x_train, y_train = np.array(seq_train), np.array(y_train)
        x_test, y_test = np.array(seq_test), np.array(y_test)

    # min-max scale everything
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    return (x_train, y_train, pos_features, preproc_pipeline), (x_test, y_test)
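A usage sketch follows; the RBP name "UPF1" and the logistic-regression classifier are assumptions chosen for illustration, not part of the original pipeline.

from sklearn.linear_model import LogisticRegression

(x_train, y_train, pos_features, pipeline), (x_test, y_test) = data_kmer("UPF1")
clf = LogisticRegression(max_iter=1000)
clf.fit(x_train, y_train)
print("test accuracy:", clf.score(x_test, y_test))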
Example No. 7
def main():

    log.info('Starting program..')

    log.info('Current dir ' + os.getcwd())

    log.info('Seed = {}'.format(seed))

    download_status = False
    downloading_cycles = 0
    try:
        df = pd.read_csv('/src/fragments3-lite-train.csv')
    except Exception as e:
        log.exception('Exception during reading CSV-file')
        return 1
    log.info('CSV file with data structure was read')

    # READ DATAFRAME WITH TRAIN SAMPLES AND SPLIT IT BEFORE DOWNLOADING
    # AND RECEIVING IMAGE DATA
    log.info('Splitting data into train and val subsets...')
    try:
        splitted_data = data_split(df, samples_per_class=40,
                                   split_koeffs=[1],
                                   arrays_labels=['train', 'none'],
                                   seed=seed)
    except Exception as e:
        log.exception('Error during splitting')
        return 1
    while not download_status:
        downloading_cycles += 1
        try:
            data_downloading(ip='83.149.249.48', splitted_data=splitted_data,
                             fragments_per_sample=50, seed=seed)
            download_status = True
        except Exception as e:
            log.exception('Error during data downloading')
            if downloading_cycles > 5:
                log.info("Bad connection. Downloading stops!")
                return 1
            time.sleep(600)
    try:
        X_, Y_, class_dict = get_image_data(df)
    except Exception:
        log.exception('Error during image data receiving')
        return 1

    log.info('Image data received')

    log.info('Start data processing..')
    try:
        new_class_dict = {}
        Yn_ = {}

        for key in class_dict:
            new_class_dict[class_dict[key]] = key

        num_classes = len(new_class_dict)

        for key in Y_:
            Yn_[key] = np_utils.to_categorical(Y_[key], num_classes)
    except Exception as e:
        log.exception('Data processing failed!')
        return 1
    # T-CNN(2) implementation
    log.info('Data processing was finished')
    try:
        model = load_model('/src/global_tcnn2_crio_3.h5')
    except Exception as e:
        log.exception('Exception during model loading')
        return 1

    log.info('Initializing model callbacks..')
    try:
        csv_logger = CSVLogger('/root/shared/results/tcnn_crio.log')
    except Exception as e:
        log.exception('Callbacks were not initialized')
        return 1

    log.info('Initializing completed')
    log.info('Fitting model..')
    try:
        model.fit(X_['train'], Yn_['train'],
                  batch_size=16, epochs=10,
                  verbose=1, shuffle=True,
                  callbacks=[csv_logger])
    except Exception as e:
        log.exception('Fitting failed!')
        return 1

    log.info('Success!')

    model.save("/root/shared/results/global_tcnn2_crio_3.h5")

    log.info('Model was saved!')

    log.info('FINISH')