Example #1
# Shared module-level imports assumed by every example in this section:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import keras
import tensorflow as tf
from keras import backend as K
from keras.models import load_model
from keras.utils import get_custom_objects

import gfp_gp
import losses
import optimization_algs
import util
from losses import neg_log_likelihood
from util import build_vae


def make_oracle_paired_plots():
    """Makes paired scatter plots comparing oracle predictions to ground truth GFP values."""
    for it in range(3):
        fig = plt.figure(figsize=(4, 3))
        TRAIN_SIZE = 5000
        train_size_str = "%ik" % (TRAIN_SIZE / 1000)
        RANDOM_STATE = it + 1

        loss = neg_log_likelihood
        get_custom_objects().update({"neg_log_likelihood": loss})

        df = pd.read_csv('../data/gfp_data.csv')
        X_all, _ = util.get_gfp_X_y_aa(df, large_only=True, ignore_stops=True)
        y_all = np.load("../data/gfp_gt_evals.npy")
        perc = np.percentile(y_all, 20)
        above_idx = np.where(y_all > perc)[0]
        X_above, y_above = X_all[above_idx], y_all[above_idx]

        X_train, y_train, gt_train, X_test, y_test, gt_test = util.get_experimental_X_y(
            random_state=RANDOM_STATE, train_size=TRAIN_SIZE, return_test=True)
        num_models = [1, 5, 20][it]
        oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models,
                                       RANDOM_STATE)
        oracles = [
            load_model("../models/oracle_%i%s.h5" % (i, oracle_suffix))
            for i in range(num_models)
        ]

        y_pred, _ = util.get_balaji_predictions(oracles, X_test)
        y_pred_above, _ = util.get_balaji_predictions(oracles, X_above)

        plt.scatter(y_test, y_pred, s=1, label="$< 20^{th}$ percentile")
        plt.scatter(y_above,
                    y_pred_above,
                    s=1,
                    label="$\geq 20^{th}$ percentile")
        plt.plot((2.9, 3.5), (2.9, 3.5), c='k', ls='--')
        plt.ylim([2.95, 3.5])
        plt.xlim([2.95, 3.5])
        plt.xlabel("Ground Truth Values")
        plt.ylabel("Mean Oracle Predictions")
        plt.legend(markerscale=3)
        plt.grid(True)
        plt.gca().set_axisbelow(True)
        plt.gca().grid(color='gray', alpha=0.2)
        plt.gca().spines['right'].set_visible(False)
        plt.gca().spines['top'].set_visible(False)
        plt.gca().yaxis.set_ticks_position('left')
        plt.gca().xaxis.set_ticks_position('bottom')
        plt.tight_layout()
        plt.savefig("../plots/paired_plot_%i.png" % (it + 1), dpi=500)
        plt.show()
        plt.close()
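util.get_balaji_predictions is defined elsewhere in the repo; its name points at the deep-ensemble aggregation of Lakshminarayanan ("Balaji") et al. (2017), in which each oracle predicts a mean and variance and the ensemble is pooled as an equally weighted Gaussian mixture. The helper below is a hypothetical stand-in under that assumption (in particular, the two-column [mean, variance] output layout is assumed, not taken from the repo):

def ensemble_mean_var_sketch(oracles, X):
    mus, vars_ = [], []
    for oracle in oracles:
        pred = oracle.predict(X)   # assumed columns: [mean, variance]
        mus.append(pred[:, 0])
        vars_.append(pred[:, 1])
    mus, vars_ = np.array(mus), np.array(vars_)
    mean = mus.mean(axis=0)
    # mixture variance = E[var] + Var[mean]
    var = vars_.mean(axis=0) + (mus ** 2).mean(axis=0) - mean ** 2
    return mean, var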
Example #2
def train_experimental_oracles():
    """
    Trains and saves oracles on the simulated GFP data (i.e. data generated
    from the GP model
    """
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    num_models = [1, 5, 20]
    for i in range(len(num_models)):
        RANDOM_STATE = i + 1
        nm = num_models[i]
        X_train, y_train, _ = util.get_experimental_X_y(
            random_state=RANDOM_STATE, train_size=TRAIN_SIZE)
        suffix = '_%s_%i_%i' % (train_size_str, nm, RANDOM_STATE)
        train_and_save_oracles(X_train,
                               y_train,
                               batch_size=10,
                               n=nm,
                               suffix=suffix)
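train_and_save_oracles is defined elsewhere in the repo and not shown here. The sketch below is a hypothetical stand-in: a small dense network with a two-unit [mean, log-variance] head trained under a Gaussian negative log-likelihood (the role the repo's neg_log_likelihood presumably plays). The architecture, activation, epoch count, and target padding are invented for illustration; only the save path mirrors the files loaded in Examples #1 and #6.

from keras.layers import Dense, Flatten, Input
from keras.models import Model

def nll_sketch(y_true, y_pred):
    # Gaussian NLL for a two-unit [mean, log_var] output head
    mean, log_var = y_pred[:, 0], y_pred[:, 1]
    return 0.5 * (log_var + K.square(y_true[:, 0] - mean) * K.exp(-log_var))

def train_and_save_oracles_sketch(X_train, y_train, batch_size=10, n=1,
                                  suffix=''):
    for i in range(n):
        inp = Input(shape=X_train.shape[1:])
        h = Dense(50, activation='elu')(Flatten()(inp))
        model = Model(inp, Dense(2)(h))
        model.compile(optimizer='adam', loss=nll_sketch)
        # pad the scalar target so its shape matches the two-unit head
        y_padded = np.stack([y_train, np.zeros_like(y_train)], axis=-1)
        model.fit(X_train, y_padded, batch_size=batch_size, epochs=10,
                  verbose=2)
        model.save("../models/oracle_%i%s.h5" % (i, suffix))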
Example #3
def run_gomez_bombarelli(constrained=True):
    """Runs the GFP comparative tests on the Gomez-Bombarelli optimization algorithm"""
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    for it in range(3):
        RANDOM_STATE = it + 1

        X_train, _, _ = util.get_experimental_X_y(random_state=RANDOM_STATE,
                                                  train_size=TRAIN_SIZE)
        ground_truth = gfp_gp.SequenceGP(load=True,
                                         load_prefix="../data/gfp_gp")

        L = X_train.shape[1]
        LD = 20
        gt_var = 0.01
        pred_vae = util.build_pred_vae_model(latent_dim=LD,
                                             n_tokens=X_train.shape[2],
                                             seq_length=L,
                                             enc1_units=50,
                                             pred_var=gt_var)
        suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)

        pred_vae.encoder_.load_weights(
            "../models/pred_vae_encoder_weights%s.h5" % suffix)
        pred_vae.decoder_.load_weights(
            "../models/pred_vae_decoder_weights%s.h5" % suffix)
        pred_vae.predictor_.load_weights(
            "../models/pred_vae_predictor_weights%s.h5" % suffix)
        pred_vae.vae_.load_weights("../models/pred_vae_vae_weights%s.h5" %
                                   suffix)
        if not constrained:
            suffix = "_unconstrained" + suffix
        bomb_results, test_max = optimization_algs.bombarelli_opt(
            X_train,
            pred_vae,
            ground_truth,
            total_it=1000,
            constrained=constrained)
        with open('../results/%s_max%s.json' % ('bombarelli', suffix),
                  'w') as outfile:
            json.dump(test_max, outfile)
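optimization_algs.bombarelli_opt is not shown here. Its core move, following Gomez-Bombarelli et al. (2018), is to maximize the jointly trained predictor over the VAE latent space and decode the optimized codes back into sequences; the constrained variant additionally restricts how far the search may wander, which the minimal sketch below omits. steps and lr are illustrative values, and the predictor is assumed to map latent codes directly to scalar scores:

def latent_ascent_sketch(predictor, decoder, z_init, steps=100, lr=0.1):
    # symbolic gradient of the predictor's scalar output w.r.t. its input z
    grad = K.gradients(predictor.output, predictor.input)[0]
    ascend = K.function([predictor.input], [grad])
    z = np.copy(z_init)
    for _ in range(steps):
        z += lr * ascend([z])[0]     # plain gradient ascent in latent space
    return decoder.predict(z)        # decode optimized codes to sequences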
Example #4
def train_experimental_pred_vaes():
    """
    Trains and saves the semi-supervised VAEs on the GFP experimental data for use 
    in the Gomez-bombarelli optimization method.
    """
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    for it in range(3):
        RANDOM_STATE = it + 1
        X_train, y_train, _ = util.get_experimental_X_y(
            random_state=RANDOM_STATE, train_size=TRAIN_SIZE)

        L = X_train.shape[1]
        LD = 20
        gt_var = 0.01
        pred_vae = util.build_pred_vae_model(latent_dim=LD,
                                             n_tokens=X_train.shape[2],
                                             seq_length=L,
                                             enc1_units=50,
                                             pred_var=gt_var)

        pred_vae.fit([X_train], [
            X_train,
            np.zeros(X_train.shape[0]), y_train,
            np.zeros_like(y_train)
        ],
                     batch_size=10,
                     epochs=100,
                     shuffle=True,
                     validation_split=0,
                     verbose=2)
        suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)
        pred_vae.encoder_.save_weights(
            "../models/pred_vae_encoder_weights%s.h5" % suffix)
        pred_vae.decoder_.save_weights(
            "../models/pred_vae_decoder_weights%s.h5" % suffix)
        pred_vae.predictor_.save_weights(
            "../models/pred_vae_predictor_weights%s.h5" % suffix)
        pred_vae.vae_.save_weights("../models/pred_vae_vae_weights%s.h5" %
                                   suffix)
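The four targets in the fit call above line up one-to-one with the pred-VAE's four outputs; the zero arrays are dummy targets for loss terms (such as the KL divergence) that are computed entirely inside the model, a standard Keras pattern. A minimal, self-contained illustration of that pattern follows; the toy model and names are invented, not the repo's build_pred_vae_model:

from keras.layers import Dense, Input, Lambda
from keras.models import Model

def pass_through_loss(y_true, y_pred):
    # y_pred already *is* the loss term; the zero dummy target is ignored
    return y_pred

x_in = Input(shape=(8,))
h = Dense(16, activation='relu')(x_in)
z_mean = Dense(2)(h)
z_log_var = Dense(2)(h)
kl = Lambda(lambda a: -0.5 * K.sum(
    1. + a[1] - K.square(a[0]) - K.exp(a[1]),
    axis=-1, keepdims=True))([z_mean, z_log_var])
toy = Model(x_in, [z_mean, kl])
toy.compile(optimizer='adam', loss=['mse', pass_through_loss])
# toy.fit(X, [z_targets, np.zeros(len(X))])  # the zeros are never used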
Example #5
def train_experimental_vaes():
    """Trains and saves VAEs on the GFP data for use in the weighted ML methods"""
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    suffix = '_%s' % train_size_str
    for i in range(3):
        RANDOM_STATE = i + 1
        X_train, _, _ = util.get_experimental_X_y(random_state=RANDOM_STATE,
                                                  train_size=TRAIN_SIZE)
        vae_0 = util.build_vae(latent_dim=20,
                               n_tokens=20,
                               seq_length=X_train.shape[1],
                               enc1_units=50)
        vae_0.fit([X_train], [X_train, np.zeros(X_train.shape[0])],
                  epochs=100,
                  batch_size=10,
                  verbose=2)
        vae_0.encoder_.save_weights("../models/vae_0_encoder_weights%s_%i.h5" %
                                    (suffix, RANDOM_STATE))
        vae_0.decoder_.save_weights("../models/vae_0_decoder_weights%s_%i.h5" %
                                    (suffix, RANDOM_STATE))
        vae_0.vae_.save_weights("../models/vae_0_vae_weights%s_%i.h5" %
                                (suffix, RANDOM_STATE))
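A trained decoder like the one saved above proposes new candidate sequences by decoding standard-normal latent draws; the weighted-ML loop in Example #7 repeatedly refits and resamples exactly this kind of proposal distribution. A minimal sketch, with the helper name and the greedy argmax decoding as illustrative choices:

def sample_sequences_sketch(decoder, n, latent_dim=20):
    z = np.random.randn(n, latent_dim)       # z ~ N(0, I)
    probs = decoder.predict(z)               # (n, seq_length, n_tokens)
    tokens = probs.argmax(axis=-1)           # greedy per-position decoding
    return np.eye(probs.shape[-1])[tokens]   # back to one-hot sequences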
Example #6
def run_killoran(killoran=True):
    """Runs the GFP comparative tests on the Killoran (aka AM-VAE) optimization
    algorithm; with killoran=False, runs the MALA variant of the search instead."""
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    for i in range(3):
        RANDOM_STATE = i + 1
        print(RANDOM_STATE)
        num_models = [1, 5, 20][i]
        X_train, _, _ = util.get_experimental_X_y(random_state=RANDOM_STATE,
                                                  train_size=TRAIN_SIZE)

        LD = 20
        L = X_train.shape[1]

        vae_suffix = '_%s_%i' % (train_size_str, RANDOM_STATE)

        ground_truth = gfp_gp.SequenceGP(load=True,
                                         load_prefix="../data/gfp_gp")
        loss = losses.neg_log_likelihood
        keras.utils.get_custom_objects().update({"neg_log_likelihood": loss})
        oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models,
                                       RANDOM_STATE)

        sess = tf.Session(graph=tf.get_default_graph())
        K.set_session(sess)
        vae = util.build_vae(latent_dim=LD,
                             n_tokens=20,
                             seq_length=L,
                             enc1_units=50)
        vae.encoder_.load_weights("../models/vae_0_encoder_weights%s.h5" %
                                  vae_suffix)
        vae.decoder_.load_weights("../models/vae_0_decoder_weights%s.h5" %
                                  vae_suffix)
        vae.vae_.load_weights("../models/vae_0_vae_weights%s.h5" % vae_suffix)

        oracles = [
            keras.models.load_model("../models/oracle_%i%s.h5" %
                                    (i, oracle_suffix))
            for i in range(num_models)
        ]
        if not killoran:
            results, test_max = optimization_algs.killoran_opt(X_train,
                                                               vae,
                                                               oracles,
                                                               ground_truth,
                                                               steps=30000,
                                                               epsilon1=1e-5,
                                                               epsilon2=1.,
                                                               noise_std=1e-5,
                                                               LD=20,
                                                               verbose=False,
                                                               adam=False)

            np.save(
                "../results/mala_results_%s_%i.npy" %
                (train_size_str, RANDOM_STATE), results)
            suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)
            with open('../results/%s_max%s.json' % ('mala', suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)

        else:
            results, test_max = optimization_algs.killoran_opt(X_train,
                                                               vae,
                                                               oracles,
                                                               ground_truth,
                                                               steps=10000,
                                                               epsilon1=0.,
                                                               epsilon2=0.1,
                                                               noise_std=1e-6,
                                                               LD=20,
                                                               verbose=False,
                                                               adam=True)
            np.save(
                "../results/killoran_may_results_%s_%i.npy" %
                (train_size_str, RANDOM_STATE), results)
            suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)
            with open('../results/%s_max%s.json' % ('killoran', suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)
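The two killoran_opt configurations above differ mainly in how they move through latent space: the adam=False branch (saved under mala_results) reads as a Metropolis-adjusted Langevin (MALA) walk on the oracle score, while the adam=True branch is plain Adam ascent. A minimal sketch of the Langevin-style proposal, with grad_fn left abstract and the step/noise names only loosely mirroring the epsilon1/epsilon2/noise_std arguments above:

def langevin_step_sketch(z, grad_fn, step_size, noise_std):
    # drift up the oracle's gradient plus Gaussian exploration noise;
    # full MALA adds a Metropolis accept/reject test on this proposal
    return z + step_size * grad_fn(z) + noise_std * np.random.randn(*z.shape)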
Example #7
def run_experimental_weighted_ml(it, repeats=3):
    """Runs the GFP comparative tests on the weighted ML models and FBVAE."""

    assert it in [0, 1, 2]

    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    num_models = [1, 5, 20][it]
    RANDOM_STATE = it + 1

    X_train, y_train, gt_train = util.get_experimental_X_y(
        random_state=RANDOM_STATE, train_size=TRAIN_SIZE)

    vae_suffix = '_%s_%i' % (train_size_str, RANDOM_STATE)
    oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models, RANDOM_STATE)

    vae_0 = build_vae(latent_dim=20,
                      n_tokens=20,
                      seq_length=X_train.shape[1],
                      enc1_units=50)

    vae_0.encoder_.load_weights("../models/vae_0_encoder_weights%s.h5" %
                                vae_suffix)
    vae_0.decoder_.load_weights("../models/vae_0_decoder_weights%s.h5" %
                                vae_suffix)
    vae_0.vae_.load_weights("../models/vae_0_vae_weights%s.h5" % vae_suffix)

    ground_truth = gfp_gp.SequenceGP(load=True, load_prefix="../data/gfp_gp")

    loss = neg_log_likelihood
    keras.utils.get_custom_objects().update({"neg_log_likelihood": loss})
    oracles = [
        keras.models.load_model("../models/oracle_%i%s.h5" %
                                (i, oracle_suffix)) for i in range(num_models)
    ]

    test_kwargs = [
        {'weights_type': 'cbas', 'quantile': 1},
        {'weights_type': 'rwr', 'alpha': 20},
        {'weights_type': 'dbas', 'quantile': 0.95},
        {'weights_type': 'cem-pi', 'quantile': 0.8},
        {'weights_type': 'fbvae', 'quantile': 0.8},
    ]

    base_kwargs = {
        'homoscedastic': False,
        'homo_y_var': 0.01,
        'train_gt_evals': gt_train,
        'samples': 100,
        'cutoff': 1e-6,
        'it_epochs': 10,
        'verbose': True,
        'LD': 20,
        'enc1_units': 50,
        'iters': 50
    }

    if num_models == 1:
        base_kwargs['homoscedastic'] = True
        base_kwargs['homo_y_var'] = np.mean(
            (util.get_balaji_predictions(oracles, X_train)[0] - y_train)**2)

    for k in range(repeats):
        for j in range(len(test_kwargs)):
            test_name = test_kwargs[j]['weights_type']
            suffix = "_%s_%i_%i" % (train_size_str, RANDOM_STATE, k)
            if test_name == 'fbvae':
                if base_kwargs['iters'] > 100:
                    suffix += '_long'

                print(suffix)
                kwargs = {}
                kwargs.update(test_kwargs[j])
                kwargs.update(base_kwargs)
                # drop the weighted-ML-only kwargs that fb_opt does not accept
                for key in ['homoscedastic', 'homo_y_var', 'cutoff',
                            'it_epochs']:
                    kwargs.pop(key)
                test_traj, test_oracle_samples, test_gt_samples, test_max = optimization_algs.fb_opt(
                    np.copy(X_train), oracles, ground_truth, vae_0, **kwargs)
            else:
                if base_kwargs['iters'] > 100:
                    suffix += '_long'
                kwargs = {}
                kwargs.update(test_kwargs[j])
                kwargs.update(base_kwargs)
                test_traj, test_oracle_samples, test_gt_samples, test_max = optimization_algs.weighted_ml_opt(
                    np.copy(X_train), oracles, ground_truth, vae_0, **kwargs)
            np.save('../results/%s_traj%s.npy' % (test_name, suffix),
                    test_traj)
            np.save('../results/%s_oracle_samples%s.npy' % (test_name, suffix),
                    test_oracle_samples)
            np.save('../results/%s_gt_samples%s.npy' % (test_name, suffix),
                    test_gt_samples)

            with open('../results/%s_max%s.json' % (test_name, suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)
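The weights_type values select the per-sample weights used when refitting the VAE each round. The numpy sketch below is an interpretation of three of the schemes from the corresponding papers, not the repo's code: CbAS weights additionally involve a density ratio between the original and current generative models, and FB-VAE maintains a sample buffer, so both are omitted here. mu and var are the pooled oracle mean and variance per sample, as in Example #1.

from scipy.stats import norm

def sample_weights_sketch(mu, var, weights_type, quantile=0.95, alpha=20):
    if weights_type == 'rwr':      # reward-weighted regression: exp(alpha * score)
        w = np.exp(alpha * mu)
        return w / w.sum()
    gamma = np.percentile(mu, 100 * quantile)
    if weights_type == 'cem-pi':   # cross-entropy method: indicator above quantile
        return (mu >= gamma).astype(float)
    if weights_type == 'dbas':     # DbAS: P(score > gamma) under the oracle
        return norm.sf(gamma, loc=mu, scale=np.sqrt(var))
    raise ValueError(weights_type)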