コード例 #1
0
ファイル: plotting.py プロジェクト: latent-knowledge/CbAS
def make_oracle_paired_plots():
    """Scatter oracle predictions against ground-truth GFP values.

    For every oracle configuration visited by the loop (currently only
    index 0, i.e. one model and random state 1) two point clouds are drawn
    on a shared axis:

    * the held-out test split returned by ``util.get_experimental_X_y``
      (legend entry "< 20th percentile"), and
    * every sequence whose ground-truth value is strictly above the 20th
      percentile of ``../data/gfp_gt_evals.npy`` (legend entry
      ">= 20th percentile").

    The figure is written to ``../plots/paired_plot_<n>.png``, shown, and
    then closed.  The function returns nothing.
    """
    for it in range(1):
        plt.figure(figsize=(4, 3))
        TRAIN_SIZE = 5000
        train_size_str = "%ik" % (TRAIN_SIZE / 1000)
        RANDOM_STATE = it + 1

        # The saved oracles reference this loss by name, so it has to be
        # registered with Keras before load_model is called.
        loss = neg_log_likelihood
        get_custom_objects().update({"neg_log_likelihood": loss})

        df = pd.read_csv('../data/gfp_data.csv')
        X_all, _ = util.get_gfp_X_y_aa(df, large_only=True, ignore_stops=True)
        y_all = np.load("../data/gfp_gt_evals.npy")
        perc = np.percentile(y_all, 20)
        above_idx = np.where(y_all > perc)[0]
        X_above, y_above = X_all[above_idx], y_all[above_idx]

        # Only the test inputs and their noisy labels are plotted; the
        # remaining four return values are not needed here.
        _, _, _, X_test, y_test, _ = util.get_experimental_X_y(
            random_state=RANDOM_STATE, train_size=TRAIN_SIZE, return_test=True)
        num_models = [1, 5, 20][it]
        oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models,
                                       RANDOM_STATE)
        oracles = [
            load_model("../models/oracle_%i%s.h5" % (i, oracle_suffix))
            for i in range(num_models)
        ]

        y_pred, _ = util.get_balaji_predictions(oracles, X_test)
        y_pred_above, _ = util.get_balaji_predictions(oracles, X_above)

        plt.scatter(y_test, y_pred, s=1, label="$< 20^{th}$ percentile")
        # Raw string: "\g" is not a valid escape sequence, so a plain
        # literal triggers a warning on recent Python versions.  The text
        # handed to matplotlib is unchanged.
        plt.scatter(y_above,
                    y_pred_above,
                    s=1,
                    label=r"$\geq 20^{th}$ percentile")
        # Dashed identity line: points on it are perfectly predicted.
        plt.plot((2.9, 3.5), (2.9, 3.5), c='k', ls='--')
        plt.ylim([2.95, 3.5])
        plt.xlim([2.95, 3.5])
        plt.xlabel("Ground Truth Values")
        plt.ylabel("Mean Oracle Predictions")
        plt.legend(markerscale=3)
        plt.grid(True)

        ax = plt.gca()
        ax.set_axisbelow(True)
        ax.grid(color='gray', alpha=0.2)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')

        plt.tight_layout()
        plt.savefig("../plots/paired_plot_%i.png" % (it + 1), dpi=500)
        plt.show()
        plt.close()
コード例 #2
0
ファイル: optimization_algs.py プロジェクト: dhbrookes/CbAS
def weighted_ml_opt(X_train,
                    oracles,
                    ground_truth,
                    vae_0,
                    weights_type='dbas',
                    LD=20,
                    iters=20,
                    samples=500,
                    homoscedastic=False,
                    homo_y_var=0.1,
                    quantile=0.95,
                    verbose=False,
                    alpha=1,
                    train_gt_evals=None,
                    cutoff=1e-6,
                    it_epochs=10,
                    enc1_units=50):
    """Run a weighted maximum-likelihood design loop (CbAS, DbAS, RWR, CEM-PI).

    Iteration 0 scores the training set and copies the weights of ``vae_0``
    into a freshly built VAE.  Every later iteration samples ``samples``
    sequences from the current decoder, scores them with the oracle
    ensemble, turns the scores into per-sample weights according to
    ``weights_type`` and refits the VAE on the weighted samples.

    Args:
        X_train: Training sequences; ``X_train.shape[1]`` is used as the
            sequence length and the last axis is arg-maxed to get indices.
        oracles: Models passed to ``util.get_balaji_predictions``.
        ground_truth: Object with ``predict(X, print_every=...)`` whose first
            output column is taken as the ground-truth value.
        vae_0: Prior VAE.  Supplies the initial weights and, for 'cbas', the
            reference decoder in the density ratio.
        weights_type: One of 'cbas', 'dbas', 'rwr', 'cem-pi'.
        LD: Latent dimensionality.
        iters: Number of iterations (including iteration 0).
        samples: Number of sequences drawn per iteration.
        homoscedastic: If true, replace the oracle variance by ``homo_y_var``.
        homo_y_var: Constant variance used when ``homoscedastic`` is set.
        quantile: Quantile (0-1) used for the 'cbas'/'dbas' threshold and for
            the 'cem-pi' cut.
        verbose: Print one summary line per iteration.
        alpha: Exponent scale for the 'rwr' weights.
        train_gt_evals: Optional precomputed ground-truth values for
            ``X_train``; avoids calling ``ground_truth`` at iteration 0.
        cutoff: Samples whose weight is below this value are dropped before
            the VAE is refit.
        it_epochs: Epochs per refit.
        enc1_units: Encoder width forwarded to ``util.build_vae``.

    Returns:
        ``(traj, oracle_samples, gt_samples, max_dict)`` where ``traj`` has
        shape ``(iters, 7)`` with columns max/mean/std of the ground truth,
        max/mean/std of the oracle mean and the mean oracle variance;
        ``oracle_samples`` and ``gt_samples`` have shape ``(iters, samples)``;
        ``max_dict`` holds the best oracle value seen, its sequence and the
        ground-truth value of that sequence.
    """

    assert weights_type in ['cbas', 'dbas', 'rwr', 'cem-pi']
    L = X_train.shape[1]
    vae = util.build_vae(latent_dim=LD,
                         n_tokens=20,
                         seq_length=L,
                         enc1_units=enc1_units)

    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    y_star = -np.inf  # running (monotone) threshold for 'cbas' / 'dbas'

    for t in range(iters):
        ### Take Samples ###
        # zt is drawn on every iteration (also t == 0) so the random stream
        # is consumed identically regardless of the branch taken.
        zt = np.random.randn(samples, LD)
        if t > 0:
            Xt_p = vae.decoder_.predict(zt)
            Xt = util.get_samples(Xt_p)
        else:
            Xt = X_train

        ### Evaluate ground truth and oracle ###
        yt, yt_var = util.get_balaji_predictions(oracles, Xt)
        if homoscedastic:
            yt_var = np.ones_like(yt) * homo_y_var
        Xt_aa = np.argmax(Xt, axis=-1)
        if t == 0 and train_gt_evals is not None:
            yt_gt = train_gt_evals
        else:
            yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]

        ### Calculate weights for different schemes ###
        if t > 0:
            if weights_type == 'cbas':
                # Density ratio p_0(x) / p_t(x), both evaluated through the
                # decoders at the same latent draws zt.
                log_pxt = np.sum(np.log(Xt_p) * Xt, axis=(1, 2))
                X0_p = vae_0.decoder_.predict(zt)
                log_px0 = np.sum(np.log(X0_p) * Xt, axis=(1, 2))
                w1 = np.exp(log_px0 - log_pxt)
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                # Survival function: P(y > y_star) under the oracle normal.
                w2 = scipy.stats.norm.sf(y_star, loc=yt, scale=np.sqrt(yt_var))
                weights = w1 * w2
            elif weights_type == 'cem-pi':
                # Probability of improving on the best training ground truth
                # (max_train_gt is set at t == 0); keep the top quantile.
                pi = scipy.stats.norm.sf(max_train_gt,
                                         loc=yt,
                                         scale=np.sqrt(yt_var))
                pi_thresh = np.percentile(pi, quantile * 100)
                weights = (pi > pi_thresh).astype(int)
            elif weights_type == 'dbas':
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                weights = scipy.stats.norm.sf(y_star,
                                              loc=yt,
                                              scale=np.sqrt(yt_var))
            elif weights_type == 'rwr':
                weights = np.exp(alpha * yt)
                weights /= np.sum(weights)
        else:
            weights = np.ones(yt.shape[0])
            max_train_gt = np.max(yt_gt)

        ### Track the best oracle value seen so far ###
        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            # One-row slice at the arg-max itself.  The previous slice
            # [idx - 1:idx] returned the preceding row and was empty for
            # idx == 0, so the stored sequence did not match oracle_max.
            oracle_max_seq = util.convert_idx_array_to_aas(
                Xt_aa[yt_max_idx:yt_max_idx + 1])[0]
            gt_of_oracle_max = yt_gt[yt_max_idx]

        ### Record and print results ##
        if t == 0:
            # The training set generally does not have `samples` rows, so
            # draw that many with replacement.
            rand_idx = np.random.randint(0, len(yt), samples)
            oracle_samples[t, :] = yt[rand_idx]
            gt_samples[t, :] = yt_gt[rand_idx]
        else:
            oracle_samples[t, :] = yt
            gt_samples[t, :] = yt_gt

        traj[t, 0] = np.max(yt_gt)
        traj[t, 1] = np.mean(yt_gt)
        traj[t, 2] = np.std(yt_gt)
        traj[t, 3] = np.max(yt)
        traj[t, 4] = np.mean(yt)
        traj[t, 5] = np.std(yt)
        traj[t, 6] = np.mean(yt_var)

        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3], color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])

        ### Train model ###
        if t == 0:
            # Start the search model as an exact copy of the prior VAE.
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
            # Drop samples with negligible weight before refitting.
            cutoff_idx = np.where(weights < cutoff)
            Xt = np.delete(Xt, cutoff_idx, axis=0)
            weights = np.delete(weights, cutoff_idx, axis=0)
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=it_epochs,
                    batch_size=10,
                    shuffle=False,
                    sample_weight=[weights, weights],
                    verbose=0)

    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
コード例 #3
0
ファイル: optimization_algs.py プロジェクト: dhbrookes/CbAS
def killoran_opt(X_train,
                 vae,
                 oracles,
                 ground_truth,
                 steps=10000,
                 epsilon1=10**-5,
                 epsilon2=1,
                 noise_std=10**-5,
                 LD=100,
                 verbose=False,
                 adam=False):
    """Runs the Killoran optimization algorithm.

    A single latent vector ``zt`` of shape ``[1, LD]`` is updated for
    ``steps`` iterations using gradients of the mean oracle output taken
    through the VAE decoder, plus Gaussian noise (and, when ``adam`` is
    false, a term derived from a standard-normal log-density of ``zt``).

    Args:
        X_train: Only its shape is read (``shape[1]`` and ``shape[2]``
            size the oracle input placeholder variable).
        vae: Object whose ``decoder_`` maps ``zt`` to the oracle input.
        oracles: Sequence of callables applied to the decoder output inside
            the TensorFlow graph; also passed to
            ``util.get_balaji_predictions`` for the recorded score.
        ground_truth: Object with ``predict``; column 0 of its output is
            recorded.
        steps: Number of update steps.
        epsilon1: Scale of the prior-gradient term (non-Adam path only).
        epsilon2: Scale of the oracle-gradient term, or the Adam learning
            rate when ``adam`` is true.
        noise_std: Standard deviation of the noise added to every step.
        LD: Latent dimensionality.
        verbose: Not used by this function.
        adam: Use ``AdamOptimizer`` instead of plain gradient descent.

    Returns:
        ``(results, {})`` where ``results`` has shape ``(steps, 2)``:
        column 0 is the oracle score of the arg-maxed decoder output and
        column 1 is the corresponding ground-truth value.
    """

    L = X_train.shape[1]

    G = vae.decoder_
    f = oracles

    sess = K.get_session()
    zt = K.tf.Variable(np.random.normal(size=[1, LD]), dtype='float32')
    # pred_input is a separate variable holding a copy of the decoder
    # output; the oracle gradient is taken with respect to it and then
    # chained back to zt below.
    pred_input = K.tf.Variable(np.zeros((1, L, X_train.shape[2])),
                               dtype='float32')
    gen_output = G(zt)
    prior = tfd.Normal(0, 1)
    p_z = prior.log_prob(zt)
    # Mean over the ensemble of element [0, 0] of each oracle's output
    # (presumably the predicted mean -- confirm against the oracle models).
    predictions = K.tf.reduce_mean(
        [f[i](pred_input)[0, 0] for i in range(len(f))])
    update_pred_input = K.tf.assign(pred_input, gen_output)
    # Gradient of the *negated* prediction, so that a descent step on zt
    # increases the prediction.
    dfdx = K.tf.gradients(ys=-predictions, xs=pred_input)[0]
    # Chain rule: push dfdx back through the decoder to zt.
    dfdz = K.tf.gradients(gen_output, zt, grad_ys=dfdx)[0]
    dpz = K.tf.gradients(p_z, zt)[0]

    noise = K.tf.random_normal(shape=[1, LD], stddev=noise_std)
    # NOTE(review): the default epsilon2=1 is a Python int, so eps2 is
    # presumably created with an integer dtype -- confirm that it
    # multiplies cleanly with the float32 gradient in the non-Adam path.
    eps1 = K.tf.Variable(epsilon1, trainable=False)
    eps2 = K.tf.Variable(epsilon2, trainable=False)
    if adam:
        # Adam path: no prior term; epsilon2 acts as the learning rate.
        optimizer = K.tf.train.AdamOptimizer(learning_rate=epsilon2)
        step = dfdz + noise
    else:
        # Learning rate 1 so that `step` is applied to zt unscaled.
        # NOTE(review): apply_gradients subtracts `step`, so the
        # +eps1 * dpz term appears to lower the prior log-density of zt
        # rather than raise it -- confirm the intended sign.
        optimizer = K.tf.train.GradientDescentOptimizer(learning_rate=1)
        step = eps1 * dpz + eps2 * dfdz + noise

    design_op = optimizer.apply_gradients([(step, zt)])
    # Initialise only the variables created here (optimizer slots matched by
    # name, plus the four variables above) rather than running a global
    # initializer, which would presumably reset the already-loaded model
    # weights as well.
    adam_initializers = [
        var.initializer for var in K.tf.global_variables()
        if 'Adam' in var.name or 'beta' in var.name
    ]
    sess.run(adam_initializers)
    sess.run(pred_input.initializer)
    sess.run(zt.initializer)
    sess.run(eps1.initializer)
    sess.run(eps2.initializer)

    s = sess.run(K.tf.shape(zt))
    # Seed pred_input with the decoder output for a random latent that is
    # fed in place of zt for this one run.
    sess.run(update_pred_input, {zt: np.random.normal(size=s)})
    z_0 = sess.run([zt])  # not used afterwards
    results = np.zeros((steps, 2))
    xt_prev = None
    for t in range(steps):
        # Fetch the decoder output and apply one update to zt.  Whether xt0
        # reflects zt before or after the update is not fixed by this call.
        xt0, _, = sess.run([gen_output, design_op], {
            eps1: epsilon1,
            eps2: epsilon2
        })
        # Refresh pred_input from the updated zt for the next gradient
        # evaluation; the fetched values themselves are not used.
        pred_in, preds = sess.run([update_pred_input, predictions])
        xt = util.get_argmax(xt0)
        ft = util.get_balaji_predictions(oracles, xt)[0][0]
        xt_seq = np.argmax(xt, axis=-1)
        # Only query the ground truth when the discrete sequence changed;
        # otherwise reuse the value recorded on the previous step.
        if xt_prev is None or not np.all(xt_seq == xt_prev):
            xt_prev = xt_seq
            gt = ground_truth.predict(xt_seq)[:, 0][0]
        else:
            gt = results[t - 1, 1]
        results[t, 0] = ft
        results[t, 1] = gt
    return results, {}
コード例 #4
0
ファイル: optimization_algs.py プロジェクト: dhbrookes/CbAS
def fb_opt(X_train,
           oracles,
           ground_truth,
           vae_0,
           weights_type='fbvae',
           LD=20,
           iters=20,
           samples=500,
           quantile=0.8,
           verbose=False,
           train_gt_evals=None,
           it_epochs=10,
           enc1_units=50):
    """Run the FBVAE optimization algorithm.

    Iteration 0 scores the training set, fixes the acceptance threshold at
    the ``quantile`` of the oracle scores and copies the weights of
    ``vae_0`` into a freshly built VAE.  Every later iteration samples from
    the decoder, keeps the samples whose oracle score reaches the threshold,
    pushes them onto the front of a fixed-size buffer (dropping the same
    number of rows from its tail) and refits the VAE on the buffer for one
    epoch.

    Args:
        X_train: Training sequences; they form the initial buffer.
        oracles: Models passed to ``util.get_balaji_predictions``.
        ground_truth: Object with ``predict(X, print_every=...)`` whose first
            output column is taken as the ground-truth value.
        vae_0: VAE supplying the initial weights.
        weights_type: Must be 'fbvae'; only used for the verbose printout.
        LD: Latent dimensionality.
        iters: Number of iterations (including iteration 0).
        samples: Number of sequences drawn per iteration and number of
            buffer entries recorded per iteration.
        quantile: Quantile (0-1) of the training oracle scores used as the
            fixed acceptance threshold.
        verbose: Print one summary line per iteration.
        train_gt_evals: Optional precomputed ground-truth values for
            ``X_train``.
        it_epochs: Accepted for signature parity with ``weighted_ml_opt``
            but not used; the refit always runs a single epoch.
        enc1_units: Encoder width forwarded to ``util.build_vae``.

    Returns:
        ``(traj, oracle_samples, gt_samples, max_dict)``.  ``traj`` has shape
        ``(iters, 7)``: max/mean/std of the buffer's ground truth,
        max/mean/std of its oracle scores and the number of samples accepted
        in that iteration (0 at iteration 0).  ``oracle_samples`` and
        ``gt_samples`` hold ``samples`` random buffer entries per iteration.
        ``max_dict`` holds the best oracle value seen, its sequence and the
        ground-truth value of that sequence.
    """

    assert weights_type in ['fbvae']
    L = X_train.shape[1]
    vae = util.build_vae(latent_dim=LD,
                         n_tokens=20,
                         seq_length=L,
                         enc1_units=enc1_units)

    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    n_top = 0
    for t in range(iters):
        ### Take Samples and evaluate ground truth and oracle ##
        # zt is drawn on every iteration (also t == 0) so the random stream
        # is consumed identically regardless of the branch taken.
        zt = np.random.randn(samples, LD)
        if t > 0:
            Xt_sample_p = vae.decoder_.predict(zt)
            Xt_sample = util.get_samples(Xt_sample_p)
            yt_sample, _ = util.get_balaji_predictions(oracles, Xt_sample)
            Xt_aa_sample = np.argmax(Xt_sample, axis=-1)
            yt_gt_sample = ground_truth.predict(Xt_aa_sample,
                                                print_every=1000000)[:, 0]
        else:
            Xt = X_train
            yt, _ = util.get_balaji_predictions(oracles, Xt)
            Xt_aa = np.argmax(Xt, axis=-1)
            # The threshold is computed once and stays fixed afterwards.
            fb_thresh = np.percentile(yt, quantile * 100)
            if train_gt_evals is not None:
                yt_gt = train_gt_evals
            else:
                yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]

        ### Calculate threshold ###
        if t > 0:
            threshold_idx = np.where(yt_sample >= fb_thresh)[0]
            n_top = len(threshold_idx)
            sample_arrs = [Xt_sample, yt_sample, yt_gt_sample, Xt_aa_sample]
            full_arrs = [Xt, yt, yt_gt, Xt_aa]

            # Prepend the accepted samples and trim the tail so every buffer
            # keeps its original length.
            merged = []
            for sample_arr, full_arr in zip(sample_arrs, full_arrs):
                combined = np.concatenate(
                    [sample_arr[threshold_idx], full_arr])
                merged.append(combined[:len(full_arr)])
            Xt, yt, yt_gt, Xt_aa = merged

        ### Track the best oracle value seen so far ###
        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            # One-row slice at the arg-max itself.  The previous slice
            # [idx - 1:idx] returned the preceding row and was empty for
            # idx == 0, so the stored sequence did not match oracle_max.
            oracle_max_seq = util.convert_idx_array_to_aas(
                Xt_aa[yt_max_idx:yt_max_idx + 1])[0]
            gt_of_oracle_max = yt_gt[yt_max_idx]

        ### Record and print results ##
        rand_idx = np.random.randint(0, len(yt), samples)
        oracle_samples[t, :] = yt[rand_idx]
        gt_samples[t, :] = yt_gt[rand_idx]

        traj[t, 0] = np.max(yt_gt)
        traj[t, 1] = np.mean(yt_gt)
        traj[t, 2] = np.std(yt_gt)
        traj[t, 3] = np.max(yt)
        traj[t, 4] = np.mean(yt)
        traj[t, 5] = np.std(yt)
        traj[t, 6] = n_top if t > 0 else 0

        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3], color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])

        ### Train model ###
        if t == 0:
            # Start the search model as an exact copy of the prior VAE.
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=1,
                    batch_size=10,
                    shuffle=False,
                    verbose=0)

    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
コード例 #5
0
def run_experimental_weighted_ml(it, repeats=3):
    """Runs the GFP comparative tests on the weighted ML models and FBVAE.

    Loads the pre-trained prior VAE and oracle ensemble that belong to
    configuration ``it``, runs every method in turn (CbAS, RWR, DbAS,
    CEM-PI, FBVAE) ``repeats`` times and stores each run's trajectory,
    oracle samples, ground-truth samples and best-sequence summary under
    ``../results/``.

    Args:
        it: Configuration index in {0, 1, 2}.  Selects the ensemble size
            (1, 5 or 20 oracles) and the random state (``it + 1``).
        repeats: Number of independent runs per method.
    """

    assert it in [0, 1, 2]

    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE / 1000)
    num_models = [1, 5, 20][it]
    RANDOM_STATE = it + 1

    X_train, y_train, gt_train = util.get_experimental_X_y(
        random_state=RANDOM_STATE, train_size=TRAIN_SIZE)

    vae_suffix = '_%s_%i' % (train_size_str, RANDOM_STATE)
    oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models, RANDOM_STATE)

    vae_0 = build_vae(latent_dim=20,
                      n_tokens=20,
                      seq_length=X_train.shape[1],
                      enc1_units=50)

    vae_0.encoder_.load_weights("../models/vae_0_encoder_weights%s.h5" %
                                vae_suffix)
    vae_0.decoder_.load_weights("../models/vae_0_decoder_weights%s.h5" %
                                vae_suffix)
    vae_0.vae_.load_weights("../models/vae_0_vae_weights%s.h5" % vae_suffix)

    ground_truth = gfp_gp.SequenceGP(load=True, load_prefix="data/gfp_gp")

    # The saved oracles reference this loss by name, so it has to be
    # registered with Keras before load_model is called.
    loss = neg_log_likelihood
    keras.utils.get_custom_objects().update({"neg_log_likelihood": loss})
    oracles = [
        keras.models.load_model("../models/oracle_%i%s.h5" %
                                (i, oracle_suffix)) for i in range(num_models)
    ]

    # Method-specific settings; 'weights_type' doubles as the method name in
    # the output file names.
    test_kwargs = [{
        'weights_type': 'cbas',
        'quantile': 1
    }, {
        'weights_type': 'rwr',
        'alpha': 20
    }, {
        'weights_type': 'dbas',
        'quantile': 0.95
    }, {
        'weights_type': 'cem-pi',
        'quantile': 0.8
    }, {
        'weights_type': 'fbvae',
        'quantile': 0.8
    }]

    # Settings shared by all methods.
    base_kwargs = {
        'homoscedastic': False,
        'homo_y_var': 0.01,
        'train_gt_evals': gt_train,
        'samples': 100,
        'cutoff': 1e-6,
        'it_epochs': 10,
        'verbose': True,
        'LD': 20,
        'enc1_units': 50,
        'iters': 50
    }

    if num_models == 1:
        # With a single oracle, use a constant variance equal to the
        # oracle's mean squared error on the training labels.
        base_kwargs['homoscedastic'] = True
        base_kwargs['homo_y_var'] = np.mean(
            (util.get_balaji_predictions(oracles, X_train)[0] - y_train)**2)

    # Options that are stripped before calling fb_opt.
    fb_unsupported = ('homoscedastic', 'homo_y_var', 'cutoff', 'it_epochs')

    for k in range(repeats):
        for method_kwargs in test_kwargs:
            test_name = method_kwargs['weights_type']
            suffix = "_%s_%i_%i" % (train_size_str, RANDOM_STATE, k)
            if base_kwargs['iters'] > 100:
                suffix += '_long'

            # Shared settings are applied last, so they win on a key clash.
            kwargs = {}
            kwargs.update(method_kwargs)
            kwargs.update(base_kwargs)

            if test_name == 'fbvae':
                print(suffix)
                for name in fb_unsupported:
                    kwargs.pop(name)
                optimizer = optimization_algs.fb_opt
            else:
                optimizer = optimization_algs.weighted_ml_opt

            # Each run gets its own copy of the training data.
            test_traj, test_oracle_samples, test_gt_samples, test_max = optimizer(
                np.copy(X_train), oracles, ground_truth, vae_0, **kwargs)

            np.save('../results/%s_traj%s.npy' % (test_name, suffix),
                    test_traj)
            np.save('../results/%s_oracle_samples%s.npy' % (test_name, suffix),
                    test_oracle_samples)
            np.save('../results/%s_gt_samples%s.npy' % (test_name, suffix),
                    test_gt_samples)

            with open('../results/%s_max%s.json' % (test_name, suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)