Example #1
import numpy as np

import util  # project-local helper module (data loading, VAE construction)

def train_experimental_vaes():
    """Trains and saves VAEs on the GFP data for use in the weighted ML methods"""
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE // 1000)  # e.g. "5k"
    suffix = '_%s' % train_size_str
    for i in [0, 2]:  # random states 1 and 3
        RANDOM_STATE = i + 1
        X_train, _, _ = util.get_experimental_X_y(random_state=RANDOM_STATE,
                                                  train_size=TRAIN_SIZE)
        vae_0 = util.build_vae(latent_dim=20,
                               n_tokens=20,
                               seq_length=X_train.shape[1],
                               enc1_units=50)
        vae_0.fit([X_train], [X_train, np.zeros(X_train.shape[0])],  # zeros: dummy target for the KL output
                  epochs=100,
                  batch_size=10,
                  verbose=2)
        vae_0.encoder_.save_weights("../models/vae_0_encoder_weights%s_%i.h5" %
                                    (suffix, RANDOM_STATE))
        vae_0.decoder_.save_weights("../models/vae_0_decoder_weights%s_%i.h5" %
                                    (suffix, RANDOM_STATE))
        vae_0.vae_.save_weights("../models/vae_0_vae_weights%s_%i.h5" %
                                (suffix, RANDOM_STATE))
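
For reference, the naming scheme above yields files such as ../models/vae_0_encoder_weights_5k_1.h5, which Example #4 (run_killoran) reloads. A quick standalone check of the format strings:

TRAIN_SIZE = 5000
suffix = '_%ik' % (TRAIN_SIZE // 1000)   # "_5k"
print("../models/vae_0_encoder_weights%s_%i.h5" % (suffix, 1))
# -> ../models/vae_0_encoder_weights_5k_1.h5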
Example #2
import numpy as np
import scipy.stats

import util  # project-local helper module
from util import color  # assumed location of the `color` print-formatting helper

def weighted_ml_opt(X_train,
                    oracles,
                    ground_truth,
                    vae_0,
                    weights_type='dbas',
                    LD=20,
                    iters=20,
                    samples=500,
                    homoscedastic=False,
                    homo_y_var=0.1,
                    quantile=0.95,
                    verbose=False,
                    alpha=1,
                    train_gt_evals=None,
                    cutoff=1e-6,
                    it_epochs=10,
                    enc1_units=50):
    """
    Runs weighted maximum likelihood optimization algorithms ('CbAS', 'DbAS',
    RWR, and CEM-PI)
    """

    assert weights_type in ['cbas', 'dbas', 'rwr', 'cem-pi']
    L = X_train.shape[1]
    vae = util.build_vae(latent_dim=LD,
                         n_tokens=20,
                         seq_length=L,
                         enc1_units=enc1_units)

    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    y_star = -np.inf

    for t in range(iters):
        ### Take Samples ###
        zt = np.random.randn(samples, LD)
        if t > 0:
            Xt_p = vae.decoder_.predict(zt)
            Xt = util.get_samples(Xt_p)
        else:
            Xt = X_train

        ### Evaluate ground truth and oracle ###
        yt, yt_var = util.get_balaji_predictions(oracles, Xt)
        if homoscedastic:
            yt_var = np.ones_like(yt) * homo_y_var
        Xt_aa = np.argmax(Xt, axis=-1)
        if t == 0 and train_gt_evals is not None:
            yt_gt = train_gt_evals
        else:
            yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]

        ### Calculate weights for different schemes ###
        if t > 0:
            if weights_type == 'cbas':
                # CbAS weights: w = [p_0(x) / p_t(x)] * P(y >= y* | x)
                log_pxt = np.sum(np.log(Xt_p) * Xt, axis=(1, 2))
                X0_p = vae_0.decoder_.predict(zt)
                log_px0 = np.sum(np.log(X0_p) * Xt, axis=(1, 2))
                w1 = np.exp(log_px0 - log_pxt)
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                w2 = scipy.stats.norm.sf(y_star, loc=yt, scale=np.sqrt(yt_var))
                weights = w1 * w2
            elif weights_type == 'cem-pi':
                pi = scipy.stats.norm.sf(max_train_gt,
                                         loc=yt,
                                         scale=np.sqrt(yt_var))
                pi_thresh = np.percentile(pi, quantile * 100)
                weights = (pi > pi_thresh).astype(int)
            elif weights_type == 'dbas':
                y_star_1 = np.percentile(yt, quantile * 100)
                if y_star_1 > y_star:
                    y_star = y_star_1
                weights = scipy.stats.norm.sf(y_star,
                                              loc=yt,
                                              scale=np.sqrt(yt_var))
            elif weights_type == 'rwr':
                weights = np.exp(alpha * yt)
                weights /= np.sum(weights)
        else:
            weights = np.ones(yt.shape[0])
            max_train_gt = np.max(yt_gt)  # threshold later used by 'cem-pi'

        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            try:
                # slice keeps the 2D shape expected by convert_idx_array_to_aas
                oracle_max_seq = util.convert_idx_array_to_aas(
                    Xt_aa[yt_max_idx:yt_max_idx + 1])[0]
            except IndexError:
                print(Xt_aa[yt_max_idx:yt_max_idx + 1])
            gt_of_oracle_max = yt_gt[yt_max_idx]

        ### Record and print results ###
        if t == 0:
            rand_idx = np.random.randint(0, len(yt), samples)
            oracle_samples[t, :] = yt[rand_idx]
            gt_samples[t, :] = yt_gt[rand_idx]
        else:
            oracle_samples[t, :] = yt
            gt_samples[t, :] = yt_gt

        traj[t, 0] = np.max(yt_gt)    # ground-truth max
        traj[t, 1] = np.mean(yt_gt)   # ground-truth mean
        traj[t, 2] = np.std(yt_gt)    # ground-truth std
        traj[t, 3] = np.max(yt)       # oracle max
        traj[t, 4] = np.mean(yt)      # oracle mean
        traj[t, 5] = np.std(yt)       # oracle std
        traj[t, 6] = np.mean(yt_var)  # mean oracle predictive variance

        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3], color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])

        ### Train model ###
        if t == 0:
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
            cutoff_idx = np.where(weights < cutoff)  # drop negligible-weight samples
            Xt = np.delete(Xt, cutoff_idx, axis=0)
            yt = np.delete(yt, cutoff_idx, axis=0)
            weights = np.delete(weights, cutoff_idx, axis=0)
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=it_epochs,
                    batch_size=10,
                    shuffle=False,
                    sample_weight=[weights, weights],
                    verbose=0)

    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
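
The 'cbas' branch above computes importance weights w(x) = [p_0(x) / p_t(x)] * P(y >= y* | x): a density ratio between the fixed prior VAE and the current VAE, times the oracle's survival probability at the running threshold y*. A minimal standalone sketch of that computation on toy arrays (all shapes and values below are illustrative, not from the repo):

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
# Decoder outputs: per-position token probabilities, shape (batch, seq_length, n_tokens)
Xt_p = rng.dirichlet(np.ones(20), size=(4, 8))     # current decoder p_t
X0_p = rng.dirichlet(np.ones(20), size=(4, 8))     # prior decoder p_0
Xt = np.eye(20)[rng.integers(0, 20, size=(4, 8))]  # one-hot samples
yt = rng.normal(size=4)                            # oracle predictive means
yt_var = np.full(4, 0.1)                           # oracle predictive variances

log_pxt = np.sum(np.log(Xt_p) * Xt, axis=(1, 2))   # log p_t(x)
log_px0 = np.sum(np.log(X0_p) * Xt, axis=(1, 2))   # log p_0(x)
w1 = np.exp(log_px0 - log_pxt)                     # density ratio p_0 / p_t
y_star = np.percentile(yt, 95)
w2 = scipy.stats.norm.sf(y_star, loc=yt, scale=np.sqrt(yt_var))  # P(y >= y*)
weights = w1 * w2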
Example #3
import numpy as np

import util  # project-local helper module
from util import color  # assumed location of the `color` print-formatting helper

def fb_opt(X_train,
           oracles,
           ground_truth,
           vae_0,
           weights_type='fbvae',
           LD=20,
           iters=20,
           samples=500,
           quantile=0.8,
           verbose=False,
           train_gt_evals=None,
           it_epochs=10,
           enc1_units=50):
    """Runs FBVAE optimization algorithm"""

    assert weights_type in ['fbvae']
    L = X_train.shape[1]
    vae = util.build_vae(latent_dim=LD,
                         n_tokens=20,
                         seq_length=L,
                         enc1_units=enc1_units)

    traj = np.zeros((iters, 7))
    oracle_samples = np.zeros((iters, samples))
    gt_samples = np.zeros((iters, samples))
    oracle_max_seq = None
    oracle_max = -np.inf
    gt_of_oracle_max = -np.inf
    y_star = -np.inf
    for t in range(iters):
        ### Take samples and evaluate ground truth and oracle ###
        zt = np.random.randn(samples, LD)
        if t > 0:
            Xt_sample_p = vae.decoder_.predict(zt)
            Xt_sample = util.get_samples(Xt_sample_p)
            yt_sample, _ = util.get_balaji_predictions(oracles, Xt_sample)
            Xt_aa_sample = np.argmax(Xt_sample, axis=-1)
            yt_gt_sample = ground_truth.predict(Xt_aa_sample,
                                                print_every=1000000)[:, 0]
        else:
            Xt = X_train
            yt, _ = util.get_balaji_predictions(oracles, Xt)
            Xt_aa = np.argmax(Xt, axis=-1)
            fb_thresh = np.percentile(yt, quantile * 100)
            if train_gt_evals is not None:
                yt_gt = train_gt_evals
            else:
                yt_gt = ground_truth.predict(Xt_aa, print_every=1000000)[:, 0]

        ### Calculate threshold ###
        if t > 0:
            threshold_idx = np.where(yt_sample >= fb_thresh)[0]
            n_top = len(threshold_idx)
            sample_arrs = [Xt_sample, yt_sample, yt_gt_sample, Xt_aa_sample]
            full_arrs = [Xt, yt, yt_gt, Xt_aa]

            for l in range(len(full_arrs)):
                sample_arr = sample_arrs[l]
                full_arr = full_arrs[l]
                sample_top = sample_arr[threshold_idx]
                full_arr = np.concatenate([sample_top, full_arr])
                full_arr = np.delete(full_arr,
                                     range(full_arr.shape[0] - n_top,
                                           full_arr.shape[0]),
                                     axis=0)
                full_arrs[l] = full_arr
            Xt, yt, yt_gt, Xt_aa = full_arrs
        yt_max_idx = np.argmax(yt)
        yt_max = yt[yt_max_idx]
        if yt_max > oracle_max:
            oracle_max = yt_max
            try:
                # slice keeps the 2D shape expected by convert_idx_array_to_aas
                oracle_max_seq = util.convert_idx_array_to_aas(
                    Xt_aa[yt_max_idx:yt_max_idx + 1])[0]
            except IndexError:
                print(Xt_aa[yt_max_idx:yt_max_idx + 1])
            gt_of_oracle_max = yt_gt[yt_max_idx]

        ### Record and print results ###

        rand_idx = np.random.randint(0, len(yt), samples)
        oracle_samples[t, :] = yt[rand_idx]
        gt_samples[t, :] = yt_gt[rand_idx]

        traj[t, 0] = np.max(yt_gt)    # ground-truth max
        traj[t, 1] = np.mean(yt_gt)   # ground-truth mean
        traj[t, 2] = np.std(yt_gt)    # ground-truth std
        traj[t, 3] = np.max(yt)       # oracle max
        traj[t, 4] = np.mean(yt)      # oracle mean
        traj[t, 5] = np.std(yt)       # oracle std
        if t > 0:
            traj[t, 6] = n_top        # new samples that cleared fb_thresh
        else:
            traj[t, 6] = 0

        if verbose:
            print(weights_type.upper(), t, traj[t, 0],
                  color.BOLD + str(traj[t, 1]) + color.END, traj[t, 2],
                  traj[t, 3], color.BOLD + str(traj[t, 4]) + color.END,
                  traj[t, 5], traj[t, 6])

        ### Train model ###
        if t == 0:
            vae.encoder_.set_weights(vae_0.encoder_.get_weights())
            vae.decoder_.set_weights(vae_0.decoder_.get_weights())
            vae.vae_.set_weights(vae_0.vae_.get_weights())
        else:
            vae.fit([Xt], [Xt, np.zeros(Xt.shape[0])],
                    epochs=1,
                    batch_size=10,
                    shuffle=False,
                    verbose=0)

    max_dict = {
        'oracle_max': oracle_max,
        'oracle_max_seq': oracle_max_seq,
        'gt_of_oracle_max': gt_of_oracle_max
    }
    return traj, oracle_samples, gt_samples, max_dict
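
The feedback step in fb_opt keeps each array at a fixed size: new samples whose oracle score clears fb_thresh are prepended, and the same number of trailing entries are dropped. A toy illustration of that concatenate/delete pattern (values are made up):

import numpy as np

full_arr = np.array([10, 11, 12, 13, 14])   # existing data
sample_top = np.array([99, 98])             # new samples above the threshold
n_top = len(sample_top)

full_arr = np.concatenate([sample_top, full_arr])
full_arr = np.delete(full_arr,
                     range(full_arr.shape[0] - n_top, full_arr.shape[0]),
                     axis=0)
print(full_arr)   # [99 98 10 11 12]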
Example #4
import json

import numpy as np
import tensorflow as tf
import keras
from keras import backend as K

import gfp_gp             # project-local modules
import losses
import optimization_algs
import util

def run_killoran(killoran=True):
    """Runs the GFP comparative tests on the Killoran (aka AM-VAE) optimization algorithm"""
    TRAIN_SIZE = 5000
    train_size_str = "%ik" % (TRAIN_SIZE // 1000)  # e.g. "5k"
    for i in range(3):
        RANDOM_STATE = i + 1
        print(RANDOM_STATE)
        num_models = [1, 5, 20][i]
        X_train, _, _ = util.get_experimental_X_y(random_state=RANDOM_STATE,
                                                  train_size=TRAIN_SIZE)

        LD = 20
        L = X_train.shape[1]

        vae_suffix = '_%s_%i' % (train_size_str, RANDOM_STATE)

        ground_truth = gfp_gp.SequenceGP(load=True,
                                         load_prefix="../data/gfp_gp")
        loss = losses.neg_log_likelihood
        keras.utils.get_custom_objects().update({"neg_log_likelihood": loss})
        oracle_suffix = '_%s_%i_%i' % (train_size_str, num_models,
                                       RANDOM_STATE)

        sess = tf.Session(graph=tf.get_default_graph())
        K.set_session(sess)
        vae = util.build_vae(latent_dim=20,
                             n_tokens=20,
                             seq_length=X_train.shape[1],
                             enc1_units=50)
        vae.encoder_.load_weights("../models/vae_0_encoder_weights%s.h5" %
                                  vae_suffix)
        vae.decoder_.load_weights("../models/vae_0_decoder_weights%s.h5" %
                                  vae_suffix)
        vae.vae_.load_weights("../models/vae_0_vae_weights%s.h5" % vae_suffix)

        oracles = [
            keras.models.load_model("../models/oracle_%i%s.h5" %
                                    (i, oracle_suffix))
            for i in range(num_models)
        ]
        if not killoran:
            results, test_max = optimization_algs.killoran_opt(X_train,
                                                               vae,
                                                               oracles,
                                                               ground_truth,
                                                               steps=30000,
                                                               epsilon1=1e-5,
                                                               epsilon2=1.,
                                                               noise_std=1e-5,
                                                               LD=20,
                                                               verbose=False,
                                                               adam=False)

            np.save(
                "../results/mala_results_%s_%i.npy" %
                (train_size_str, RANDOM_STATE), results)
            suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)
            with open('../results/%s_max%s.json' % ('mala', suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)

        else:
            results, test_max = optimization_algs.killoran_opt(X_train,
                                                               vae,
                                                               oracles,
                                                               ground_truth,
                                                               steps=10000,
                                                               epsilon1=0.,
                                                               epsilon2=0.1,
                                                               noise_std=1e-6,
                                                               LD=20,
                                                               verbose=False,
                                                               adam=True)
            np.save(
                "../results/killoran_may_results_%s_%i.npy" %
                (train_size_str, RANDOM_STATE), results)
            suffix = "_%s_%i" % (train_size_str, RANDOM_STATE)
            with open('../results/%s_max%s.json' % ('killoran', suffix),
                      'w') as outfile:
                json.dump(test_max, outfile)
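
A minimal driver, assuming this function lives in a script executed from a subdirectory of the repo (the '../models' and '../results' paths above imply that layout):

if __name__ == "__main__":
    run_killoran(killoran=True)    # Adam-based Killoran / AM-VAE runs
    run_killoran(killoran=False)   # MALA runs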