Example #1
 def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
     SparseGP.__init__(self,
                       X,
                       likelihood,
                       kernel,
                       Z,
                       X_variance=None,
                       normalize_X=False)
     assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
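
For orientation: FITC approximates a full GP through M inducing inputs Z, usually a subsample of the training inputs, and the assertion above restricts the model to a single output column. A minimal, hypothetical setup sketch follows; the FITC, kernel, and likelihood names are placeholders for whatever the surrounding library provides, not its actual API.

import numpy as np

X = np.random.randn(200, 3)                            # training inputs
M = 20                                                 # number of inducing points
Z = X[np.random.choice(X.shape[0], M, replace=False)]  # inducing inputs drawn from X
# model = FITC(X, likelihood, kernel, Z)               # placeholder instantiation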
Example #2
y_train = y[permutation][0:int(np.round(0.9 * n))]
y_test = y[permutation][int(np.round(0.9 * n)):]

import os.path

np.random.seed(random_seed)

iteration = 0
while iteration < 5:

    # We fit the GP

    np.random.seed(iteration * random_seed)
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                       y_test, minibatch_size=10 * M, max_iterations=50,
                       learning_rate=0.0005)

    pred, uncert = sgp.predict(X_test, 0 * X_test)
    error = np.sqrt(np.mean((pred - y_test)**2))
    testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
    print('Test RMSE: ', error)
    print('Test ll: ', testll)

    pred, uncert = sgp.predict(X_train, 0 * X_train)
    error = np.sqrt(np.mean((pred - y_train)**2))
    trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
    print('Train RMSE: ', error)
    print('Train ll: ', trainll)

    iteration += 1  # advance the counter so the loop terminates
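
The two numbers reported in the loop above are the root-mean-square error and the average Gaussian predictive log-likelihood, which scores each residual under a normal distribution whose variance is the GP's predictive uncertainty. A self-contained sketch of the same computation:

import numpy as np
import scipy.stats as sps

def gp_metrics(pred, y, uncert):
    """Return RMSE and mean Gaussian log-likelihood of the residuals."""
    rmse = np.sqrt(np.mean((pred - y) ** 2))
    ll = np.mean(sps.norm.logpdf(pred - y, scale=np.sqrt(uncert)))
    return rmse, ll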
Example #3
    if os.path.exists(save_dir + 'Test_RMSE_ll.txt'):
        os.remove(save_dir + 'Test_RMSE_ll.txt')
    if os.path.exists(save_dir + 'best_arc_scores.txt'):
        os.remove(save_dir + 'best_arc_scores.txt')
    while iteration < BO_rounds:

        if args.predictor:
            pred = model.predictor(torch.FloatTensor(X_test).to(device))
            pred = pred.detach().cpu().numpy()
            pred = (-pred - mean_y_train) / std_y_train
            uncert = np.zeros_like(pred)
        else:
            # We fit the GP
            M = 500
            sgp = SparseGP(X_train, 0 * X_train, y_train, M)
            sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                               y_test, minibatch_size=2 * M,
                               max_iterations=max_iter, learning_rate=lr)
            pred, uncert = sgp.predict(X_test, 0 * X_test)

        print("predictions: ", pred.reshape(-1))
        print("real values: ", y_test.reshape(-1))
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)
        pearson = float(pearsonr(pred.reshape(-1), y_test.reshape(-1))[0])
        print('Pearson r: ', pearson)
        with open(save_dir + 'Test_RMSE_ll.txt', 'a') as test_file:
            test_file.write(
                'Test RMSE: {:.4f}, ll: {:.4f}, Pearson r: {:.4f}\n'.format(
                    error, testll, pearson))
Example #4
    y = y.reshape((-1, 1))
    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    iteration = 0
    while iteration < 5:

        np.random.seed(iteration * args.seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                           y_test, minibatch_size=10 * M,
                           max_iterations=cmd_args.num_epochs,
                           learning_rate=args.gp_lr)

        # pred, uncert = sgp.predict(X_test, 0 * X_test)
        # error = np.sqrt(np.mean((pred - y_test)**2))
        # testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        # print('Test RMSE: ', error)
        # print('Test ll: ', testll)

        # pred, uncert = sgp.predict(X_train, 0 * X_train)
        # error = np.sqrt(np.mean((pred - y_train)**2))
        # trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        # print('Train RMSE: ', error)
        # print('Train ll: ', trainll)

        iteration += 1  # advance the counter so the loop terminates
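
The same shuffled 90/10 split recurs throughout these examples; it can be factored into a small helper (a sketch, not part of any of the source repositories):

import numpy as np

def split_90_10(X, y, seed=0):
    """Shuffle X and y together and split them 90% train / 10% test."""
    rng = np.random.RandomState(seed)
    perm = rng.permutation(X.shape[0])
    cut = int(np.round(0.9 * X.shape[0]))
    return X[perm[:cut]], X[perm[cut:]], y[perm[:cut]], y[perm[cut:]]

Usage: X_train, X_test, y_train, y_test = split_90_10(X, y).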
Example #5
    # y /= np.max(y)
    assert X.shape[0] == y.shape[0]

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    np.random.seed(0)
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                       y_test, minibatch_size=10 * M,
                       max_iterations=cmd_args.num_epochs,
                       learning_rate=args.gp_lr)

    with open('%s/sgp-e-%d-seed-%d-lr-%.4f.txt' % (cmd_args.save_dir, cmd_args.num_epochs, args.seed, args.gp_lr), 'w') as f:
        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        f.write('Test RMSE: %.10f\n' % error)
        f.write('Test ll: %.10f\n' % testll)
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_train, 0 * X_train)
        error = np.sqrt(np.mean((pred - y_train)**2))
        trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
Example #6
    # y /= np.max(y)
    assert X.shape[0] == y.shape[0]

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    np.random.seed(0)
    M = 500
    sgp = SparseGP(X_train, 0 * X_train, y_train, M)
    sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                       y_test, minibatch_size=10 * M,
                       max_iterations=cmd_args.num_epochs,
                       learning_rate=args.gp_lr)

    with open('%s/sgp-e-%d-seed-%d-lr-%.4f.txt' %
              (cmd_args.save_dir, cmd_args.num_epochs, args.seed, args.gp_lr),
              'w') as f:
        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        f.write('Test RMSE: %.10f\n' % error)
        f.write('Test ll: %.10f\n' % testll)
        print('Test RMSE: ', error)
        print('Test ll: ', testll)
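
Examples #5 and #6 are two renderings of the same script. The '%'-style path concatenation works, but os.path.join is the safer idiom for building the output path (a sketch using the same variable names):

import os

fname = 'sgp-e-%d-seed-%d-lr-%.4f.txt' % (cmd_args.num_epochs, args.seed, args.gp_lr)
out_path = os.path.join(cmd_args.save_dir, fname)  # handles missing or doubled separators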
Example #8
def run_bo_demo():
    import sys
    sys.path.append('/home/icml18-jtnn')
    import pickle
    import gzip
    from sparse_gp import SparseGP
    import scipy.stats as sps
    import numpy as np
    import os.path
    import time
    import rdkit
    from rdkit.Chem import MolFromSmiles, MolToSmiles
    from rdkit.Chem import Descriptors
    from rdkit.Chem import PandasTools
    import torch
    import torch.nn as nn
    from jtnn import create_var, JTNNVAE, Vocab

    start_time = time.time()
    lg = rdkit.RDLogger.logger()
    lg.setLevel(rdkit.RDLogger.CRITICAL)

    # We define the functions used to load and save objects
    def save_object(obj, filename):
        result = pickle.dumps(obj)
        with gzip.GzipFile(filename, 'wb') as dest:  # the with-block closes the file
            dest.write(result)

    def load_object(filename):
        with gzip.GzipFile(filename, 'rb') as source:  # the with-block closes the file
            result = source.read()
        return pickle.loads(result)
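    # Clarifying note: these two helpers round-trip arbitrary Python objects
    # through gzip-compressed pickles, e.g.:
    #   save_object(scores, 'scores.dat'); scores = load_object('scores.dat')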

    vocab_path = '../data/vocab.txt'
    vocab = [x.strip("\r\n ") for x in open(vocab_path)]
    vocab = Vocab(vocab)
    hidden_size = 450
    latent_size = 56
    depth = 3
    random_seed = 1
    model = JTNNVAE(vocab, hidden_size, latent_size, depth)
    model.load_state_dict(
        torch.load('../molvae/MPNVAE-h450-L56-d3-beta0.005/model.iter-4',
                   map_location=lambda storage, loc: storage))
    #model = model.cuda()

    # We load the random seed
    np.random.seed(random_seed)

    # We load the data (note that y is negated)
    X = np.loadtxt('latent_features_demo.txt')
    y = -np.loadtxt('targets_demo.txt')
    y = y.reshape((-1, 1))

    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.8 * n)), :]
    X_test = X[permutation, :][int(np.round(0.8 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.8 * n))]
    y_test = y[permutation][int(np.round(0.8 * n)):]

    np.random.seed(random_seed)

    logP_values = np.loadtxt('logP_values_demo.txt')
    SA_scores = np.loadtxt('SA_scores_demo.txt')
    cycle_scores = np.loadtxt('cycle_scores_demo.txt')
    SA_scores_normalized = (np.array(SA_scores) -
                            np.mean(SA_scores)) / np.std(SA_scores)
    logP_values_normalized = (np.array(logP_values) -
                              np.mean(logP_values)) / np.std(logP_values)
    cycle_scores_normalized = (np.array(cycle_scores) -
                               np.mean(cycle_scores)) / np.std(cycle_scores)
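    # Clarifying note: these z-score statistics are reused below to normalise
    # the scores of newly decoded molecules; the objective is the penalised
    # logP, score = logP_norm + SA_norm + cycle_norm, and y stores -score.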

    iteration = 0
    while iteration < 1:
        # We fit the GP
        np.random.seed(iteration * random_seed)
        M = 1
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        sgp.train_via_ADAM(X_train,
                           0 * X_train,
                           y_train,
                           X_test,
                           X_test * 0,
                           y_test,
                           minibatch_size=2,
                           max_iterations=100,
                           learning_rate=0.001)

        pred, uncert = sgp.predict(X_test, 0 * X_test)
        error = np.sqrt(np.mean((pred - y_test)**2))
        testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        # print('Test RMSE: ', error)
        # print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_train, 0 * X_train)
        error = np.sqrt(np.mean((pred - y_train)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        # print('Train RMSE: ', error)
        # print('Train ll: ', trainll)

        # We pick the next 60 inputs
        next_inputs = sgp.batched_greedy_ei(60, np.min(X_train, 0),
                                            np.max(X_train, 0))
        valid_smiles = []
        valid_mols = []
        new_features = []
        for i in range(60):
            all_vec = next_inputs[i].reshape((1, -1))
            tree_vec, mol_vec = np.hsplit(all_vec, 2)
            tree_vec = create_var(torch.from_numpy(tree_vec).float())
            mol_vec = create_var(torch.from_numpy(mol_vec).float())
            s = model.decode(tree_vec, mol_vec, prob_decode=False)
            if s is not None:
                valid_smiles.append(s)
                # print(MolFromSmiles(s))
                valid_mols.append(str(MolFromSmiles(s)))
                new_features.append(all_vec)

        print(len(valid_smiles), "molecules are found")
        valid_smiles = valid_smiles[:50]
        valid_mols = valid_mols[:50]
        new_features = next_inputs[:50]
        new_features = np.vstack(new_features)

        import sascorer
        import networkx as nx
        from rdkit.Chem import rdmolops

        scores = []
        for i in range(len(valid_smiles)):
            current_log_P_value = Descriptors.MolLogP(
                MolFromSmiles(valid_smiles[i]))
            current_SA_score = -sascorer.calculateScore(
                MolFromSmiles(valid_smiles[i]))
            cycle_list = nx.cycle_basis(
                nx.Graph(
                    rdmolops.GetAdjacencyMatrix(MolFromSmiles(
                        valid_smiles[i]))))
            if len(cycle_list) == 0:
                cycle_length = 0
            else:
                cycle_length = max([len(j) for j in cycle_list])
            if cycle_length <= 6:
                cycle_length = 0
            else:
                cycle_length = cycle_length - 6

            current_cycle_score = -cycle_length

            current_SA_score_normalized = (
                current_SA_score - np.mean(SA_scores)) / np.std(SA_scores)
            current_log_P_value_normalized = (
                current_log_P_value -
                np.mean(logP_values)) / np.std(logP_values)
            current_cycle_score_normalized = (
                current_cycle_score -
                np.mean(cycle_scores)) / np.std(cycle_scores)

            score = current_SA_score_normalized + current_log_P_value_normalized + current_cycle_score_normalized
            scores.append(-score)  # the target is negated

        # print(valid_smiles)
        # print(scores)

        # save_object(scores, save_dir + "/scores{}.dat".format(iteration))

        if len(new_features) > 0:
            X_train = np.concatenate([X_train, new_features], 0)
            y_train = np.concatenate([y_train, np.array(scores)[:, None]], 0)

        iteration += 1

    # print('Seconds taken: %s' % (time.time() - start_time))
    all_smiles = []
    all_smiles.extend(zip(valid_smiles, scores, valid_mols))
    all_smiles = [(x, -y, z) for x, y, z in all_smiles]
    all_smiles = sorted(all_smiles, key=lambda x: x[1], reverse=True)
    return all_smiles[0:3]
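
The call to sgp.batched_greedy_ei above picks candidate latent points by expected improvement. For reference, a sketch of the standard single-point EI acquisition for minimisation (the batched greedy variant used by SparseGP is not reproduced here):

import numpy as np
import scipy.stats as sps

def expected_improvement(mu, sigma, best, eps=1e-9):
    """EI for minimisation: E[max(best - f(x), 0)] with f(x) ~ N(mu, sigma^2)."""
    sigma = np.maximum(sigma, eps)  # guard against zero predictive std
    z = (best - mu) / sigma
    return (best - mu) * sps.norm.cdf(z) + sigma * sps.norm.pdf(z)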
Example #9
    y = y.reshape((-1, 1))
    n = X.shape[0]
    permutation = np.random.choice(n, n, replace=False)

    X_train = X[permutation, :][0:int(np.round(0.9 * n)), :]
    X_test = X[permutation, :][int(np.round(0.9 * n)):, :]

    y_train = y[permutation][0:int(np.round(0.9 * n))]
    y_test = y[permutation][int(np.round(0.9 * n)):]

    iteration = 0
    while iteration < 5:

        np.random.seed(iteration * args.seed)
        M = 500
        sgp = SparseGP(X_train, 0 * X_train, y_train, M)
        sgp.train_via_ADAM(X_train, 0 * X_train, y_train, X_test, X_test * 0,
                           y_test, minibatch_size=10 * M,
                           max_iterations=cmd_args.num_epochs,
                           learning_rate=args.gp_lr)

        # pred, uncert = sgp.predict(X_test, 0 * X_test)
        # error = np.sqrt(np.mean((pred - y_test)**2))
        # testll = np.mean(sps.norm.logpdf(pred - y_test, scale=np.sqrt(uncert)))
        # print('Test RMSE: ', error)
        # print('Test ll: ', testll)

        # pred, uncert = sgp.predict(X_train, 0 * X_train)
        # error = np.sqrt(np.mean((pred - y_train)**2))
        # trainll = np.mean(sps.norm.logpdf(pred - y_train, scale=np.sqrt(uncert)))
        # print('Train RMSE: ', error)
        # print('Train ll: ', trainll)

        iteration += 1  # advance the counter so the loop terminates
Example #10
def main(input_directory, output_directory):
    """

    :param input_directory: directory to which the output of Branin_Sampler.py was saved.
    :param output_directory: directory in which to save the plots.
    """

    np.random.seed(2)

    # Load the dataset

    X_bran = genfromtxt(input_directory + '/inputs.csv',
                        delimiter=',',
                        dtype='float32')
    y_con = genfromtxt(input_directory + '/constraint_targets.csv',
                       delimiter=',',
                       dtype='int')
    y_reg = genfromtxt(input_directory + '/branin_targets.csv',
                       delimiter=',',
                       dtype='float32')
    y_reg = y_reg.reshape((-1, 1))

    # We convert constraint targets from one-hot to categorical.

    y_con_cat = np.zeros(len(y_con), dtype=int)
    for i, element in enumerate(y_con):
        y_con_cat[i] = 1 if element[0] == 1 else 0

    y_con = y_con_cat
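    # Equivalent vectorised form (a sketch): y_con = (y_con[:, 0] == 1).astype(int)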

    n_bran = X_bran.shape[0]  # number of examples

    permutation = np.random.choice(n_bran, n_bran,
                                   replace=False)  # We shuffle the data

    X_tr_bran = X_bran[permutation, :][40:int(np.round(0.9 * n_bran)), :]  # 50/10 train/test split.
    X_te_bran = X_bran[permutation, :][int(np.round(0.8 * n_bran)):int(np.round(0.9 * n_bran)), :]

    # Indices 10:20 have a balanced class split after the permutation is applied with random seed = 1.
    y_tr_reg = y_reg[permutation][40:int(np.round(0.9 * n_bran))]
    y_te_reg = y_reg[permutation][int(np.round(0.8 * n_bran)):int(np.round(0.9 * n_bran))]
    # No test set for the constraint, as the BNN training subroutine doesn't require it.
    y_tr_con = y_con[permutation][40:int(np.round(0.9 * n_bran))]
    y_te_con = y_con[permutation][int(np.round(0.8 * n_bran)):int(np.round(0.9 * n_bran))]

    # We plot the data used to initialise the surrogate model

    X1 = X_tr_bran[:, 0]
    X2 = X_tr_bran[:, 1]

    save_object(X1, output_directory + "/X1.dat")
    save_object(X2, output_directory + "/X2.dat")

    # We store the best feasible value found in the training set for reference

    feasible_vals = []

    for i in range(X_tr_bran.shape[0]):

        if y_tr_con[i] == 0:
            continue

        feasible_vals.append([branin(tuple(X_tr_bran[i]))])

    best_tr = min(feasible_vals)
    best_tr = best_tr[0]

    save_object(best_tr,
                output_directory + "/best_feasible_training_point.dat")

    # We set the number of data collection iterations

    num_iters = 4

    for iteration in range(num_iters):

        # We train the regression model

        # We fit the GP

        # M = np.int(np.maximum(10,np.round(0.1 * n_bran)))

        M = 20

        sgp = SparseGP(X_tr_bran, 0 * X_tr_bran, y_tr_reg, M)
        sgp.train_via_ADAM(X_tr_bran,
                           0 * X_tr_bran,
                           y_tr_reg,
                           X_te_bran,
                           X_te_bran * 0,
                           y_te_reg,
                           minibatch_size=M,
                           max_iterations=400,
                           learning_rate=0.005)

        save_object(sgp, output_directory + "/sgp{}.dat".format(iteration))

        # We reload the GP we just saved (a previously trained GP could be
        # loaded here instead)

        sgp = load_object(output_directory + "/sgp{}.dat".format(iteration))


        pred, uncert = sgp.predict(X_te_bran, 0 * X_te_bran)
        error = np.sqrt(np.mean((pred - y_te_reg)**2))
        testll = np.mean(
            sps.norm.logpdf(pred - y_te_reg, scale=np.sqrt(uncert)))
        print('Test RMSE: ', error)
        print('Test ll: ', testll)

        pred, uncert = sgp.predict(X_tr_bran, 0 * X_tr_bran)
        error = np.sqrt(np.mean((pred - y_tr_reg)**2))
        trainll = np.mean(
            sps.norm.logpdf(pred - y_tr_reg, scale=np.sqrt(uncert)))
        print('Train RMSE: ', error)
        print('Train ll: ', trainll)

        # We train the constraint network

        # We load the random seed

        seed = 1
        np.random.seed(seed)

        # We load the data

        datasets, n, d, n_labels = load_data(X_tr_bran, y_tr_con, X_te_bran,
                                             y_te_con)

        train_set_x, train_set_y = datasets[0]
        test_set_x, test_set_y = datasets[1]

        N_train = train_set_x.get_value(borrow=True).shape[0]
        N_test = test_set_x.get_value(borrow=True).shape[0]
        layer_sizes = [d, 50, n_labels]
        n_samples = 50
        alpha = 0.5
        learning_rate = 0.001
        v_prior = 1.0
        batch_size = 10
        print('... building model')
        sys.stdout.flush()
        bb_alpha = BB_alpha(layer_sizes, n_samples, alpha, learning_rate,
                            v_prior, batch_size, train_set_x, train_set_y,
                            N_train, test_set_x, test_set_y, N_test)
        print('... training')
        sys.stdout.flush()

        test_error, test_ll = bb_alpha.train(400)

        # We save the trained BNN

        sys.setrecursionlimit(4000)  # Required to save the BNN

        save_object(bb_alpha,
                    output_directory + "/bb_alpha{}.dat".format(iteration))

        # We pick the next num_inputs points by random sampling

        np.random.seed()

        num_inputs = 1

        x1 = np.random.uniform(-5, 10, size=num_inputs)
        x2 = np.random.uniform(0, 15, size=num_inputs)
        random_inputs = np.zeros([num_inputs, 2])
        random_inputs[:, 0] = x1
        random_inputs[:, 1] = x2

        reg_scores = []  # collect y-values for Branin-Hoo function
        con_scores = []  # collect y-values for Constraint function
        probs = []  # collect the probabilities of satisfying the constraint
        log_probs = []  # collect the log probabilities of satisfying the constraint

        for i in range(random_inputs.shape[0]):

            reg_scores.append([branin(tuple(random_inputs[i]))])

            if (random_inputs[i][0] - 2.5)**2 + (random_inputs[i][1] - 7.5)**2 <= 50:
                con_scores.append(np.int64(1))
            else:
                con_scores.append(np.int64(0))
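            # Clarifying note: the feasible region is the disk of radius
            # sqrt(50) centred at (2.5, 7.5); con_scores records 1 inside it
            # and 0 outside.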

            probs.append(
                bb_alpha.prediction_probs(random_inputs[i].reshape(1, d))[0][0][1])
            log_probs.append(
                bb_alpha.pred_log_probs(random_inputs[i].reshape(1, d))[0][0][1])

            print(i)

        # print the value of the Branin-Hoo function at the data points we have acquired

        print(reg_scores)

        # save y-values and (x1,x2)-coordinates of locations chosen for evaluation

        save_object(reg_scores,
                    output_directory + "/scores{}.dat".format(iteration))
        save_object(random_inputs,
                    output_directory + "/next_inputs{}.dat".format(iteration))
        save_object(con_scores,
                    output_directory + "/con_scores{}.dat".format(iteration))
        save_object(probs, output_directory + "/probs{}.dat".format(iteration))
        save_object(log_probs,
                    output_directory + "/log_probs{}.dat".format(iteration))

        # extend labelled training data for next cycle

        X_tr_bran = np.concatenate([X_tr_bran, random_inputs], 0)
        y_tr_reg = np.concatenate([y_tr_reg, np.array(reg_scores)], 0)
        y_tr_con = np.concatenate([y_tr_con, np.array(con_scores)], 0)

    # Plot the best point as a function of the data collection iteration number
    best_so_far(output_directory, num_iters)
    # Plot the contours of the GP regression model
    GP_contours(output_directory, num_iters)
    # Plot the contours of the BNN constraint model
    BNN_contours(output_directory, num_iters)
    # Plot the data used to initialise the model
    initial_data(output_directory)
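
main() assumes a branin helper is already in scope. For reference, a sketch of the standard Branin-Hoo test function on the domain sampled above (the source's exact parameterisation may differ):

import numpy as np

def branin(x):
    """Standard Branin-Hoo function on [-5, 10] x [0, 15]."""
    x1, x2 = x
    a = 1.0
    b = 5.1 / (4 * np.pi ** 2)
    c = 5.0 / np.pi
    r, s, t = 6.0, 10.0, 1.0 / (8 * np.pi)
    return a * (x2 - b * x1 ** 2 + c * x1 - r) ** 2 + s * (1 - t) * np.cos(x1) + s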