import pickle

import numpy as np
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

from molecules.model import MoleculeVAE
from molecules.utils import load_dataset

NUM_EPOCHS = 100    # assumed value, matching the standalone script later in this listing
RANDOM_SEED = 123   # assumed value

def main():
    np.random.seed(RANDOM_SEED)

    data_train, data_test, charset = load_dataset('data/processed.h5')
    print("Charset", charset)
    model = MoleculeVAE()
    model.create(charset, latent_rep_size=292)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.0001)
    checkpointer = ModelCheckpoint(filepath='model.h5',
                                   verbose=1,
                                   save_best_only=True)

    history = model.autoencoder.fit(data_train[:1000],
                                    data_train[:1000],
                                    shuffle=True,
                                    epochs=NUM_EPOCHS,
                                    batch_size=100,
                                    callbacks=[checkpointer, reduce_lr],
                                    validation_data=(data_test[:1000],
                                                     data_test[:1000]))
    with open('trainHistoryDict', 'wb') as file_pi:
        pickle.dump(history.history, file_pi)
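For quick inspection, a minimal sketch (assuming matplotlib is installed) that reloads the pickled history written above and plots the loss curves:

import pickle
import matplotlib.pyplot as plt

with open('trainHistoryDict', 'rb') as f:
    history = pickle.load(f)

plt.plot(history['loss'], label='train loss')           # keys written by Keras fit()
plt.plot(history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.legend()
plt.savefig('loss.png')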
Example #3
def visualize_model(args):
    model = MoleculeVAE()

    data, charset = load_dataset(args.data, split=False)

    if os.path.isfile(args.model):
        model.load(charset, args.model)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    plot(model.autoencoder, to_file=args.outfile)
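plot here is presumably the Keras 1 visualization helper (keras.utils.visualize_util.plot); under Keras 2 the equivalent call would be roughly:

from keras.utils import plot_model  # Keras 2 replacement for visualize_util.plot

plot_model(model.autoencoder, to_file=args.outfile, show_shapes=True)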
Example #5
def main():
    args = get_arguments()
    np.random.seed(args.random_seed)
    
    from molecules.model import MoleculeVAE
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
    
    data = pd.read_hdf(args.data, 'table')
    structures = data['structure']

    # import gzip
    # filepath = args.data
    # structures = [line.split()[0].strip() for line in gzip.open(filepath) if line]

    # can also use CanonicalSmilesDataGenerator
    datobj = SmilesDataGenerator(structures, MAX_LEN,
                                 test_split=args.test_split,
                                 random_seed=args.random_seed)
    test_divisor = int((1 - datobj.test_split) / datobj.test_split)
    train_gen = datobj.train_generator(args.batch_size)
    test_gen = datobj.test_generator(args.batch_size)

    # reformulate generators to not use weights
    train_gen = ((tens, tens) for (tens, _, weights) in train_gen)
    test_gen = ((tens, tens) for (tens, _, weights) in test_gen)

    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(datobj.chars, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(datobj.chars, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model,
                                   verbose=1,
                                   save_best_only=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.0001)

    # Keras 2 generator API: epoch lengths are given in steps (batches), not samples
    model.autoencoder.fit_generator(
        train_gen,
        steps_per_epoch=args.epoch_size // args.batch_size,
        epochs=args.epochs,
        callbacks=[checkpointer, reduce_lr],
        validation_data=test_gen,
        validation_steps=max(1, args.epoch_size // (test_divisor * args.batch_size)),
        use_multiprocessing=True
    )
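A quick sanity check on the reformulated generators (a sketch to run before training; it consumes one batch):

x_batch, y_batch = next(train_gen)
print(x_batch.shape)        # expected: (batch_size, MAX_LEN, len(datobj.chars))
assert x_batch is y_batch   # autoencoder targets are the inputs themselves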
def main():
    args = get_arguments()
    model = MoleculeVAE()

    if args.target == 'autoencoder':
        autoencoder(args, model)
    elif args.target == 'encoder':
        encoder(args, model)
    elif args.target == 'decoder':
        decoder(args, model)
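Since all three targets share the same signature, the elif chain can also be written as a dictionary dispatch (a stylistic sketch, not the original code):

targets = {'autoencoder': autoencoder, 'encoder': encoder, 'decoder': decoder}
targets[args.target](args, model)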
Example #8
def main():
    args = get_arguments()

    if os.path.isfile(args.data):
        h5f = h5py.File(args.data, 'r')
        charset = list(h5f['charset'][:])
        h5f.close()
    else:
        raise ValueError("Data file %s doesn't exist" % args.data)

    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    results = interpolate(args.source, args.dest, args.steps, charset, model,
                          args.latent_dim, args.width)
    for result in results:
        print(result[0], result[2])
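interpolate() itself is not shown in this listing; a hypothetical sketch of the usual approach, decoding evenly spaced points on the line between two latent endpoints (decode_smiles_from_indexes is the helper from molecules.utils; the rest is assumed):

import numpy as np
from molecules.utils import decode_smiles_from_indexes

def interpolate_latent(z_source, z_dest, steps, charset, model):
    # decode points on the straight line between the two latent vectors
    for t in np.linspace(0.0, 1.0, steps):
        z = ((1.0 - t) * z_source + t * z_dest).reshape(1, -1)
        probs = model.decoder.predict(z)    # shape (1, max_len, len(charset))
        indexes = probs.argmax(axis=2)[0]
        yield t, decode_smiles_from_indexes(indexes, charset)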
Example #9
def main():
    args = get_arguments()
    np.random.seed(args.random_seed)

    from molecules.model import MoleculeVAE
    from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
        decode_smiles_from_indexes, load_dataset
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    data_train, data_test, charset = load_dataset(args.data)
    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model,
                                   verbose=1,
                                   save_best_only=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.0001)

    model.autoencoder.fit(data_train,
                          data_train,
                          shuffle=True,
                          epochs=args.epochs,
                          batch_size=args.batch_size,
                          callbacks=[checkpointer, reduce_lr],
                          validation_data=(data_test, data_test))
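get_arguments() is not part of this listing; a hypothetical argparse sketch covering the flags the training snippets use (names and defaults here are assumptions, not the original parser):

import argparse

def get_arguments():
    parser = argparse.ArgumentParser(description='Train a molecular VAE')
    parser.add_argument('data', help='HDF5 file with the preprocessed dataset')
    parser.add_argument('model', help='path to load/save model weights')
    parser.add_argument('--latent_dim', type=int, default=292)    # assumed default
    parser.add_argument('--epochs', type=int, default=100)        # assumed default
    parser.add_argument('--batch_size', type=int, default=100)    # assumed default
    parser.add_argument('--random_seed', type=int, default=123)   # assumed default
    return parser.parse_args()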
Example #10
def main():
    args = get_arguments()
    data_train, data_test, charset = load_dataset(args.data)
    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model,
                                   verbose=1,
                                   save_best_only=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.0001)

    model.autoencoder.fit(data_train,
                          data_train,
                          shuffle=True,
                          epochs=args.epochs,
                          batch_size=args.batch_size,
                          callbacks=[checkpointer, reduce_lr],
                          validation_data=(data_test, data_test))
def main():
    args = get_arguments()
    model = MoleculeVAE()

    data, data_test, charset = load_dataset(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    x_latent = model.encoder.predict(data)
    if not args.visualize:
        if not args.save_h5:
            np.savetxt(sys.stdout, x_latent, delimiter='\t')
        else:
            h5f = h5py.File(args.save_h5, 'w')
            h5f.create_dataset('charset', data=charset)
            h5f.create_dataset('latent_vectors', data=x_latent)
            h5f.close()
    else:
        visualize_latent_rep(args, model, x_latent)
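Reading the saved latent vectors back is symmetric (a short sketch; the path is whatever --save_h5 pointed at):

import h5py

with h5py.File('latents.h5', 'r') as h5f:
    charset = h5f['charset'][:]
    x_latent = h5f['latent_vectors'][:]
print(x_latent.shape)   # (num_molecules, latent_dim)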
Example #13
def main():
    args = get_arguments()
    data_train, data_test, tokens = load_dataset(args.data, "VAE")
    params = AttnParams()
    params.load(args.model + "params.pkl")
    model = MoleculeVAE(tokens, params)

    IBUPROFEN_SMILES = 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'
    # encode the one-hot SMILES into a sampled latent vector
    sampled = model.encode_sample.predict(
        tokens.onehotify(IBUPROFEN_SMILES, params["len_limit"]))
    print(sampled, np.shape(sampled))
    # decode the latent vector back to per-position token probabilities
    sampled = model.decode.predict(sampled.reshape(1, params["latent_dim"]))
    print(sampled, np.shape(sampled))
    # greedy decode: keep the most likely token at each position
    sampled = sampled.argmax(axis=2)[0]
    print(sampled, np.shape(sampled))
    print("Final output", ''.join([tokens.id2t[s] for s in sampled]))
Example #14
def main():
    args = get_arguments()
    np.random.seed(args.random_seed)

    from molecules.model import MoleculeVAE, SimpleMoleculeVAE
    from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
        decode_smiles_from_indexes, load_dataset
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    if args.num_cores != -1:
        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1,
                                allow_soft_placement=True,
                                device_count={'CPU': args.num_cores})
        session = tf.Session(config=config)
        K.set_session(session)

    data_train, data_test, charset = load_dataset(args.data)

    if args.simple:
        model = SimpleMoleculeVAE()
    else:
        model = MoleculeVAE()

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model,
                                   verbose=1,
                                   save_best_only=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.0001)

    # plot_model(model, to_file='model.png')
    history = model.autoencoder.fit(data_train,
                                    data_train,
                                    shuffle=True,
                                    epochs=args.epochs,
                                    batch_size=args.batch_size,
                                    callbacks=[checkpointer, reduce_lr],
                                    validation_data=(data_test, data_test))
    with open('history.p', 'wb') as f:
        pickle.dump(history.history, f)
Example #16
import h5py
import numpy as np
from molecules.model import MoleculeVAE
from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
    decode_smiles_from_indexes, load_dataset
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

NUM_EPOCHS = 100
BATCH_SIZE = 10
LATENT_DIM = 128
RANDOM_SEED = 123

np.random.seed(RANDOM_SEED)

data_train, data_test, charset = load_dataset('./data/processed.h5')
model = MoleculeVAE()
# to resume from saved weights instead:
# model.load(charset, 'model.h5', latent_rep_size=LATENT_DIM)
model.create(charset, latent_rep_size=LATENT_DIM)

checkpointer = ModelCheckpoint(
    filepath='./test_models/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1,
    save_best_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2,
                              patience=3,
                              min_lr=0.0001)

# smoke test: overfit a single training sample
data_train = data_train[:1]
model.autoencoder.fit(data_train,
                      data_train,
                      shuffle=True,
                      epochs=NUM_EPOCHS,
                      batch_size=BATCH_SIZE,
                      callbacks=[checkpointer, reduce_lr],
                      validation_data=(data_test, data_test))
Example #17
def main():
    args = get_arguments()
    model_dir = args.model_path

    # Get Params
    model_params = AttnParams()
    model_params.load(model_dir + "params.pkl")
    print("Analysing model", model_dir)
    model_params.dump()
    # Get data
    d_file = model_params["data"]
    if model_params["bottleneck"] == "conv" or model_params["decoder"] == "VAE":
        d_type = "onehot"
    else:
        d_type = "cat"
    data_train, data_test, props_train, props_test, tokens = load_dataset(d_file, d_type, False)
    props_train, props_test, prop_labels = load_properties(d_file)

    if "TRANSFORMER" in model_params["decoder"]:
        # Model is an attention based model
        model = TriTransformer(tokens, model_params)
        model.build_models()
        model.compile_vae(Adam(0.001, 0.9, 0.98, epsilon=1e-9))
    else:
        # Model is GRU
        model = MoleculeVAE(tokens, model_params)

    # Assess how close each dimension is to a Gaussian
    # Try to load property training data
    if not exists(model_dir + "latents.h5") and "dothis" == "nothanks":  # second clause is always False: block disabled
        print("Generating latent representations from auto-encoder")
        z_train = model.encode_sample.predict([data_train], 64)
        z_test = model.encode_sample.predict([data_test], 64)

        with h5py.File(model_dir + "latents.h5", 'w') as dfile:
            dfile.create_dataset('z_test', data=z_test)
            dfile.create_dataset('z_train', data=z_train)

    print("KURTOSIS:")
    # latent_distributions(model_dir + 'latents.h5', plot_kd=True)

    # Test random molecule
    print("Example decodings with ibruprofen (beam width = 5):")
    print("\tIbuprofen smiles:\t{}".format(IBUPROFEN_SMILES))
    s = model.decode_from_string(IBUPROFEN_SMILES, beam_width=5)
    for i, seq in enumerate(s):
        print("\t\tDecoding {}:\t\t{}".format(i + 1, seq[0]))

    print("Exploring property distributions of chemicals from {} decoding(s) of {} random seed(s):".format(
        args.n_decodings,
        args.n_seeds))
    with supress_stderr():
        if args.prior_sample:
            output = rand_mols(args.n_seeds, model_params["latent_dim"], model, args.beam_width)
        else:
            output = property_distributions(data_test, props_test,
                                            num_seeds=args.n_seeds,
                                            num_decodings=args.n_decodings,
                                            model=model,
                                            beam_width=args.beam_width,
                                            data_file=None)

    print("Generated {} molecules, of which {} were valid.".format(output["num_mols"], output["num_valid"]))
    print("\tValid mols:\t {:.2f}".format(output["num_valid"] / output["num_mols"]))
    if "num_novel" in output: print("\tNovel mols:\t{:.2f}".format(output["num_novel"]))
    print("\tSuccess frac:\t{:.2f}".format(output["success_frac"]))
    print("\tYield:\t{:.2f}".format(output["yield"]))
    for (i, key) in enumerate(rdkit_funcs):
        if key in prop_labels:
            k = prop_labels.index(key)

            print("\t{}:".format(key))
            dat = props_test[:, k]
            print("\t\tTest distribution:\t {:.2f} ± {:.2f}".format(np.mean(dat), np.std(dat)))

            gen_dat = output["gen_props"][:, i]
            print("\t\tGenerated distribution:\t {:.2f} ± {:.2f}".format(np.mean(gen_dat), np.std(gen_dat)))