def main():
    """Train the molecule VAE on a fixed 1000-sample subset and pickle the history.

    Reads ``data/processed.h5``, builds a fresh model (latent size 292), and
    writes the Keras training history to ``trainHistoryDict``.
    """
    np.random.seed(RANDOM_SEED)

    data_train, data_test, charset = load_dataset('data/processed.h5')
    print("Charset", charset)

    model = MoleculeVAE()
    model.create(charset, latent_rep_size=292)

    # Checkpoint the best weights and decay the LR when val_loss plateaus.
    saver = ModelCheckpoint(filepath='model.h5', verbose=1, save_best_only=True)
    lr_drop = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                patience=3, min_lr=0.0001)

    # Only the first 1000 samples of each split are used (quick smoke run).
    train_subset = data_train[:1000]
    test_subset = data_test[:1000]
    run = model.autoencoder.fit(train_subset, train_subset,
                                shuffle=True,
                                nb_epoch=NUM_EPOCHS,
                                batch_size=100,
                                callbacks=[saver, lr_drop],
                                validation_data=(test_subset, test_subset))

    with open('trainHistoryDict', 'wb') as handle:
        pickle.dump(run.history, handle)
def main():
    """Train a MoleculeVAE on the dataset given by CLI args, resuming if possible."""
    args = get_arguments()
    np.random.seed(args.random_seed)

    # Imports deferred until after the seed is set, as in the original script.
    from molecules.model import MoleculeVAE
    from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
        decode_smiles_from_indexes, load_dataset
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    data_train, data_test, charset = load_dataset(args.data)

    model = MoleculeVAE()
    # Resume from an existing weights file when one is present.
    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    best_saver = ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True)
    lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                    patience=3, min_lr=0.0001)

    model.autoencoder.fit(
        data_train, data_train,
        shuffle=True,
        nb_epoch=args.epochs,
        batch_size=args.batch_size,
        callbacks=[best_saver, lr_schedule],
        validation_data=(data_test, data_test),
    )
def visualize_model(args):
    """Render the architecture of a saved autoencoder to an image file.

    Raises ValueError when the model weights file does not exist.
    """
    model = MoleculeVAE()
    data, charset = load_dataset(args.data, split=False)

    # Guard clause: visualization only makes sense for an existing model.
    if not os.path.isfile(args.model):
        raise ValueError("Model file %s doesn't exist" % args.model)
    model.load(charset, args.model)

    plot(model.autoencoder, to_file=args.outfile)
def main():
    """Train a MoleculeVAE from SMILES structures streamed by a generator.

    Reads structures from an HDF5 table, wraps them in a SmilesDataGenerator,
    and fits the autoencoder with checkpointing and LR reduction.
    """
    args = get_arguments()
    np.random.seed(args.random_seed)

    from molecules.model import MoleculeVAE
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    data = pd.read_hdf(args.data, 'table')
    structures = data['structure']

    # import gzip
    # filepath = args.data
    # structures = [line.split()[0].strip() for line in gzip.open(filepath) if line]

    # can also use CanonicalSmilesDataGenerator
    datobj = SmilesDataGenerator(structures, MAX_LEN,
                                 test_split=args.test_split,
                                 random_seed=args.random_seed)
    # FIX: clamp the divisor to >= 1 — a test_split > 0.5 made
    # int((1 - split) / split) == 0 and crashed with ZeroDivisionError below.
    test_divisor = max(1, int((1 - datobj.test_split) / datobj.test_split))

    train_gen = datobj.train_generator(args.batch_size)
    test_gen = datobj.test_generator(args.batch_size)

    # reformulate generators to not use weights
    train_gen = ((tens, tens) for (tens, _, weights) in train_gen)
    test_gen = ((tens, tens) for (tens, _, weights) in test_gen)

    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(datobj.chars, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(datobj.chars, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                  patience=3, min_lr=0.0001)

    model.autoencoder.fit_generator(
        train_gen, args.epoch_size,
        nb_epoch=args.epochs,
        callbacks=[checkpointer, reduce_lr],
        # FIX: integer floor division — a validation sample count must be an
        # int; the old true division produced a float in Python 3.
        nb_val_samples=args.epoch_size // test_divisor,
        validation_data=test_gen,
        pickle_safe=True,
    )
def main():
    """Interpolate between two molecules in latent space and print each step."""
    args = get_arguments()

    # The charset is stored alongside the processed data.
    if not os.path.isfile(args.data):
        raise ValueError("Data file %s doesn't exist" % args.data)
    h5f = h5py.File(args.data, 'r')
    charset = list(h5f['charset'][:])
    h5f.close()

    model = MoleculeVAE()
    if not os.path.isfile(args.model):
        raise ValueError("Model file %s doesn't exist" % args.model)
    model.load(charset, args.model, latent_rep_size=args.latent_dim)

    steps = interpolate(args.source, args.dest, args.steps,
                        charset, model, args.latent_dim, args.width)
    for entry in steps:
        # entry[1] (presumably the raw latent point — confirm) is not printed.
        print(entry[0], entry[2])
def main():
    """Dispatch a sampling run to the requested sub-model (autoencoder/encoder/decoder)."""
    args = get_arguments()
    model = MoleculeVAE()

    handlers = {
        'autoencoder': autoencoder,
        'encoder': encoder,
        'decoder': decoder,
    }
    handler = handlers.get(args.target)
    # Unknown targets fall through silently, matching the original if/elif chain.
    if handler is not None:
        handler(args, model)
def main():
    """Decode an interpolation path between two molecules and print it."""
    args = get_arguments()

    if not os.path.isfile(args.data):
        raise ValueError("Data file %s doesn't exist" % args.data)

    store = h5py.File(args.data, 'r')
    try:
        charset = list(store['charset'][:])
    finally:
        store.close()

    model = MoleculeVAE()
    if not os.path.isfile(args.model):
        raise ValueError("Model file %s doesn't exist" % args.model)
    model.load(charset, args.model, latent_rep_size=args.latent_dim)

    for step in interpolate(args.source, args.dest, args.steps,
                            charset, model, args.latent_dim, args.width):
        print(step[0], step[2])
def main():
    """Train (or resume training of) a MoleculeVAE on a preprocessed dataset."""
    args = get_arguments()
    np.random.seed(args.random_seed)

    # Deferred imports, kept local to main as in the original.
    from molecules.model import MoleculeVAE
    from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
        decode_smiles_from_indexes, load_dataset
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    data_train, data_test, charset = load_dataset(args.data)

    model = MoleculeVAE()
    if os.path.isfile(args.model):
        # Continue training from the saved weights.
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    callbacks = [
        ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001),
    ]

    model.autoencoder.fit(data_train, data_train,
                          shuffle=True,
                          epochs=args.epochs,
                          batch_size=args.batch_size,
                          callbacks=callbacks,
                          validation_data=(data_test, data_test))
def main():
    """Fit the molecule autoencoder, checkpointing the best weights to args.model."""
    args = get_arguments()
    data_train, data_test, charset = load_dataset(args.data)

    vae = MoleculeVAE()
    if os.path.isfile(args.model):
        vae.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        vae.create(charset, latent_rep_size=args.latent_dim)

    saver = ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True)
    lr_drop = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                patience=3, min_lr=0.0001)

    vae.autoencoder.fit(data_train, data_train,
                        shuffle=True,
                        epochs=args.epochs,
                        batch_size=args.batch_size,
                        callbacks=[saver, lr_drop],
                        validation_data=(data_test, data_test))
def main():
    """Encode the dataset into latent vectors, then dump or visualise them."""
    args = get_arguments()

    model = MoleculeVAE()
    data, data_test, charset = load_dataset(args.data)

    if not os.path.isfile(args.model):
        raise ValueError("Model file %s doesn't exist" % args.model)
    model.load(charset, args.model, latent_rep_size=args.latent_dim)

    x_latent = model.encoder.predict(data)
    if args.visualize:
        visualize_latent_rep(args, model, x_latent)
    elif args.save_h5:
        # Persist latent vectors plus the charset needed to decode them later.
        h5f = h5py.File(args.save_h5, 'w')
        h5f.create_dataset('charset', data=charset)
        h5f.create_dataset('latent_vectors', data=x_latent)
        h5f.close()
    else:
        # Default: tab-separated dump to stdout.
        np.savetxt(sys.stdout, x_latent, delimiter='\t')
def main():
    """Project molecules into latent space and write the vectors out."""
    args = get_arguments()
    model = MoleculeVAE()
    data, data_test, charset = load_dataset(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    latent = model.encoder.predict(data)

    # Early exit for the interactive visualisation path.
    if args.visualize:
        visualize_latent_rep(args, model, latent)
        return

    if args.save_h5:
        out = h5py.File(args.save_h5, 'w')
        out.create_dataset('charset', data=charset)
        out.create_dataset('latent_vectors', data=latent)
        out.close()
    else:
        np.savetxt(sys.stdout, latent, delimiter='\t')
def main():
    """Round-trip ibuprofen through the VAE, printing each intermediate stage."""
    args = get_arguments()
    data_train, data_test, tokens = load_dataset(args.data, "VAE")

    params = AttnParams()
    params.load(args.model + "params.pkl")

    model = MoleculeVAE(tokens, params)

    IBUPROFEN_SMILES = 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O'

    # Encode the one-hot SMILES into a latent sample.
    encoded = model.encode_sample.predict(
        tokens.onehotify(IBUPROFEN_SMILES, params["len_limit"]))
    print(encoded, np.shape(encoded))

    # Decode the latent point back into per-position token probabilities.
    decoded = model.decode.predict(encoded.reshape(1, params["latent_dim"]))
    print(decoded, np.shape(decoded))

    # Greedy argmax over the token axis gives the output id sequence.
    ids = decoded.argmax(axis=2)[0]
    print(ids, np.shape(ids))

    print("Final output", ''.join([tokens.id2t[s] for s in ids]))
def main():
    """Train a (Simple)MoleculeVAE, optionally pinning TensorFlow to N CPU cores,
    and pickle the training history to history.p."""
    args = get_arguments()
    np.random.seed(args.random_seed)

    from molecules.model import MoleculeVAE, SimpleMoleculeVAE
    from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \
        decode_smiles_from_indexes, load_dataset
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    # -1 means "use TF defaults"; otherwise restrict the CPU device count.
    if args.num_cores != -1:
        config = tf.ConfigProto(intra_op_parallelism_threads=1,
                                inter_op_parallelism_threads=1,
                                allow_soft_placement=True,
                                device_count={'CPU': args.num_cores})
        session = tf.Session(config=config)
        K.set_session(session)

    data_train, data_test, charset = load_dataset(args.data)

    model = SimpleMoleculeVAE() if args.simple else MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                  patience=3, min_lr=0.0001)

    # plot_model(model, to_file='model.png')
    history = model.autoencoder.fit(data_train, data_train,
                                    shuffle=True,
                                    epochs=args.epochs,
                                    batch_size=args.batch_size,
                                    callbacks=[checkpointer, reduce_lr],
                                    validation_data=(data_test, data_test))

    with open('history.p', 'wb') as f:
        cPickle.dump(history.history, f)
def main():
    """Train a MoleculeVAE from generator-streamed SMILES structures (Keras 2 API).

    Mirrors the fit_generator variant elsewhere in this file but uses the
    modern ``epochs`` keyword.
    """
    args = get_arguments()
    np.random.seed(args.random_seed)

    from molecules.model import MoleculeVAE
    from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

    data = pd.read_hdf(args.data, 'table')
    structures = data['structure']

    # import gzip
    # filepath = args.data
    # structures = [line.split()[0].strip() for line in gzip.open(filepath) if line]

    # can also use CanonicalSmilesDataGenerator
    datobj = SmilesDataGenerator(structures, MAX_LEN,
                                 test_split=args.test_split,
                                 random_seed=args.random_seed)
    # FIX: clamp the divisor to >= 1 — a test_split > 0.5 made
    # int((1 - split) / split) == 0 and crashed with ZeroDivisionError below.
    test_divisor = max(1, int((1 - datobj.test_split) / datobj.test_split))

    train_gen = datobj.train_generator(args.batch_size)
    test_gen = datobj.test_generator(args.batch_size)

    # reformulate generators to not use weights
    train_gen = ((tens, tens) for (tens, _, weights) in train_gen)
    test_gen = ((tens, tens) for (tens, _, weights) in test_gen)

    model = MoleculeVAE()
    if os.path.isfile(args.model):
        model.load(datobj.chars, args.model, latent_rep_size=args.latent_dim)
    else:
        model.create(datobj.chars, latent_rep_size=args.latent_dim)

    checkpointer = ModelCheckpoint(filepath=args.model, verbose=1, save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                  patience=3, min_lr=0.0001)

    model.autoencoder.fit_generator(
        train_gen, args.epoch_size,
        epochs=args.epochs,
        callbacks=[checkpointer, reduce_lr],
        validation_data=test_gen,
        # FIX: integer floor division — a validation sample count must be an
        # int; the old true division produced a float in Python 3.
        nb_val_samples=args.epoch_size // test_divisor,
        pickle_safe=True,
    )
# NOTE(review): flat smoke-test training script, collapsed onto a single line by
# whatever produced this file, and TRUNCATED — the final model.autoencoder.fit(
# call ends mid-argument-list, so the remaining fit() arguments are missing from
# this view. Left byte-identical; recover the missing tail before restyling.
# What it visibly does: seeds numpy, loads ./data/processed.h5, builds a fresh
# MoleculeVAE (latent size 128), sets up per-epoch checkpointing and LR
# reduction, shrinks data_train to a single sample, then starts fitting.
import h5py import numpy as np from molecules.model import MoleculeVAE from molecules.utils import one_hot_array, one_hot_index, from_one_hot_array, \ decode_smiles_from_indexes, load_dataset from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau NUM_EPOCHS = 100 BATCH_SIZE = 10 LATENT_DIM = 128 RANDOM_SEED = 123 np.random.seed(RANDOM_SEED) #args.random_seed) data_train, data_test, charset = load_dataset('./data/processed.h5') model = MoleculeVAE() #model.load(charset, args.model, latent_rep_size = args.latent_dim) model.create(charset, latent_rep_size=LATENT_DIM) checkpointer = ModelCheckpoint( filepath='./test_models/weights.{epoch:02d}-{val_loss:.2f}.hdf5', verbose=1, save_best_only=True) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001) data_train = data_train[:1] model.autoencoder.fit(data_train,
# NOTE(review): model-analysis entry point, collapsed onto two physical lines.
# Kept byte-identical (only these comments added): the control flow mixes model
# construction, cached latent generation, beam-search decoding demos, and
# property-distribution reporting, with order-dependent side effects — too
# intricate to restyle safely without running it.
# Visible behavior, in order:
#   1. Loads AttnParams from <model_path>/params.pkl and dumps them.
#   2. Picks data type "onehot" vs "cat" from the bottleneck/decoder params.
#   3. Builds a TriTransformer when the decoder name contains "TRANSFORMER",
#      otherwise a MoleculeVAE.
#   4. The latents.h5 caching branch is deliberately disabled: the condition
#      `"dothis" == "nothanks"` is always False, so the dead code never runs;
#      the latent_distributions() call below it is also commented out.
#   5. Decodes IBUPROFEN_SMILES with beam width 5 and prints the candidates.
#   6. With stderr suppressed, samples either from the prior (rand_mols) or via
#      property_distributions(), then prints validity / novelty / yield stats
#      and, for each rdkit property present in prop_labels, the test vs
#      generated mean ± std.
# NOTE(review): `output["num_valid"] / output["num_mols"]` presumably relies on
# Python 3 true division for the fraction — confirm before running under py2.
def main(): args = get_arguments() model_dir = args.model_path # Get Params model_params = AttnParams() model_params.load(model_dir + "params.pkl") print("Analysing model", model_dir) model_params.dump() # Get data d_file = model_params["data"] if model_params["bottleneck"] == "conv" or model_params["decoder"] == "VAE": d_type = "onehot" else: d_type = "cat" data_train, data_test, props_train, props_test, tokens = load_dataset(d_file, d_type, False) props_train, props_test, prop_labels = load_properties(d_file) if "TRANSFORMER" in model_params["decoder"]: # Model is an attention based model model = TriTransformer(tokens, model_params) model.build_models() model.compile_vae(Adam(0.001, 0.9, 0.98, epsilon=1e-9)) else: # Model is GRU model = MoleculeVAE(tokens, model_params) # Assess how close each dimension is to a Gaussian # Try to load property training data if not exists(model_dir + "latents.h5") and "dothis" == "nothanks": print("Generating latent representations from auto-encoder") z_train = model.encode_sample.predict([data_train], 64) z_test = model.encode_sample.predict([data_test], 64) with h5py.File(model_dir + "latents.h5", 'w') as dfile: dfile.create_dataset('z_test', data=z_test) dfile.create_dataset('z_train', data=z_train) print("KURTOSIS:") # latent_distributions(model_dir + 'latents.h5', plot_kd=True) # Test random molecule print("Example decodings with ibruprofen (beam width = 5):") print("\tIbuprofen smiles:\t{}".format(IBUPROFEN_SMILES)) s = model.decode_from_string(IBUPROFEN_SMILES, beam_width=5) [print("\t\tDecoding {}:\t\t{}".format(i + 1, seq[0])) for (i, seq) in enumerate(s)] print("Exploring property distributions of chemicals from {} decoding(s) of {} random seed(s):".format( args.n_decodings, args.n_seeds)) with supress_stderr(): if args.prior_sample: output = rand_mols(args.n_seeds, model_params["latent_dim"], model, args.beam_width) else: output = property_distributions(data_test, props_test, num_seeds=args.n_seeds, 
num_decodings=args.n_decodings, model=model, beam_width=args.beam_width, data_file=None) # , print("Generated {} molecules, of which {} were valid.".format(output["num_mols"], output["num_valid"])) print("\tValid mols:\t {:.2f}".format(output["num_valid"] / output["num_mols"])) if "num_novel" in output: print("\tNovel mols:\t{:.2f}".format(output["num_novel"])) print("\tSuccess frac:\t{:.2f}".format(output["success_frac"])) print("\tYield:\t{:.2f}".format(output["yield"])) for (i, key) in enumerate(rdkit_funcs): if key in prop_labels: k = prop_labels.index(key) print("\t{}:".format(key)) dat = props_test[:, k] print("\t\tTest distribution:\t {:.2f} ± {:.2f}".format(np.mean(dat), np.std(dat))) gen_dat = output["gen_props"][:, i] print("\t\tGenerated distribution:\t {:.2f} ± {:.2f}".format(np.mean(gen_dat), np.std(gen_dat)))