def main(model, config):
    set_seed(config.seed)
    device = torch.device(config.device)

    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    train_data = read_smiles_csv(config.train_load)
    if config.val_load:
        val_data = read_smiles_csv(config.val_load)
    else:
        val_data = None

    trainer = MODELS.get_model_trainer(model)(config)

    if config.vocab_load is not None:
        assert os.path.exists(config.vocab_load), \
            'vocab_load path does not exist!'
        vocab = torch.load(config.vocab_load)
    else:
        vocab = trainer.get_vocabulary(train_data)

    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = MODELS.get_model_class(model)(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)

def main(config, print_metrics=True):
    test = None
    test_scaffolds = None
    ptest = None
    ptest_scaffolds = None
    train = None
    if config.test_path:
        test = read_smiles_csv(config.test_path)
    if config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(config.test_scaffolds_path)
    if config.train_path is not None:
        train = read_smiles_csv(config.train_path)
    if config.ptest_path is not None:
        ptest = np.load(config.ptest_path, allow_pickle=True)['stats'].item()
    if config.ptest_scaffolds_path is not None:
        ptest_scaffolds = np.load(
            config.ptest_scaffolds_path, allow_pickle=True)['stats'].item()
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(gen=gen, k=config.ks, n_jobs=config.n_jobs,
                              device=config.device,
                              test_scaffolds=test_scaffolds,
                              ptest=ptest, ptest_scaffolds=ptest_scaffolds,
                              test=test, train=train)

    if print_metrics:
        for name, value in metrics.items():
            print('{},{}'.format(name, value))
    else:
        return metrics

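# A minimal invocation sketch (an assumption, not part of the original script).
# The attribute names mirror the config fields read above; the paths and the
# list of k values are placeholders.
#
#   from argparse import Namespace
#   cfg = Namespace(test_path='test.csv', test_scaffolds_path=None,
#                   train_path=None, ptest_path=None, ptest_scaffolds_path=None,
#                   gen_path='generated.csv', ks=[1000, 10000],
#                   n_jobs=1, device='cpu')
#   metrics = main(cfg, print_metrics=False)
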
def main(config, print_metrics=True):
    test = read_smiles_csv(config.test_path)
    test_scaffolds = None
    ptest = None
    ptest_scaffolds = None
    if config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(config.test_scaffolds_path)
    if config.ptest_path is not None:
        if not os.path.exists(config.ptest_path):
            warnings.warn(f'{config.ptest_path} does not exist')
            ptest = None
        else:
            ptest = np.load(config.ptest_path)['stats'].item()
    if config.ptest_scaffolds_path is not None:
        if not os.path.exists(config.ptest_scaffolds_path):
            warnings.warn(f'{config.ptest_scaffolds_path} does not exist')
            ptest_scaffolds = None
        else:
            ptest_scaffolds = np.load(
                config.ptest_scaffolds_path)['stats'].item()
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(test, gen,
                              k=config.ks,
                              n_jobs=config.n_jobs,
                              device=config.device,
                              test_scaffolds=test_scaffolds,
                              ptest=ptest,
                              ptest_scaffolds=ptest_scaffolds)

    if print_metrics:
        for name, value in metrics.items():
            print('{},{}'.format(name, value))
    else:
        return metrics

def main(config, print_metrics=True):
    ref = read_smiles_csv(config.ref_path)
    gen = read_smiles_csv(config.gen_path)
    metrics = get_all_metrics(ref, gen, k=config.ks, n_jobs=config.n_jobs,
                              gpu=config.device_code)

    if print_metrics:
        print('Metrics:')
        for name, value in metrics.items():
            print('\t' + name + ' = {}'.format(value))
    else:
        return metrics

def main(config):
    data = read_smiles_csv(config.train_load)
    trainer = JTreeTrainer(config)
    # Build the vocabulary from the training SMILES
    vocab = trainer.get_vocabulary(data)
    torch.save(vocab, config.vocab_save)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    if config.conditional_model:
        labels = read_label_csv(config.train_load)
        config.labels_size = len(labels[0])
        labels = [[int(x) for x in list(l)] for l in labels]
        train_data = [(x, y) for (x, y) in zip(train, labels)]
    else:
        train_data = [x for x in train]
    shuffle(train_data)
    train_data = train_data[:500000]

    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config)
    model = model.to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train_data)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)

def main(config):
    set_seed(config.seed)
    device = torch.device(config.device)

    data = read_smiles_csv(config.train_load)

    vocab = None
    if config.vocab_save is not None and os.path.exists(config.vocab_save):
        vocab = torch.load(config.vocab_save)

    corpus = JTreeCorpus(config.n_batch, device).fit(dataset=data,
                                                     vocabulary=vocab,
                                                     n_jobs=config.n_jobs)
    torch.save(corpus.vocab, config.vocab_save)
    train_dataloader = corpus.transform(data, num_workers=config.n_jobs)

    model = JTNNVAE(corpus.vocab, config.hidden, config.latent, config.depth)
    model = model.to(device=device)

    # Xavier initialization for weight matrices, zeros for 1-D (bias) parameters
    for param in model.parameters():
        if param.dim() == 1:
            nn.init.constant_(param, 0)
        else:
            nn.init.xavier_normal_(param)

    trainer = JTreeTrainer(config)
    trainer.fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    torch.save(vocab, config.vocab_save)
    torch.save(config, config.config_save)

    device = torch.device(config.device)

    # conditional mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device)
               for f in fps]
        # fingerprints length
        fps_len = len(fps[0])
    else:
        fps = None
        fps_len = 0

    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, fps_len, reward_func)
        model = model.to(device)

        trainer = ORGANTrainer(config)
        trainer.fit(model, train, fps)

    torch.save(model.state_dict(), config.model_save)

def main(model, config):
    set_seed(config.seed)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    if config.lbann_weights_dir:
        assert os.path.exists(config.lbann_weights_dir), (
            "LBANN inference mode is specified but directory "
            "to load weights does not exist: '{}'".format(
                config.lbann_weights_dir))

    model_config = torch.load(config.config_load)
    trainer = MODELS.get_model_trainer(model)(model_config)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)

    model = MODELS.get_model_class(model)(model_vocab, model_config)
    if config.lbann_weights_dir and os.path.exists(config.lbann_weights_dir):
        model.load_lbann_weights(config.lbann_weights_dir,
                                 config.lbann_epoch_counts)
    else:
        # assume that a non-LBANN model is being loaded
        model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if config.save_reconstruction:
        test_data = read_smiles_csv(config.test_path)
        print("Reconstructing", len(test_data), "of", config.test_path,
              "test samples")
        test_loader = trainer.get_dataloader(model, test_data, shuffle=False)
        tqdm_data = tqdm(test_loader, desc='Reconstruction')
        model.reconstruct(tqdm_data, config.pred_save)
        print("Reconstructed samples of", config.test_path,
              "saved to", config.pred_save)

    samples = []
    n = config.n_samples
    print("Generating Samples")
    with tqdm(total=config.n_samples, desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(min(n, config.n_batch),
                                           config.max_len)
            samples.extend(current_samples)
            n -= len(current_samples)
            T.update(len(current_samples))

    samples = pd.DataFrame(samples, columns=['SMILES'])
    print("Save generated samples to", config.gen_save)
    samples.to_csv(config.gen_save, index=False)

def main(config):
    data = read_smiles_csv(config.train_load)

    clusters = set()
    for smiles in tqdm.tqdm(data):
        mol = MolTree(smiles)
        for c in mol.nodes:
            clusters.add(c.smiles)

    vocab = JTreeVocab(sorted(list(clusters)))
    torch.save(vocab, config.vocab_save)

def eval_metrics(eval_config, print_metrics=True):
    # need to detect if file has the header or not
    test = read_smiles_csv(eval_config.test_path)
    test_scaffolds = None
    ptest = None
    ptest_scaffolds = None
    if eval_config.test_scaffolds_path is not None:
        test_scaffolds = read_smiles_csv(eval_config.test_scaffolds_path)
    if eval_config.ptest_path is not None:
        if not os.path.exists(eval_config.ptest_path):
            warnings.warn(f"{eval_config.ptest_path} does not exist")
            ptest = None
        else:
            ptest = np.load(eval_config.ptest_path)["stats"].item()
    if eval_config.ptest_scaffolds_path is not None:
        if not os.path.exists(eval_config.ptest_scaffolds_path):
            warnings.warn(f"{eval_config.ptest_scaffolds_path} does not exist")
            ptest_scaffolds = None
        else:
            ptest_scaffolds = np.load(
                eval_config.ptest_scaffolds_path)["stats"].item()
    gen = read_smiles_csv(eval_config.gen_save)
    metrics = get_all_metrics(
        test,
        gen,
        k=eval_config.ks,
        n_jobs=eval_config.n_jobs,
        gpu=eval_config.gpu,
        test_scaffolds=test_scaffolds,
        ptest=ptest,
        ptest_scaffolds=ptest_scaffolds,
    )

    if print_metrics:
        print("Metrics:")
        for name, value in metrics.items():
            print("\t" + name + " = {}".format(value))
    return metrics

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    corpus = OneHotCorpus(config.batch, device)
    train_dataloader = corpus.fit(train).transform(train)

    model = CharRNN(corpus.vocab, config.hidden, config.num_layers,
                    config.dropout, device).to(device)

    trainer = CharRNNTrainer(config)
    trainer.fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)

    model = AAE(vocab, config)
    model = model.to(device)

    trainer = AAETrainer(config)
    trainer.fit(model, train)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)

    device = torch.device(config.device)

    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, reward_func)
        model = model.to(device)

        trainer = ORGANTrainer(config)
        trainer.fit(model, train)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    train = corpus.fit(train).transform(train)

    # conditional mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device)
               for f in fps]
        # fingerprints length
        fps_len = len(fps[0])
        # fingerprints dataloader
        fps = corpus.fps_transform(fps)
        # training data: pair each sample with its fingerprint
        # (zip returns an iterator, so materialize it before shuffling)
        train = list(zip(train, fps))
        shuffle(train)
    else:
        fps_len = 0

    model = VAE(corpus.vocab, fps_len, config).to(device)

    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)

    trainer.fit(model, train, config.conditional)

def main(config):
    set_seed(config.seed)

    train = read_smiles_csv(config.train_load)

    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    train = corpus.fit(train).transform(train)

    model = VAE(corpus.vocab, config).to(device)

    trainer = VAETrainer(config)

    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)

    trainer.fit(model, train)

print("Validity % ", (is_valid.count(1)/data_size)*100) print("Same % ", (is_same.count(1)/data_size)*100) valid_tani_dist = [ t for t in tani_dist if t >= 0 ] print("Average tanimoto ", np.mean(np.array(valid_tani_dist))) if output_file is not None: output_columns = ['original', 'decoded', 'is_valid', 'is_same', 'smile_accuracy','tanimoto_distance','total_avg_accuracy'] res_df.to_csv(output_file, index=False, columns=output_columns) return(res_df) fdir = sys.argv[1] #directory of LBANN tensor outputs sd = sys.argv[2] #tag for different noise pertubation values vocab_file = sys.argv[3] #vocab file from PyTorch MOSES library sequence_length = 102 #Max sequence lenght use in LBANN training (100+bos+eos) zdim = 512 #latent space dimension batch_num = 0 #use to control loading different batches of dump (default 0) get_smiles_from_lbann_tensors(fdir,sequence_length, zdim,vocab_file) orig_file = read_smiles_csv(fdir+"gt_batch"+str(batch_num)+"smiles.txt") pred_file = read_smiles_csv(fdir+"pred_batch"+str(batch_num)+"smiles.txt") diff_file = fdir+"sd"+sd+"_smiles_metrics.csv" print("Input/pred SMILES file sizes ", len(orig_file), " ", len(pred_file)) compare_decoded_to_original_smiles(orig_file, pred_file, diff_file) print("Input/pred SMILES diff file saved to", diff_file)