def main(model, config):
    """Restore a trained style-transfer model and sample molecules per activity.

    `model` is the registry key of the model class; `config` supplies the
    checkpoint path, target name and sampling sizes.  Results are pickled
    to ./eval/results/<target>_result.pkl.
    """
    set_seed(2048)  # NOTE(review): seed is hard-coded instead of config.seed — confirm intent
    model_config = torch.load('./temp')  # NOTE(review): hard-coded config checkpoint path — confirm
    model_state = torch.load(config.model_load)
    model_vocab = None  # this model class is constructed without a vocabulary here
    model = MODELS.get_model_class(model)(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.cuda()
    model.eval()
    model.model_loaded = True

    _, smi2vec = load_model()
    # Bug fix: use context managers so the pickle file handles are closed
    # (the original `pickle.load(open(...))` leaked file descriptors).
    with open(f'./data/content_test_{config.target}.pkl', 'rb') as fh:
        content_test = pickle.load(fh)  # [:100]
    with open(f'./data/style_instance_test_{config.target}.pkl', 'rb') as fh:
        style_instance = pickle.load(fh)  # [:30000]

    latent_content_test = model.heteroencoder.encode(smi2vec(content_test))
    latent_style_instance = model.heteroencoder.encode(smi2vec(style_instance))
    print(latent_content_test.shape, latent_style_instance.shape)

    samples = model.sample_per_act(latent_content_test,
                                   latent_style_instance,
                                   num_per_act=config.num_per_act,
                                   n_ins=config.n_ins)
    os.makedirs('./eval/results', exist_ok=True)
    # Bug fix: close the output file so the pickle is fully flushed to disk.
    with open(f'./eval/results/{config.target}_result.pkl', 'wb') as fh:
        pickle.dump(samples, fh)
def main(config):
    """Generate SMILES with a trained VAE and write them to a CSV file."""
    set_seed(config.seed)
    cfg = torch.load(config.config_load)
    vocab = torch.load(config.vocab_load)
    state = torch.load(config.model_load)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    model = VAE(vocab, cfg)
    model.load_state_dict(state)
    model = model.to(device)
    model.eval()

    generated = []
    remaining = config.n_samples
    progress = tqdm.tqdm(range(config.n_samples), desc='Generating mols')
    while remaining > 0:
        ids = model.sample(min(remaining, config.n_batch), config.max_len)[-1]
        batch = [vocab.ids2string(row.tolist()) for row in ids]
        remaining -= len(batch)
        progress.update(len(batch))
        progress.refresh()
        generated.extend(batch)

    pd.DataFrame(generated, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
def main(model, config):
    """Train a style-transfer model on pre-pickled content/style data.

    Loads pickled training content and style instances for the configured
    target, optionally restores weights, and delegates to the trainer.
    """
    os.makedirs(config.model_save, exist_ok=True)
    set_seed(config.seed)
    device = torch.device(config.device)
    if config.config_save is not None:
        torch.save(config, config.config_save)
    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # Bug fix: use context managers so the pickle file handles are closed
    # (the original `pickle.load(open(...))` leaked file descriptors).
    with open(f'./data/content_train_{config.target}.pkl', 'rb') as fh:
        content_train = pickle.load(fh)
    with open(f'./data/style_instance_train_{config.target}.pkl', 'rb') as fh:
        style_instance = pickle.load(fh)

    trainer = MODELS.get_model_trainer(model)(config)
    vocab = None  # this model family is constructed without an explicit vocabulary
    model = MODELS.get_model_class(model)(vocab, config)
    if config.model_load is not None:
        print(f'load model from {config.model_load}')
        model_state = torch.load(config.model_load)
        model.load_state_dict(model_state)
    model = model.to(device)
    trainer.fit(model,
                content_train=content_train,
                style_instance=style_instance)
def main(config):
    """Train an AAE (optionally conditional) on SMILES read from CSV."""
    set_seed(config.seed)
    train = read_smiles_csv(config.train_load)

    if config.conditional_model:
        labels = read_label_csv(config.train_load)
        config.labels_size = len(labels[0])
        # Each label is a string of digits; expand it to a list of ints.
        labels = [[int(ch) for ch in label] for label in labels]
        train_data = list(zip(train, labels))
    else:
        train_data = list(train)

    shuffle(train_data)
    train_data = train_data[:500000]  # cap the training-set size

    vocab = CharVocab.from_data(train)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)
    model = AAE(vocab, config).to(device)
    trainer = AAETrainer(config)
    trainer.fit(model, train_data)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
def main(model, config):
    """Generic training entry point: build trainer, vocab and model, then fit."""
    set_seed(config.seed)
    device = torch.device(config.device)
    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    train_data = read_smiles_csv(config.train_load)
    val_data = read_smiles_csv(config.val_load) if config.val_load else None

    trainer = MODELS.get_model_trainer(model)(config)

    # Either reuse a saved vocabulary or derive one from the training data.
    if config.vocab_load is not None:
        assert os.path.exists(config.vocab_load), \
            'vocab_load path does not exist!'
        vocab = torch.load(config.vocab_load)
    else:
        vocab = trainer.get_vocabulary(train_data)
    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = MODELS.get_model_class(model)(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
def main(config):
    """Sample SMILES from a trained AAE and save them as CSV."""
    set_seed(config.seed)
    cfg = torch.load(config.config_load)
    vocab = torch.load(config.vocab_load)
    state = torch.load(config.model_load)
    device = torch.device(config.device)

    model = AAE(vocab, cfg)
    model.load_state_dict(state)
    model = model.to(device)
    model.eval()

    collected = []
    remaining = config.n_samples
    with tqdm.tqdm(total=config.n_samples, desc='Generating samples') as T:
        while remaining > 0:
            batch = model.sample(min(remaining, config.n_batch),
                                 config.max_len)
            collected.extend(batch)
            remaining -= len(batch)
            T.update(len(batch))

    pd.DataFrame(collected, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
def main(model, config):
    """Sample SMILES from a trained model selected via the MODELS registry."""
    set_seed(config.seed)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    cfg = torch.load(config.config_load)
    vocab = torch.load(config.vocab_load)
    state = torch.load(config.model_load)

    model = MODELS.get_model_class(model)(vocab, cfg)
    model.load_state_dict(state)
    model = model.to(device)
    model.eval()

    collected = []
    remaining = config.n_samples
    with tqdm(total=config.n_samples, desc='Generating samples') as T:
        while remaining > 0:
            batch = model.sample(min(remaining, config.n_batch),
                                 config.max_len)
            collected.extend(batch)
            remaining -= len(batch)
            T.update(len(batch))

    pd.DataFrame(collected, columns=['SMILES']).to_csv(config.gen_save,
                                                       index=False)
def main(config):
    """Sample SMILES from a trained CharRNN and save them as CSV."""
    set_seed(config.seed)
    vocab = torch.load(config.vocab_load)
    cfg = torch.load(config.config_load)
    state = torch.load(config.model_load)
    device = torch.device(config.device)

    model = CharRNN(vocab, cfg.hidden, cfg.num_layers, cfg.dropout, device)
    model.load_state_dict(state)
    model = model.to(device=device)
    model.eval()

    gen_smiles = []
    # TODO: n_samples % batch = 0
    for _ in tqdm.tqdm(range(config.n_samples // config.n_batch)):
        for token_ids in model.sample_smiles(config.max_len, config.n_batch):
            gen_smiles.append(
                vocab.ids2string([tok.item() for tok in token_ids]))

    pd.DataFrame(gen_smiles, columns=['SMILES']).to_csv(config.gen_save,
                                                        index=False)
def main(config):
    """Encode DeepAffinity SMILES into VAE latent vectors and save as CSV."""
    set_seed(config.seed)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # Hardcode
    ckpt_dir = pjoin(BASE_DIR, 'checkpoints', 'vae', 'vae_1')
    model_config = torch.load(pjoin(ckpt_dir, 'vae_config.pt'))
    model_vocab = torch.load(pjoin(ckpt_dir, 'vae_vocab.pt'))
    model_state = torch.load(pjoin(ckpt_dir, 'vae_model.pt'))

    model = VAEEncode(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    frame = pd.read_csv(pjoin('/mol_data', 'DeepAffinity', 'merged_data.tsv'),
                        delimiter='\t')
    smiles = frame['Canonical SMILE'].values

    embeds = []
    for smi in tqdm(smiles, desc='Running VAE encoder'):
        tensor = model.string2tensor(smi)
        embeds.append(
            model.forward_encoder_no_noise([tensor]).cpu().numpy()[0])

    pd.DataFrame(embeds).to_csv(pjoin(ckpt_dir, 'embeds.csv'), index=False)
def main(config):
    """Train a junction-tree VAE on SMILES data."""
    set_seed(config.seed)
    device = torch.device(config.device)
    data = read_smiles_csv(config.train_load)

    # Reuse a previously saved vocabulary when one already exists on disk.
    vocab = None
    if config.vocab_save is not None and os.path.exists(config.vocab_save):
        vocab = torch.load(config.vocab_save)

    corpus = JTreeCorpus(config.n_batch, device).fit(dataset=data,
                                                     vocabulary=vocab,
                                                     n_jobs=config.n_jobs)
    torch.save(corpus.vocab, config.vocab_save)
    train_dataloader = corpus.transform(data, num_workers=config.n_jobs)

    model = JTNNVAE(corpus.vocab, config.hidden, config.latent, config.depth)
    model = model.to(device=device)
    # Xavier-initialise the weight matrices; zero the bias vectors.
    for param in model.parameters():
        if param.dim() == 1:
            nn.init.constant_(param, 0)
        else:
            nn.init.xavier_normal_(param)

    trainer = JTreeTrainer(config)
    trainer.fit(model, train_dataloader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
def main(config):
    """Train an ORGAN model, optionally conditioned on fingerprints."""
    set_seed(config.seed)
    train = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(train)
    torch.save(vocab, config.vocab_save)
    torch.save(config, config.config_save)
    device = torch.device(config.device)

    # condition mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device)
               for f in fps]
        # fingerprints length
        fps_len = len(fps[0])
    else:
        fps = None
        fps_len = 0

    with Pool(config.n_jobs) as pool:
        reward_func = MetricsReward(train, config.n_ref_subsample,
                                    config.rollouts, pool,
                                    config.addition_rewards)
        model = ORGAN(vocab, config, fps_len, reward_func)
        model = model.to(device)
        trainer = ORGANTrainer(config)
        trainer.fit(model, train, fps)

    torch.save(model.state_dict(), config.model_save)
def main(model, config):
    """Load a trained model (PyTorch state dict or LBANN weights), optionally
    reconstruct a test set, then sample molecules and save them to CSV.
    """
    set_seed(config.seed)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    if config.lbann_weights_dir:
        assert os.path.exists(config.lbann_weights_dir), (
            "LBANN inference mode is specified but directory "
            " to load weights does not exist: '{}'".format(
                config.lbann_weights_dir))

    model_config = torch.load(config.config_load)
    trainer = MODELS.get_model_trainer(model)(model_config)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)
    model = MODELS.get_model_class(model)(model_vocab, model_config)

    # Bug fix: the original called os.path.exists(config.lbann_weights_dir)
    # unconditionally, which raises TypeError when the option is None.  The
    # assert above already guarantees the directory exists when it is set.
    if config.lbann_weights_dir:
        model.load_lbann_weights(config.lbann_weights_dir,
                                 config.lbann_epoch_counts)
    else:
        # assume that a non-LBANN model is being loaded
        model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if config.save_reconstruction:
        test_data = read_smiles_csv(config.test_path)
        print("Reconstructing ", len(test_data), " of ", config.test_path,
              " test samples")
        test_loader = trainer.get_dataloader(model, test_data, shuffle=False)
        tqdm_data = tqdm(test_loader, desc='Reconstruction')
        model.reconstruct(tqdm_data, config.pred_save)
        print("Reconstructed samples of ", config.test_path, " saved to ",
              config.pred_save)

    samples = []
    n = config.n_samples
    print("Generating Samples")
    with tqdm(total=config.n_samples, desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(min(n, config.n_batch),
                                           config.max_len)
            samples.extend(current_samples)
            n -= len(current_samples)
            T.update(len(current_samples))

    samples = pd.DataFrame(samples, columns=['SMILES'])
    print("Save generated samples to ", config.gen_save)
    samples.to_csv(config.gen_save, index=False)
def main(config):
    """Run the GuacaMol distribution-learning benchmark for ORGAN."""
    setup_default_logger()
    set_seed(config.seed)
    # Consistency fix: default the output directory the same way the VAE
    # benchmark script does, instead of crashing in os.path.join on None.
    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))
    generator = OrganGenerator(config)
    json_file_path = os.path.join(config.output_dir,
                                  'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=json_file_path,
                                 benchmark_version=config.suite)
def main(config):
    """Run the GuacaMol distribution-learning benchmark for the VAE."""
    setup_default_logger()
    set_seed(config.seed)

    # Fall back to this script's directory when no output dir was given.
    if config.output_dir is None:
        config.output_dir = os.path.dirname(os.path.realpath(__file__))

    generator = VaeGenerator(config)
    result_path = os.path.join(config.output_dir,
                               'distribution_learning_results.json')
    assess_distribution_learning(generator,
                                 chembl_training_file=config.dist_file,
                                 json_output_file=result_path,
                                 benchmark_version=config.suite)
def main(config):
    """Train a CharRNN language model on SMILES strings."""
    set_seed(config.seed)
    smiles = read_smiles_csv(config.train_load)
    device = torch.device(config.device)

    corpus = OneHotCorpus(config.batch, device)
    loader = corpus.fit(smiles).transform(smiles)

    model = CharRNN(corpus.vocab, config.hidden, config.num_layers,
                    config.dropout, device).to(device)
    CharRNNTrainer(config).fit(model, loader)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
def main(config):
    """Train an unconditional AAE on SMILES strings."""
    set_seed(config.seed)
    smiles = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(smiles)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)

    device = torch.device(config.device)
    model = AAE(vocab, config).to(device)
    AAETrainer(config).fit(model, smiles)

    model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
def main(config):
    """Sample SMILES from a trained (optionally conditional) AAE model.

    In conditional mode, per-label fingerprints are read from
    `config.label_load` and every fingerprint gets `n_samples` draws.
    """
    set_seed(config.seed)
    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)
    device = torch.device(config.device)

    model = AAE(model_vocab, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    if model_config.conditional_model:
        # NOTE(review): `squeeze=True` was removed in pandas >= 2.0; use
        # `.squeeze("columns")` when upgrading.
        test = pd.read_csv(config.label_load,
                           usecols=['fingerprints_center'],
                           squeeze=True).astype(str).tolist()
        labels = [[int(x) for x in list(t)] for t in test]
        labels = np.array(labels)
        # Bug fix: the original used `.cuda()`, silently ignoring the
        # configured device; move the labels to the same device as the model.
        labels = torch.FloatTensor(labels).to(device)
    else:
        labels = None

    samples = []
    n = config.n_samples
    n_labels = config.n_labels
    with tqdm.tqdm(total=config.n_samples * n_labels,
                   desc='Generating samples') as T:
        while n > 0:
            current_samples = model.sample(n_labels, config.max_len, labels)
            samples.append(current_samples)
            n -= 1
            T.update(n_labels)

    samples = np.transpose(np.array(samples)).tolist()
    # Bug fix: write through a context manager so the file is closed (and
    # flushed) even if an exception occurs mid-write.
    with open(config.gen_save, 'w') as output:
        output.write('SMILES\n')
        for i in range(len(samples)):
            for j in range(len(samples[0])):
                output.write('{0}\n'.format(samples[i][j]))
def main(config):
    """Train an ORGAN model with metric-based rewards."""
    set_seed(config.seed)
    smiles = read_smiles_csv(config.train_load)
    vocab = CharVocab.from_data(smiles)
    device = torch.device(config.device)

    with Pool(config.n_jobs) as pool:
        reward = MetricsReward(smiles, config.n_ref_subsample,
                               config.rollouts, pool,
                               config.addition_rewards)
        model = ORGAN(vocab, config, reward).to(device)
        ORGANTrainer(config).fit(model, smiles)

    torch.save(model.state_dict(), config.model_save)
    torch.save(config, config.config_save)
    torch.save(vocab, config.vocab_save)
def main(config):
    """Train a VAE on SMILES data, optionally conditioned on fingerprints."""
    set_seed(config.seed)
    train = read_smiles_csv(config.train_load)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    train = corpus.fit(train).transform(train)

    # condition mode
    if config.conditional:
        fps = read_fps_csv(config.train_load)
        fps = fps_to_list(fps)
        fps = [torch.tensor(f, dtype=torch.float, device=device)
               for f in fps]
        # fingerprints length
        fps_len = len(fps[0])
        # fingerprints dataloader
        fps = corpus.fps_transform(fps)
        # training data
        # Bug fix: in Python 3 zip() returns an iterator, and
        # random.shuffle requires a mutable sequence, so the original
        # `shuffle(zip(...))` raised TypeError.  Materialize the pairs.
        train = list(zip(train, fps))
        shuffle(train)
    else:
        fps_len = 0

    model = VAE(corpus.vocab, fps_len, config).to(device)
    trainer = VAETrainer(config)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, train, config.conditional)
def main(config):
    """Train an unconditional VAE on SMILES read from a plain file."""
    set_seed(config.seed)
    smiles = read_smiles(config.train_load)
    device = torch.device(config.device)

    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    corpus = OneHotCorpus(config.n_batch, device)
    loader = corpus.fit(smiles).transform(smiles)

    model = VAE(corpus.vocab, config).to(device)
    trainer = VAETrainer(config)
    torch.save(config, config.config_save)
    torch.save(corpus.vocab, config.vocab_save)
    trainer.fit(model, loader)
def main(config):
    """Sample molecules from a trained junction-tree VAE prior."""
    set_seed(config.seed)
    vocab = torch.load(config.vocab_load)
    cfg = torch.load(config.config_load)
    state = torch.load(config.model_load)
    device = torch.device(config.device)

    model = JTNNVAE(vocab, cfg.hidden, cfg.latent, cfg.depth)
    model.load_state_dict(state)
    model = model.to(device=device)
    model.eval()

    gen_smiles = [model.sample_prior(prob_decode=True)
                  for _ in tqdm.trange(config.n_samples)]

    pd.DataFrame(gen_smiles, columns=['SMILES']).to_csv(config.gen_save,
                                                        index=False)
def main(config):
    """Train the binding-affinity model on DeepAffinity embeddings."""
    set_seed(config.seed)
    device = torch.device(config.device)
    if config.config_save is not None:
        torch.save(config, config.config_save)

    # For CUDNN to work properly
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    aff_data = pd.read_csv(pjoin('/mol_data', 'DeepAffinity',
                                 'merged_data.tsv'),
                           delimiter='\t')
    print(aff_data)
    embed_data = pd.read_csv(pjoin('/mol_data', 'embeds.csv'),
                             delimiter='\t')

    affinity_cols = ['pEC50_[M]', 'pIC50_[M]', 'pKd_[M]', 'pKi_[M]']
    # First 10k rows are held out for validation; the rest is for training.
    train_data = (embed_data.values[10000:],
                  aff_data['Sequence'].values[10000:],
                  aff_data[affinity_cols].values[10000:])
    val_data = (embed_data.values[:10000],
                aff_data['Sequence'].values[:10000],
                aff_data[affinity_cols].values[:10000])

    trainer = BindingTrainer(config)
    vocab = trainer.get_vocabulary(train_data[1])
    if config.vocab_save is not None:
        torch.save(vocab, config.vocab_save)

    model = Binding(vocab, config).to(device)
    trainer.fit(model, train_data, val_data)

    model = model.to('cpu')
    torch.save(model.state_dict(), config.model_save)
def main(config):
    """Sample from a trained ORGAN, optionally conditioned on target
    fingerprints, and (in conditional mode) score the output by Tanimoto
    similarity against the condition set.
    """
    set_seed(config.seed)
    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)
    device = torch.device(config.device)

    # condition mode
    if config.conditional:
        print('Conditional generation')
        # target fingerprints
        fps_center = read_fps_csv(config.condition_load)
        fps_center = fps_to_list(list(set(fps_center)))
        fps_center = [torch.tensor(f, dtype=torch.float, device=device)
                      for f in fps_center]
        # target fingerprints length
        fps_len = len(fps_center[0])
        # target fingerprints number
        fps_num = len(fps_center)
    else:
        fps_center = [None]
        fps_len = 0
        fps_num = 1

    model = ORGAN(model_vocab, model_config, fps_len)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    # sample numbers
    gen_samples = []
    with tqdm.tqdm(total=config.n_samples * fps_num,
                   desc='Generating samples') as T:
        for i in range(fps_num):
            # Bug fix: reset the sample budget for every fingerprint.  The
            # original initialised `n` once before this loop, so every
            # target after the first produced zero samples.
            n = config.n_samples
            samples = []
            while n > 0:
                fps = fps_center[i]
                if config.conditional:
                    fps = fps_center[i].unsqueeze(0)
                current_samples = model.sample(fps, config.conditional,
                                               min(n, config.n_batch),
                                               config.max_len)
                samples.extend(current_samples)
                n -= len(current_samples)
                T.update(len(current_samples))
            gen_samples.extend(samples)

    df = pd.DataFrame(gen_samples, columns=['SMILES'])
    df.to_csv(config.gen_save, index=False)
    # tanimoto similarity score and summary
    if config.conditional:
        calculate_score(config.gen_save, config.condition_load,
                        config.n_samples)
def main(config):
    """Sample from a trained (optionally fingerprint-conditional) VAE and,
    in conditional mode, score the output by Tanimoto similarity."""
    set_seed(config.seed)
    model_config = torch.load(config.config_load)
    model_vocab = torch.load(config.vocab_load)
    model_state = torch.load(config.model_load)
    device = torch.device(config.device)
    # For CUDNN to work properly:
    if device.type.startswith('cuda'):
        torch.cuda.set_device(device.index or 0)

    # condition mode
    if config.conditional:
        # target fingerprints
        fps_center = read_fps_csv(config.condition_load)
        fps_center = fps_to_list(list(set(fps_center)))
        fps_center = [torch.tensor(f, dtype=torch.float, device=device)
                      for f in fps_center]
        # target fingerprints length
        fps_len = len(fps_center[0])
        # target fingerprints number
        fps_num = len(fps_center)
    else:
        fps_center = [None]
        fps_len = 0
        fps_num = 1

    model = VAE(model_vocab, fps_len, model_config)
    model.load_state_dict(model_state)
    model = model.to(device)
    model.eval()

    # Cleanup: removed a dead `gen, n = [], config.n_samples` that was
    # immediately shadowed by the per-fingerprint reset inside the loop.
    gen_samples = []
    # sample
    T = tqdm.tqdm(range(config.n_samples), desc='Generating mols')
    for i in range(fps_num):
        gen, n = [], config.n_samples
        while n > 0:
            x = model.sample(fps_center[i], config.conditional,
                             min(n, config.n_batch), config.max_len)[-1]
            mols = [model_vocab.ids2string(i_x.tolist()) for i_x in x]
            n -= len(mols)
            T.update(len(mols))
            T.refresh()
            gen.extend(mols)
        gen_samples.extend(gen)

    df = pd.DataFrame(gen_samples, columns=['SMILES'])
    df.to_csv(config.gen_save, index=False)
    # tanimoto similarity score and summary
    if config.conditional:
        calculate_score(config.gen_save, config.condition_load,
                        config.n_samples)