def test_Oracle(self):
    """Smoke-test the 'SA' and 'Hop' oracles.

    Each oracle is constructed and called on a short list of SMILES
    strings. The returned scores are not inspected; the test passes as
    long as neither construction nor the call raises.
    """
    from tdc import Oracle

    oracle = Oracle(name='SA')
    oracle([
        'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
        'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
        'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
    ])

    oracle = Oracle(name='Hop')
    oracle(['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O'])
def get(self, benchmark, num_max_call=5000):
    """Return the data, or the docking oracle, of one benchmark.

    Args:
        benchmark: Benchmark name; resolved against ``self.dataset_names``
            through ``fuzzy_search``.
        num_max_call: Passed to the docking ``Oracle`` as ``num_max_call``.
            Only used when ``self.name`` is ``'docking_group'``.

    Returns:
        ``{'oracle': ..., 'name': ...}`` when ``self.name`` is
        ``'docking_group'``; otherwise
        ``{'train_val': ..., 'test': ..., 'name': ...}``.
    """
    dataset = fuzzy_search(benchmark, self.dataset_names)
    data_path = os.path.join(self.path, dataset)

    # Tabular formats differ only in the reader and the file names.
    tabular = {
        'csv': (pd.read_csv, 'train_val.csv', 'test.csv'),
        'pkl': (pd.read_pickle, 'train_val.pkl', 'test.pkl'),
    }
    fmt = self.file_format
    if fmt in tabular:
        reader, train_file, test_file = tabular[fmt]
        train = reader(os.path.join(data_path, train_file))
        test = reader(os.path.join(data_path, test_file))
    elif fmt == 'oracle':
        target_pdb_file = os.path.join(self.path, dataset + '.pdb')

    if self.name != 'docking_group':
        return {'train_val': train, 'test': test, 'name': dataset}

    oracle = Oracle(
        name="Docking_Score",
        software="vina",
        pyscreener_path=self.pyscreener_path,
        receptors=[target_pdb_file],
        center=docking_target_info[dataset]['center'],
        size=docking_target_info[dataset]['size'],
        buffer=10,
        path=data_path,
        num_worker=self.num_workers,
        ncpu=self.num_cpus,
        num_max_call=num_max_call,
    )
    return {'oracle': oracle, 'name': dataset}
# Constructor: stores ``name`` on the instance, calls
# ``super().__init__(score_modifier=None)`` and builds a TDC ``Oracle`` in
# ``self.oracle``:
#   - a name that does not contain 'docking' (case-insensitive) is passed
#     straight to ``Oracle(name=...)``;
#   - 'docking_5wiu' and 'docking_drd3' (case-insensitive) build a
#     'Docking_Score' oracle (software='vina') from hard-coded pyscreener paths.
# NOTE(review): any other name containing 'docking' matches no branch, so the
# visible code never assigns ``self.oracle`` for it.
# NOTE(review): indentation is lost in this excerpt, so it is unclear whether
# the ``docking_num_file`` reset (``write_num(..., 0)``) and the final
# ``print`` run only for 'docking_drd3' or for every name -- confirm against
# the original file before restructuring.
def __init__(self, name): ## DRD2 GSK3B JNK3 cyp3a4_benchmark from tdc import Oracle self.name = name super().__init__(score_modifier=None) if 'docking' not in self.name.lower(): ### drd2 gsk3 JNK3 self.oracle = Oracle(name=self.name) elif self.name.lower() == 'docking_5wiu': self.oracle = Oracle( name='Docking_Score', software='vina', pyscreener_path='/project/molecular_data/graphnn/pyscreener', receptors=[ '/project/molecular_data/graphnn/pyscreener/testing_inputs/5WIU.pdb' ], docked_ligand_file= '/project/molecular_data/graphnn/pyscreener/testing_inputs/5WIU_with_ligand.pdb', buffer=10, path='/project/molecular_data/graphnn/pyscreener/my_test/', num_worker=1, ncpu=4) elif self.name.lower() == 'docking_drd3': self.oracle = Oracle( name='Docking_Score', software='vina', pyscreener_path='/project/molecular_data/graphnn/pyscreener', receptors=[ '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb' ], center=(9, 22.5, 26), size=(15, 15, 15), buffer=10, path='/project/molecular_data/graphnn/pyscreener/my_test/', num_worker=1, ncpu=10) self.docking_num_file = "/project/molecular_data/graphnn/pyscreener/docking_num.txt" write_num(self.docking_num_file, 0) print('----------initialize docking_num_file-------------')
def __next__(self):
    """Return the next benchmark of the group and advance ``self.index``.

    The dataset directory is created when it does not exist yet. Depending
    on ``self.file_format`` the train/validation and test tables are read
    from csv or pickle files, or the path of the target ``.pdb`` file is
    built.

    Returns:
        ``{'oracle': ..., 'name': ...}`` when ``self.name`` is
        ``'docking_group'``; otherwise
        ``{'train_val': ..., 'test': ..., 'name': ...}``.

    Raises:
        StopIteration: Once ``self.index`` has reached ``self.num_datasets``.
    """
    if not self.index < self.num_datasets:
        raise StopIteration

    dataset = self.dataset_names[self.index]
    print_sys('--- ' + dataset + ' ---')

    data_path = os.path.join(self.path, dataset)
    if not os.path.exists(data_path):
        os.mkdir(data_path)

    # Tabular formats differ only in the reader and the file names.
    tabular = {
        'csv': (pd.read_csv, 'train_val.csv', 'test.csv'),
        'pkl': (pd.read_pickle, 'train_val.pkl', 'test.pkl'),
    }
    fmt = self.file_format
    if fmt in tabular:
        reader, train_file, test_file = tabular[fmt]
        train = reader(os.path.join(data_path, train_file))
        test = reader(os.path.join(data_path, test_file))
    elif fmt == 'oracle':
        target_pdb_file = os.path.join(self.path, dataset + '.pdb')

    self.index += 1

    if self.name != 'docking_group':
        return {'train_val': train, 'test': test, 'name': dataset}

    oracle = Oracle(
        name="Docking_Score",
        software="vina",
        pyscreener_path=self.pyscreener_path,
        receptors=[target_pdb_file],
        center=docking_target_info[dataset]['center'],
        size=docking_target_info[dataset]['size'],
        buffer=10,
        path=data_path,
        num_worker=self.num_workers,
        ncpu=self.num_cpus,
        num_max_call=self.num_max_call,
    )
    return {'oracle': oracle, 'name': dataset}
from tdc import Oracle

# Build the 'isomers_c7h8n2o2' oracle, call it on two example SMILES strings
# and print whatever it returns.
oracle = Oracle(name='isomers_c7h8n2o2')
print(
    oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
)
from absl import app
from absl import flags
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from tensorflow.compat.v1 import gfile
from dqn import deep_q_networks
from dqn import molecules as molecules_mdp
from dqn import run_dqn
from dqn.py import molecules
from dqn.tensorflow_core import core
from tdc import Oracle

# Module-level TDC oracles, one per property.
qed = Oracle(name='qed')
logp = Oracle(name='logp')
jnk = Oracle(name='JNK3')
gsk = Oracle(name='GSK3B')
from scipy.stats import gmean


def logp_modifier(logp_score):
    """Scale ``logp_score + 10`` by 1/14 and clamp the result to [0, 1]."""
    scaled = 1 / 14 * (logp_score + 10)
    return max(0.0, min(1.0, scaled))


def qed_logp_jnk_gsk_fusion(qed_score, logp_score, jsn_score, gsk_score):
    """Combine the four scores through their geometric mean.

    ``logp_score`` is first passed through ``logp_modifier``; the other
    three scores are used as given.
    """
    logp_score = logp_modifier(logp_score)
    gmean_score = gmean([qed_score, logp_score, jsn_score, gsk_score])
    # NOTE(review): the visible excerpt ends here -- no return statement is
    # shown, so what the function hands back cannot be confirmed.
def test_Oracle(self):
    """Smoke-test the 'Hop' oracle on two SMILES strings and print the result."""
    from tdc import Oracle

    hop_oracle = Oracle(name='Hop')
    scores = hop_oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
    print(scores)
def test_Oracle(self):
    """Smoke-test the 'celecoxib rediscovery' oracle and print its output."""
    from tdc import Oracle

    rediscovery = Oracle(name='celecoxib rediscovery')
    result = rediscovery([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
    print(result)
from tdc import Oracle

# Call the 'aripiprazole_similarity' oracle on two example SMILES strings and
# print what it returns.
oracle = Oracle(name='aripiprazole_similarity')
print(
    oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
)
def main():
    """Parse the command line, pick a scoring oracle and run ``optimization``.

    Command-line options:
        --oracle_num: int, default 1500; forwarded to ``optimization``.
        --oracle_name: which scorer to optimise, default ``'qed'``.
        --generations: int, default 50.
        --population_size: int, default 20.

    The GNN is loaded from a fixed checkpoint path and moved to the CPU;
    the result file is ``result/<oracle_name>.pkl``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--oracle_num', type=int, default=1500)
    parser.add_argument(
        '--oracle_name',
        type=str,
        default="qed",
        # 'logp' is listed so the logp scorer defined below can be selected.
        choices=['jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk', 'logp'])
    parser.add_argument('--generations', type=int, default=50)
    parser.add_argument('--population_size', type=int, default=20)
    args = parser.parse_args()

    oracle_num = args.oracle_num
    oracle_name = args.oracle_name
    generations = args.generations
    population_size = args.population_size

    start_smiles_lst = ['C1(N)=NC=CC=N1']  ## 'C1=CC=CC=C1NC2=NC=CC=N2'

    qed = Oracle('qed')
    sa = Oracle('sa')
    jnk = Oracle('JNK3')
    gsk = Oracle('GSK3B')
    logp = Oracle('logp')

    mu = 2.230044
    sigma = 0.6526308

    def normalize_sa(smiles):
        """Map the SA score into (0, 1].

        Scores at or below ``mu`` give 1.0; larger scores decay as
        ``exp(-0.5 * ((score - mu) / sigma) ** 2)``.
        """
        sa_score = sa(smiles)
        mod_score = np.maximum(sa_score, mu)
        return np.exp(-0.5 * np.power((mod_score - mu) / sigma, 2.))

    # One scorer per accepted --oracle_name value.
    scorers = {
        'jnkgsk': lambda smiles: np.mean((jnk(smiles), gsk(smiles))),
        'qedsajnkgsk': lambda smiles: np.mean(
            (qed(smiles), normalize_sa(smiles), jnk(smiles), gsk(smiles))),
        'qed': lambda smiles: qed(smiles),
        'jnk': lambda smiles: jnk(smiles),
        'gsk': lambda smiles: gsk(smiles),
        'logp': lambda smiles: logp(smiles),
    }
    oracle = scorers[oracle_name]

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'  ## cpu is better
    model_ckpt = "save_model/GNN_epoch_0_validloss_1.61160.ckpt"
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    gnn = torch.load(model_ckpt)
    gnn.switch_device(device)

    result_pkl = "result/" + oracle_name + ".pkl"
    optimization(start_smiles_lst,
                 gnn,
                 oracle,
                 oracle_num,
                 oracle_name,
                 generations=generations,
                 population_size=population_size,
                 lamb=2,
                 topk=5,
                 epsilon=0.7,
                 result_pkl=result_pkl)
def test_Oracle(self):
    """Smoke-test a series of molecule-generation oracles.

    Every oracle is constructed and then called once. The returned values
    are not inspected. 'Novelty' receives the same SMILES list as both of
    its arguments.
    """
    # Molecule Generation Oracles
    from tdc import Oracle

    def four_molecules():
        # Fresh list on every call, so each oracle gets its own object.
        return [
            'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
            'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1',
            'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
            'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
        ]

    def two_molecules():
        return ['CC(=O)OC1=CC=CC=C1C(=O)O', 'C1=CC=C(C=C1)C=O']

    # (oracle name, input factory, how many times the list is passed)
    cases = [
        ('GSK3B', four_molecules, 1),
        ('DRD2', four_molecules, 1),
        ('Hop', two_molecules, 1),
        ('Valsartan_SMARTS', two_molecules, 1),
        ('Rediscovery', two_molecules, 1),
        ('SA', four_molecules, 1),
        ('Uniqueness', four_molecules, 1),
        ('Novelty', four_molecules, 2),
        ('Diversity', four_molecules, 1),
        ('Scaffold Hop', two_molecules, 1),
    ]
    for oracle_name, make_smiles, repeat in cases:
        oracle = Oracle(name=oracle_name)
        smiles_lst = make_smiles()
        oracle(*([smiles_lst] * repeat))
def test_Oracle(self):
    """Smoke-test the 'isomers_c7h8n2o2' oracle and print what it returns."""
    from tdc import Oracle

    isomer_oracle = Oracle(name='isomers_c7h8n2o2')
    output = isomer_oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
    print(output)
from tdc import Oracle

# Call the 'celecoxib rediscovery' oracle on two example SMILES strings and
# print what it returns.
oracle = Oracle(name='celecoxib rediscovery')
print(
    oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
)
from tdc import Oracle

# Call the 'Hop' oracle on two example SMILES strings and print what it
# returns.
oracle = Oracle(name='Hop')
print(
    oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
)
def test_Oracle(self):
    """Smoke-test the 'aripiprazole_similarity' oracle and print its output."""
    from tdc import Oracle

    similarity_oracle = Oracle(name='aripiprazole_similarity')
    values = similarity_oracle([
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'C1=CC=C(C=C1)C=O',
    ])
    print(values)
from guacamol.goal_directed_generator import GoalDirectedGenerator
# from guacamol.scoring_function import ScoringFunction
from guacamol.utils.chemistry import canonicalize_list, canonicalize
from joblib import delayed
from smiles_lstm_hc.rnn_generator import SmilesRnnMoleculeGenerator
from smiles_lstm_hc.rnn_utils import load_rnn_model
from tdc import Oracle

# Module-level 'Docking_Score' oracle for the DRD3 receptor file, configured
# with hard-coded pyscreener paths and a fixed docking box.
drd3_oracle = Oracle(
    name='Docking_Score',
    software='vina',
    pyscreener_path='/project/molecular_data/graphnn/pyscreener',
    receptors=[
        '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb'
    ],
    center=(9, 22.5, 26),
    size=(15, 15, 15),
    buffer=10,
    path='/project/molecular_data/graphnn/pyscreener/my_test/',
    num_worker=1,
    ncpu=10)

# ``global`` at module scope has no effect; the name is simply bound below.
global oracle_num
oracle_num = 0


def drd3_docking_oracle(smiles):
    """Return the negated DRD3 docking score divided by 15, clamped to [0, 1]."""
    # oracle_num += 1
    # print('Docking call', oracle_num)
    scaled = -drd3_oracle(smiles) / 15.0
    return min(max(scaled, 0), 1)
def evaluate(self, pred, true=None, benchmark=None, criteria='all', m1_api=None):
    """Evaluate predictions for this benchmark group.

    The mode is chosen from ``self.name`` and ``true``:

    * Docking group (``self.name == 'docking_group'``): ``pred`` maps a
      dataset name to a list of exactly 100 SMILES strings. They are
      docked, and further metrics are added according to ``criteria``.
    * Test-set evaluation (``true is None``): ``pred`` maps dataset names
      to predictions, which are compared with the ``Y`` column of the
      stored test split.
    * Validation-set evaluation (``true`` given): ``pred`` and ``true``
      are scored with the standard metric of ``benchmark``.

    Args:
        pred: Predictions; a dict keyed by dataset name in the first two
            modes, the raw predictions in validation mode.
        true: Ground-truth values for validation mode, else ``None``.
        benchmark: Benchmark name. Required in validation mode. In docking
            mode it overrides the key of ``pred`` for the target lookup
            when given.
        criteria: Docking mode only: ``'all'``, ``'none'`` or an iterable
            drawn from 'm1', 'filters', 'diversity', 'validity',
            'uniqueness'.
        m1_api: API token for the 'Molecule One Synthesis' oracle; needed
            when 'm1' is among the criteria.

    Returns:
        Docking mode: ``{dataset name: results dict}``. Test mode:
        ``{dataset name: {metric name: value rounded to 3 decimals}}``.
        Validation mode: ``{metric name: value rounded to 3 decimals}``.

    Raises:
        ValueError: If a docking prediction does not hold exactly 100
            molecules, a criterion is unsupported, 'm1' is requested
            without ``m1_api``, or ``benchmark`` is missing in validation
            mode.
    """
    if self.name == 'docking_group':
        results_all = {}
        for data_name, pred_ in pred.items():
            results = {}
            ## pred is a list of smiles strings
            if len(pred_) != 100:
                raise ValueError(
                    "The expected output is a list of top 100 molecules!")

            # An explicit ``benchmark`` still wins; without one the key of
            # ``pred`` names the docking target.
            lookup_name = benchmark if benchmark is not None else data_name
            dataset = fuzzy_search(lookup_name, self.dataset_names)
            # Per-dataset working directory handed to the docking oracle.
            data_path = os.path.join(self.path, dataset)

            # docking scores for the top K smiles (K <= 100)
            target_pdb_file = os.path.join(self.path, dataset + '.pdb')
            oracle = Oracle(name="Docking_Score",
                            software="vina",
                            pyscreener_path=self.pyscreener_path,
                            receptors=[target_pdb_file],
                            center=docking_target_info[dataset]['center'],
                            size=docking_target_info[dataset]['size'],
                            buffer=10,
                            path=data_path,
                            num_worker=self.num_workers,
                            ncpu=self.num_cpus,
                            num_max_call=10000)

            docking_scores = oracle(pred_)
            results['docking_scores_dict'] = docking_scores
            values = np.array(list(docking_scores.values()))
            results['AVG_Top100'] = np.mean(values)
            results['AVG_Top10'] = np.mean(sorted(values)[:10])
            # NOTE(review): 'AVG_Top10' averages the 10 smallest scores
            # while 'Top1' takes the largest one -- confirm which direction
            # counts as "best" for docking scores.
            results['Top1'] = max(values)

            all_criteria = [
                'm1', 'filters', 'diversity', 'validity', 'uniqueness'
            ]
            if criteria == 'all':
                criteria = all_criteria
            elif criteria == 'none':
                criteria = []
            elif any(c not in all_criteria for c in criteria):
                # there is at least one criteria does not match the supported evaluation
                raise ValueError(
                    "Please select the criteria from a list of 'm1', 'filters', 'diversity', 'validity', 'uniqueness'!"
                )

            if 'm1' in criteria:
                if m1_api is None:
                    raise ValueError(
                        "Please input the m1_api token in the evaluate function call! You can obtain it via: https://tdcommons.ai/functions/oracles/#moleculeone"
                    )
                m1 = Oracle(name='Molecule One Synthesis', api_token=m1_api)
                m1_scores = m1(pred_)
                scores_array = list(m1_scores.values())
                results['m1_scores_dict'] = m1_scores
                results['AVG_m1_scores'] = np.mean(scores_array)
                ## TODO: how good is the m1 score? ask stan; 0.5 placeholder
                results['AVG_docking_scores_synthesizable'] = np.mean([
                    docking_scores[i] for i, j in m1_scores.items() if j > 0.5
                ])

            if 'filters' in criteria:
                from tdc.chem_utils import MolFilter
                ## TODO: select an optimal set of filters. test a bit.
                filters = MolFilter(filters=['PAINS'], HBD=[0, 6])
                pred_filter = filters(pred_)
                results['pass_filter_smiles_list'] = pred_filter
                results['unfiltered_fractions'] = float(
                    len(pred_filter)) / 100
                results['AVG_docking_scores_unfiltered'] = np.mean(
                    [docking_scores[i] for i in pred_filter])

            for key, evaluator_name in (('diversity', 'Diversity'),
                                        ('validity', 'Validity'),
                                        ('uniqueness', 'Uniqueness')):
                if key in criteria:
                    # Imported under an alias so that ``Evaluator`` does not
                    # become a function-local name; the branches below rely
                    # on the module-level ``Evaluator``.
                    from tdc import Evaluator as GenerationEvaluator
                    results[key] = GenerationEvaluator(
                        name=evaluator_name)(pred_)

            results_all[data_name] = results
        return results_all

    if true is None:
        # test set evaluation
        metric_dict = bm_metric_names[self.name]
        out = {}
        for data_name, pred_ in pred.items():
            data_name = fuzzy_search(data_name, self.dataset_names)
            data_path = os.path.join(self.path, data_name)
            if self.file_format == 'csv':
                test = pd.read_csv(os.path.join(data_path, 'test.csv'))
            elif self.file_format == 'pkl':
                test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
            y = test.Y.values
            evaluator = Evaluator(name=metric_dict[data_name])
            out[data_name] = {
                metric_dict[data_name]: round(evaluator(y, pred_), 3)
            }

            # If reporting accuracy across target classes
            if 'target_class' in test.columns:
                test['pred'] = pred_
                for c in test['target_class'].unique():
                    data_name_subset = data_name + '_' + c
                    test_subset = test[test['target_class'] == c]
                    y_subset = test_subset.Y.values
                    pred_subset = test_subset.pred.values
                    evaluator = Evaluator(name=metric_dict[data_name_subset])
                    out[data_name_subset] = {
                        metric_dict[data_name_subset]:
                            round(evaluator(y_subset, pred_subset), 3)
                    }
        return out

    # validation set evaluation
    if benchmark is None:
        raise ValueError(
            'Please specify the benchmark name for us to retrieve the standard metric!'
        )
    data_name = fuzzy_search(benchmark, self.dataset_names)
    metric_dict = bm_metric_names[self.name]
    evaluator = Evaluator(name=metric_dict[data_name])
    return {metric_dict[data_name]: round(evaluator(true, pred), 3)}
sys.path.append("..")
import os
import json
import numpy as np
import pandas as pd
import functools
from dqn import molecules
from dqn import deep_q_networks
from dqn.py.SA_Score import sascorer
from chemutil import similarity
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors, QED
from tdc import Oracle

# Module-level QED oracle.
qed_oracle = Oracle(name='qed')
# import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path


def latest_ckpt(path):
    """Return the highest checkpoint number found directly inside *path*.

    Only entries whose stem starts with 'ckpt' are considered; the number
    is the integer between the first and second '-' of the stem.
    ``max`` raises ``ValueError`` when no entry qualifies.
    """
    numbers = (
        int(entry.stem.split('-')[1])
        for entry in path.iterdir()
        if entry.stem.startswith('ckpt')
    )
    return max(numbers)


# basepath = '/Users/odin/sherlock_scratch/moldqn2/target_sas/mol%i_target_%.1f'
path = Path("save_qed")
seq = uniprot2seq('P49122')

# data split
from tdc.single_pred import ADME

data = ADME(name='Caco2_Wang')
split = data.get_split(method='scaffold')

from tdc.multi_pred import DTI

data = DTI(name='DAVIS')
split = data.get_split(method='cold_split', column_name='Drug')

# Molecule Generation Oracles
from tdc import Oracle

oracle = Oracle(name='GSK3B')
smiles_lst = [
    'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
    'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1',
    'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
    'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
]
oracle(smiles_lst)

oracle = Oracle(name='DRD2')
# Same four molecules again, as a fresh list object.
smiles_lst = list(smiles_lst)
oracle(smiles_lst)

oracle = Oracle(name='Hop')
oracle([
    'CC(=O)OC1=CC=CC=C1C(=O)O',
    'C1=CC=C(C=C1)C=O',
])