def test_distribution(self):
    """Smoke-test the TDC 'Diversity' evaluator on a small SMILES batch."""
    from tdc import Evaluator

    smiles_batch = [
        'CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
        'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1',
        'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
        'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O',
    ]
    diversity = Evaluator(name='Diversity')
    x = diversity(smiles_batch)
def __init__(self, input_shape, num_classes, num_domains, hparams):
    """Build the ERM model: DTI encoder + regression head, Adam optimizer,
    PCC evaluation metric and MSE training loss."""
    super(ERM, self).__init__(input_shape, num_classes, num_domains, hparams)

    # Encoder and prediction head, chained into a single network.
    self.featurizer = networks.DTI_Encoder()
    self.classifier = networks.Classifier(
        self.featurizer.n_outputs,
        num_classes,
        self.hparams['nonlinear_classifier'])
    self.network = mySequential(self.featurizer, self.classifier)

    self.optimizer = torch.optim.Adam(
        self.network.parameters(),
        lr=self.hparams["lr"],
        weight_decay=self.hparams['weight_decay'])

    # Pearson correlation is the reported metric; MSE drives training.
    from tdc import Evaluator
    self.evaluator = Evaluator(name='PCC')
    self.loss_fct = torch.nn.MSELoss()
def pcc(network, loader, weights, device):
    """Evaluate `network` on `loader` and score it with Pearson correlation.

    Args:
        network: model exposing ``predict(d, t)``, ``eval()`` and ``train()``.
        loader: iterable yielding (drug, target, label) batches.
        weights: unused; kept for signature compatibility with the other
            evaluation helpers in this module.
        device: torch device batches are moved to.

    Returns:
        Tuple ``(pred_all, y_all, score)`` — flat prediction and label lists
        plus the TDC 'PCC' evaluator score over them.
    """
    from tdc import Evaluator
    evaluator = Evaluator(name='PCC')

    pred_all = []
    y_all = []
    network.eval()
    with torch.no_grad():
        for d, t, y in loader:
            d = d.to(device)
            t = t.to(device)
            y_pred = network.predict(d, t)
            # extend() instead of repeated list concatenation, which
            # rebuilt the whole list every batch (O(n^2) overall).
            pred_all.extend(y_pred.reshape(-1,).detach().cpu().numpy().tolist())
            y_all.extend(y.cpu().numpy().tolist())
    network.train()
    return pred_all, y_all, evaluator(y_all, pred_all)
def __init__(self, input_shape, num_classes, num_domains, hparams):
    """Build the MTL model: shared encoder, per-domain embedding buffer,
    classifier head, Adam optimizer, PCC metric and MSE loss."""
    super(MTL, self).__init__(input_shape, num_classes, num_domains, hparams)

    from tdc import Evaluator

    self.featurizer = networks.DTI_Encoder()
    # Head consumes encoder output concatenated with a domain embedding,
    # hence the doubled input width.
    self.classifier = networks.Classifier(
        self.featurizer.n_outputs * 2,
        num_classes,
        self.hparams['nonlinear_classifier'])

    trainable_params = list(self.featurizer.parameters()) + \
        list(self.classifier.parameters())
    self.optimizer = torch.optim.Adam(
        trainable_params,
        lr=self.hparams["lr"],
        weight_decay=self.hparams['weight_decay']
    )

    # One embedding per training domain, updated via exponential moving average.
    self.register_buffer(
        'embeddings',
        torch.zeros(num_domains, self.featurizer.n_outputs))
    self.ema = self.hparams['mtl_ema']

    self.loss_fct = torch.nn.MSELoss()
    self.evaluator = Evaluator(name='PCC')
def test_Evaluator(self):
    """Smoke-test the TDC 'ROC-AUC' evaluator on a two-sample toy input."""
    from tdc import Evaluator

    roc_auc = Evaluator(name='ROC-AUC')
    print(roc_auc([0, 1], [0.5, 0.6]))
def evaluate(self, pred, true=None, benchmark=None, criteria='all', m1_api=None):
    """Score predictions for this benchmark group.

    Args:
        pred: dict mapping dataset name -> predictions. For the docking
            group each value must be a list of exactly 100 SMILES strings;
            otherwise an array-like of test-set predictions.
        true: ground-truth labels; when None, the stored test split is used.
        benchmark: benchmark name (required for docking and for
            validation-set evaluation).
        criteria: 'all', 'none', or a subset of
            ['m1', 'filters', 'diversity', 'validity', 'uniqueness'].
        m1_api: Molecule One API token, required when 'm1' is requested.

    Returns:
        dict of metric name -> score, nested per dataset for dict input.

    Raises:
        ValueError: on malformed docking predictions, unsupported criteria,
            a missing m1 token, or a missing benchmark name.
    """
    if self.name == 'docking_group':
        results_all = {}
        for data_name, pred_ in pred.items():
            results = {}
            ## pred is a list of smiles strings
            if len(pred_) != 100:
                raise ValueError(
                    "The expected output is a list of top 100 molecules!")
            dataset = fuzzy_search(benchmark, self.dataset_names)
            # BUGFIX: data_path was used below but never defined in this
            # branch (only in the test-set branch). Presumably the dataset
            # directory is intended — TODO confirm against pyscreener usage.
            data_path = os.path.join(self.path, dataset)

            # docking scores for the top K smiles (K <= 100)
            target_pdb_file = os.path.join(self.path, dataset + '.pdb')
            oracle = Oracle(name="Docking_Score",
                            software="vina",
                            pyscreener_path=self.pyscreener_path,
                            receptors=[target_pdb_file],
                            center=docking_target_info[dataset]['center'],
                            size=docking_target_info[dataset]['size'],
                            buffer=10,
                            path=data_path,
                            num_worker=self.num_workers,
                            ncpu=self.num_cpus,
                            num_max_call=10000)
            docking_scores = oracle(pred_)

            results['docking_scores_dict'] = docking_scores
            values = np.array(list(docking_scores.values()))
            results['AVG_Top100'] = np.mean(values)
            # Lower (more negative) docking scores are better, which is why
            # the ten smallest values form the Top10 average.
            results['AVG_Top10'] = np.mean(sorted(values)[:10])
            # BUGFIX: was max(values); the single best score is the minimum,
            # consistent with AVG_Top10 above.
            results['Top1'] = min(values)

            all_criteria = [
                'm1', 'filters', 'diversity', 'validity', 'uniqueness'
            ]
            if criteria == 'all':
                criteria = all_criteria
            elif criteria == 'none':
                criteria = []
            elif not all(i in all_criteria for i in criteria):
                # at least one requested criterion is not supported
                raise ValueError(
                    "Please select the criteria from a list of 'm1', 'filters', 'diversity', 'validity', 'uniqueness'!"
                )

            if 'm1' in criteria:
                if m1_api is None:
                    raise ValueError(
                        "Please input the m1_api token in the evaluate function call! You can obtain it via: https://tdcommons.ai/functions/oracles/#moleculeone"
                    )
                m1 = Oracle(name='Molecule One Synthesis', api_token=m1_api)
                m1_scores = m1(pred_)
                scores_array = list(m1_scores.values())
                results['m1_scores_dict'] = m1_scores
                results['AVG_m1_scores'] = np.mean(scores_array)
                ## TODO: how good is the m1 score? ask stan; 0.5 placeholder
                results['AVG_docking_scores_synthesizable'] = np.mean([
                    docking_scores[i] for i, j in m1_scores.items() if j > 0.5
                ])

            if 'filters' in criteria:
                from tdc.chem_utils import MolFilter
                ## TODO: select an optimal set of filters. test a bit.
                filters = MolFilter(filters=['PAINS'], HBD=[0, 6])
                pred_filter = filters(pred_)
                results['pass_filter_smiles_list'] = pred_filter
                results['unfiltered_fractions'] = float(
                    len(pred_filter)) / 100
                results['AVG_docking_scores_unfiltered'] = np.mean(
                    [docking_scores[i] for i in pred_filter])

            if 'diversity' in criteria:
                from tdc import Evaluator
                evaluator = Evaluator(name='Diversity')
                results['diversity'] = evaluator(pred_)
            if 'validity' in criteria:
                from tdc import Evaluator
                evaluator = Evaluator(name='Validity')
                results['validity'] = evaluator(pred_)
            if 'uniqueness' in criteria:
                from tdc import Evaluator
                evaluator = Evaluator(name='Uniqueness')
                results['uniqueness'] = evaluator(pred_)

            # BUGFIX: was results_all[dataset_name] — an undefined name
            # (NameError); use the fuzzy-matched canonical dataset name.
            results_all[dataset] = results
        return results_all

    if true is None:
        # test set evaluation
        metric_dict = bm_metric_names[self.name]
        out = {}
        for data_name, pred_ in pred.items():
            data_name = fuzzy_search(data_name, self.dataset_names)
            data_path = os.path.join(self.path, data_name)
            if self.file_format == 'csv':
                test = pd.read_csv(os.path.join(data_path, 'test.csv'))
            elif self.file_format == 'pkl':
                test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
            y = test.Y.values
            # IDIOM/SECURITY FIX: construct the evaluator directly instead
            # of building a source string and calling eval().
            evaluator = Evaluator(name=metric_dict[data_name])
            out[data_name] = {
                metric_dict[data_name]: round(evaluator(y, pred_), 3)
            }

            # If reporting accuracy across target classes
            if 'target_class' in test.columns:
                test['pred'] = pred_
                for c in test['target_class'].unique():
                    data_name_subset = data_name + '_' + c
                    test_subset = test[test['target_class'] == c]
                    y_subset = test_subset.Y.values
                    pred_subset = test_subset.pred.values
                    evaluator = Evaluator(name=metric_dict[data_name_subset])
                    out[data_name_subset] = {
                        metric_dict[data_name_subset]:
                            round(evaluator(y_subset, pred_subset), 3)
                    }
        return out
    else:
        # validation set evaluation
        if benchmark is None:
            raise ValueError(
                'Please specify the benchmark name for us to retrieve the standard metric!'
            )
        data_name = fuzzy_search(benchmark, self.dataset_names)
        metric_dict = bm_metric_names[self.name]
        evaluator = Evaluator(name=metric_dict[data_name])
        return {metric_dict[data_name]: round(evaluator(true, pred), 3)}
# Log the resolved hyperparameters so each run records its exact config.
for k, v in sorted(hparams.items()):
    print('\t{}: {}'.format(k, v))

# Seed every RNG in play and force deterministic cuDNN so runs with the
# same --seed are reproducible.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pearson correlation is the evaluation metric for this benchmark.
from tdc import Evaluator
evaluator = Evaluator(name='PCC')

print("preparing datasets...")

# Domains are years 2013-2021: 2013-2018 for training, 2019-2021 held out.
ENVIRONMENTS = [str(i) for i in list(range(2013, 2022))]
TRAIN_ENV = [str(i) for i in list(range(2013, 2019))]
TEST_ENV = ['2019', '2020', '2021']
# Map positional environment indices back to the year strings.
idx2train_env = dict(zip(range(len(TRAIN_ENV)), TRAIN_ENV))
idx2test_env = dict(zip(range(len(TEST_ENV)), TEST_ENV))

dataset = datasets.TdcDtiDg(args.data_dir, args.test_envs, hparams)

# Accumulators for the per-environment splits built below.
in_splits = []
out_splits = []
uda_splits = []
test_set = []
from tdc import Evaluator
from chemutils import *

## 2. data and oracle
# qed = Oracle(name = 'qed')
# logp = Oracle(name = 'logp')
# jnk = Oracle(name = 'JNK3')
# gsk = Oracle(name = 'GSK3B')
# def foracle(smiles):
#     return logp(smiles)

# Target property combination is selected on the command line.
oracle_name = sys.argv[1]  # 'jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk'

# Distribution-learning metrics: internal diversity of the generated set
# and novelty relative to the training molecules.
diversity = Evaluator(name = 'Diversity')
novelty = Evaluator(name = 'Novelty')

# Reference set: first 1000 SMILES (first whitespace-separated token per
# line) of the cleaned ZINC subset.
file = "data/zinc_clean.txt"
with open(file, 'r') as fin:
    lines = fin.readlines()
train_smiles_lst = [line.strip().split()[0] for line in lines][:1000]

## 5. run
if __name__ == "__main__":
    # result_file = "result/denovo_from_" + start_smiles_lst[0] + "_generation_" + str(generations) + "_population_" + str(population_size) + ".pkl"
    # result_pkl = "result/ablation_dmg_topo_dmg_substr.pkl"
    # pkl_file = "result/denovo_qedlogpjnkgsk_start_ncncccn.pkl"
import functools import matplotlib.pyplot as plt import tensorflow as tf from dqn import molecules from dqn import deep_q_networks from dqn.py.SA_Score import sascorer from chemutil import similarity from rdkit import Chem, DataStructs from rdkit.Chem import AllChem, Draw, Descriptors, QED from tdc import Oracle qed_oracle = Oracle(name='qed') from tdc import Evaluator diversity = Evaluator(name='Diversity') import pyscreener from tdc import Oracle oracle2 = Oracle( name='Docking_Score', software='vina', pyscreener_path='./', receptors=[ '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb' ], center=(9, 22.5, 26), size=(15, 15, 15), buffer=10, path='./', num_worker=3,
# evaluators from tdc import Evaluator evaluator = Evaluator(name='ROC-AUC') print(evaluator([0, 1], [0.5, 0.6])) # Processing Helpers from tdc.single_pred import ADME data = ADME(name='Caco2_Wang') data.label_distribution() from tdc.multi_pred import DTI data = DTI(name='DAVIS') data.binarize(threshold=30, order='descending') from tdc.multi_pred import DTI data = DTI(name='DAVIS') data.convert_to_log() from tdc.multi_pred import DDI from tdc.utils import get_label_map data = DDI(name='DrugBank') split = data.get_split() get_label_map(name='DrugBank', task='DDI') from tdc.multi_pred import GDA data = GDA(name='DisGeNET') data.print_stats() from tdc.single_pred import HTS