Example #1
    def test_distribution(self):
        from tdc import Evaluator
        evaluator = Evaluator(name='Diversity')
        x = evaluator(['CC(C)(C)[C@H]1CCc2c(sc(NC(=O)COc3ccc(Cl)cc3)c2C(N)=O)C1',
                       'C[C@@H]1CCc2c(sc(NC(=O)c3ccco3)c2C(N)=O)C1',
                       'CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1',
                       'C[C@@H]1CCN(C(=O)CCCc2ccccc2)C[C@@H]1O'])
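The Diversity evaluator reduces a list of SMILES to a single scalar (the average pairwise distance between molecular fingerprints). A minimal standalone sketch, using arbitrary small molecules in place of the test strings above:

from tdc import Evaluator

evaluator = Evaluator(name='Diversity')
# any list of valid SMILES works; these three are illustrative only
score = evaluator(['CCO', 'CCN', 'c1ccccc1'])
print(score)  # a single float; higher means a more internally diverse set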
Example #2
    def __init__(self, input_shape, num_classes, num_domains, hparams):
        super(ERM, self).__init__(input_shape, num_classes, num_domains,
                                  hparams)

        self.featurizer = networks.DTI_Encoder()
        self.classifier = networks.Classifier(
            self.featurizer.n_outputs, num_classes,
            self.hparams['nonlinear_classifier'])

        self.network = mySequential(self.featurizer, self.classifier)
        self.optimizer = torch.optim.Adam(
            self.network.parameters(),
            lr=self.hparams["lr"],
            weight_decay=self.hparams['weight_decay'])

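        # PCC = Pearson correlation coefficient, used below as the regression metric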
        from tdc import Evaluator
        self.evaluator = Evaluator(name='PCC')
        self.loss_fct = torch.nn.MSELoss()
Example #3
def pcc(network, loader, weights, device):
    """Collect predictions over a loader and score them with TDC's PCC evaluator."""
    from tdc import Evaluator
    evaluator = Evaluator(name='PCC')

    pred_all = []
    y_all = []

    network.eval()
    with torch.no_grad():
        for d, t, y in loader:
            d = d.to(device)
            t = t.to(device)
            y_pred = network.predict(d, t)
            pred_all.extend(y_pred.reshape(-1).detach().cpu().numpy().tolist())
            y_all.extend(y.cpu().numpy().tolist())
    network.train()
    return pred_all, y_all, evaluator(y_all, pred_all)
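A hedged usage sketch for `pcc`, assuming a trained model that exposes `predict(d, t)` and a loader yielding `(drug, target, label)` batches; `model` and `val_loader` are hypothetical stand-ins (note the `weights` argument is accepted but unused in the body above):

# hypothetical names: `model` and `val_loader` come from the caller
preds, labels, score = pcc(model, val_loader, weights=None, device='cpu')
print(f'validation PCC: {score:.3f}')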
Example #4
    def __init__(self, input_shape, num_classes, num_domains, hparams):
        super(MTL, self).__init__(input_shape, num_classes, num_domains,
                                  hparams)
        self.featurizer = networks.DTI_Encoder()
        self.classifier = networks.Classifier(
            self.featurizer.n_outputs * 2, num_classes,
            self.hparams['nonlinear_classifier'])
        self.optimizer = torch.optim.Adam(
            list(self.featurizer.parameters()) +
            list(self.classifier.parameters()),
            lr=self.hparams["lr"],
            weight_decay=self.hparams['weight_decay']
        )

        self.register_buffer(
            'embeddings', torch.zeros(num_domains, self.featurizer.n_outputs))

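        # exponential-moving-average coefficient for updating the per-domain
        # embeddings buffer registered above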
        self.ema = self.hparams['mtl_ema']
        self.loss_fct = torch.nn.MSELoss()
        from tdc import Evaluator
        self.evaluator = Evaluator(name='PCC')
Example #5
    def test_Evaluator(self):
        from tdc import Evaluator
        evaluator = Evaluator(name='ROC-AUC')
        # called as evaluator(y_true, y_predicted_scores)
        print(evaluator([0, 1], [0.5, 0.6]))
Example #6
    def evaluate(self,
                 pred,
                 true=None,
                 benchmark=None,
                 criteria='all',
                 m1_api=None):

        if self.name == 'docking_group':
            results_all = {}

            for data_name, pred_ in pred.items():

                results = {}

                ## pred_ is a list of SMILES strings
                if len(pred_) != 100:
                    raise ValueError(
                        "The expected output is a list of the top 100 molecules!")
                dataset = fuzzy_search(data_name, self.dataset_names)
                data_path = os.path.join(self.path, dataset)

                # docking scores for the top K smiles (K <= 100)
                target_pdb_file = os.path.join(self.path, dataset + '.pdb')

                oracle = Oracle(name="Docking_Score",
                                software="vina",
                                pyscreener_path=self.pyscreener_path,
                                receptors=[target_pdb_file],
                                center=docking_target_info[dataset]['center'],
                                size=docking_target_info[dataset]['size'],
                                buffer=10,
                                path=data_path,
                                num_worker=self.num_workers,
                                ncpu=self.num_cpus,
                                num_max_call=10000)

                docking_scores = oracle(pred_)
                results['docking_scores_dict'] = docking_scores
                values = np.array(list(docking_scores.values()))
                results['AVG_Top100'] = np.mean(values)
                results['AVG_Top10'] = np.mean(sorted(values)[:10])
                # lower (more negative) docking scores are better, so the best
                # single molecule is the minimum
                results['Top1'] = min(values)

                all_criteria = [
                    'm1', 'filters', 'diversity', 'validity', 'uniqueness'
                ]

                if criteria == 'all':
                    criteria = all_criteria
                elif criteria == 'none':
                    criteria = []
                else:
                    if any(i not in all_criteria for i in criteria):
                        # at least one requested criterion is not supported
                        raise ValueError(
                            "Please select the criteria from a list of 'm1', 'filters', 'diversity', 'validity', 'uniqueness'!"
                        )

                if 'm1' in criteria:
                    if m1_api is None:
                        raise ValueError(
                            "Please input the m1_api token in the evaluate function call! You can obtain it via: https://tdcommons.ai/functions/oracles/#moleculeone"
                        )
                    m1 = Oracle(name='Molecule One Synthesis',
                                api_token=m1_api)
                    m1_scores = m1(pred_)
                    scores_array = list(m1_scores.values())
                    results['m1_scores_dict'] = m1_scores
                    results['AVG_m1_scores'] = np.mean(scores_array)
                    ## TODO: how good is the m1 score? ask stan; 0.5 placeholder
                    results['AVG_docking_scores_synthesizable'] = np.mean([
                        docking_scores[i] for i, j in m1_scores.items()
                        if j > 0.5
                    ])

                if 'filters' in criteria:
                    from tdc.chem_utils import MolFilter
                    ## TODO: select an optimal set of filters. test a bit.
                    filters = MolFilter(filters=['PAINS'], HBD=[0, 6])
                    pred_filter = filters(pred_)
                    results['pass_filter_smiles_list'] = pred_filter
                    results['unfiltered_fractions'] = float(
                        len(pred_filter)) / 100
                    results['AVG_docking_scores_unfiltered'] = np.mean(
                        [docking_scores[i] for i in pred_filter])

                if 'diversity' in criteria:
                    from tdc import Evaluator
                    evaluator = Evaluator(name='Diversity')
                    score = evaluator(pred_)
                    results['diversity'] = score

                if 'validity' in criteria:
                    from tdc import Evaluator
                    evaluator = Evaluator(name='Validity')
                    score = evaluator(pred_)
                    results['validity'] = score

                if 'uniqueness' in criteria:
                    from tdc import Evaluator
                    evaluator = Evaluator(name='Uniqueness')
                    score = evaluator(pred_)
                    results['uniqueness'] = score

                results_all[data_name] = results
            return results_all

        if true is None:
            # test set evaluation
            metric_dict = bm_metric_names[self.name]
            out = {}
            for data_name, pred_ in pred.items():
                data_name = fuzzy_search(data_name, self.dataset_names)
                data_path = os.path.join(self.path, data_name)
                if self.file_format == 'csv':
                    test = pd.read_csv(os.path.join(data_path, 'test.csv'))
                elif self.file_format == 'pkl':
                    test = pd.read_pickle(os.path.join(data_path, 'test.pkl'))
                y = test.Y.values
                evaluator = Evaluator(name=metric_dict[data_name])
                out[data_name] = {
                    metric_dict[data_name]: round(evaluator(y, pred_), 3)
                }

                # If reporting accuracy across target classes
                if 'target_class' in test.columns:
                    test['pred'] = pred_
                    for c in test['target_class'].unique():
                        data_name_subset = data_name + '_' + c
                        test_subset = test[test['target_class'] == c]
                        y_subset = test_subset.Y.values
                        pred_subset = test_subset.pred.values

                        evaluator = Evaluator(name=metric_dict[data_name_subset])
                        out[data_name_subset] = {
                            metric_dict[data_name_subset]:
                            round(evaluator(y_subset, pred_subset), 3)
                        }
            return out
        else:
            # validation set evaluation
            if benchmark is None:
                raise ValueError(
                    'Please specify the benchmark name for us to retrieve the standard metric!'
                )
            data_name = fuzzy_search(benchmark, self.dataset_names)
            metric_dict = bm_metric_names[self.name]
            evaluator = Evaluator(name=metric_dict[data_name])
            return {metric_dict[data_name]: round(evaluator(true, pred), 3)}
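Taken together, `evaluate` has three paths: the `docking_group` branch scores generated molecules with a docking oracle plus optional extra criteria, the `true is None` branch scores test-set predictions per dataset, and the final branch scores a single validation split against the benchmark's standard metric. A hedged sketch of the validation path, where `group`, `val_pred`, and `val_true` are hypothetical stand-ins for a benchmark-group instance and its data:

# hypothetical objects; the benchmark name must fuzzy-match one of group.dataset_names
result = group.evaluate(pred=val_pred, true=val_true, benchmark='BindingDB_Patent')
print(result)  # {metric_name: rounded_score}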
Example #7
for k, v in sorted(hparams.items()):
    print('\t{}: {}'.format(k, v))

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

from tdc import Evaluator
evaluator = Evaluator(name='PCC')
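# the evaluator is called as evaluator(y_true, y_pred)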

print("preparing datasets...")
ENVIRONMENTS = [str(i) for i in range(2013, 2022)]
TRAIN_ENV = [str(i) for i in range(2013, 2019)]
TEST_ENV = ['2019', '2020', '2021']
idx2train_env = dict(enumerate(TRAIN_ENV))
idx2test_env = dict(enumerate(TEST_ENV))
dataset = datasets.TdcDtiDg(args.data_dir, args.test_envs, hparams)

in_splits = []
out_splits = []
uda_splits = []

test_set = []
Example #8
import sys

from tdc import Evaluator

from chemutils import *
## 2. data and oracle 
# qed = Oracle(name = 'qed')
# logp = Oracle(name = 'logp')
# jnk = Oracle(name = 'JNK3')
# gsk = Oracle(name = 'GSK3B')
# def foracle(smiles):
# 	return logp(smiles)

oracle_name = sys.argv[1]
# 'jnkgsk', 'qedsajnkgsk', 'qed', 'jnk', 'gsk'


diversity = Evaluator(name = 'Diversity')
novelty = Evaluator(name = 'Novelty')


file = "data/zinc_clean.txt"
with open(file, 'r') as fin:
	lines = fin.readlines() 
train_smiles_lst = [line.strip().split()[0] for line in lines][:1000] 
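# hedged sketch (not part of the original excerpt): the evaluators above are
# typically applied as
#   diversity(generated_smiles)
#   novelty(generated_smiles, train_smiles_lst)
# where generated_smiles would come from the optimization loop below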


## 5. run 
if __name__ == "__main__":

	# result_file = "result/denovo_from_" + start_smiles_lst[0] + "_generation_" + str(generations) + "_population_" + str(population_size) + ".pkl"
	# result_pkl = "result/ablation_dmg_topo_dmg_substr.pkl"
	# pkl_file = "result/denovo_qedlogpjnkgsk_start_ncncccn.pkl"
Example #9
import functools
import matplotlib.pyplot as plt
import tensorflow as tf

from dqn import molecules
from dqn import deep_q_networks
from dqn.py.SA_Score import sascorer
from chemutil import similarity

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors, QED

from tdc import Oracle
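# QED: quantitative estimate of drug-likeness, scored on a 0-1 scale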
qed_oracle = Oracle(name='qed')
from tdc import Evaluator
diversity = Evaluator(name='Diversity')

import pyscreener
from tdc import Oracle
oracle2 = Oracle(
    name='Docking_Score',
    software='vina',
    pyscreener_path='./',
    receptors=[
        '/project/molecular_data/graphnn/pyscreener/testing_inputs/DRD3.pdb'
    ],
    center=(9, 22.5, 26),
    size=(15, 15, 15),
    buffer=10,
    path='./',
    num_worker=3,
)
Example #10
# evaluators

from tdc import Evaluator
evaluator = Evaluator(name='ROC-AUC')
print(evaluator([0, 1], [0.5, 0.6]))

# Processing Helpers

from tdc.single_pred import ADME
data = ADME(name='Caco2_Wang')
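# visualize the distribution of the continuous labels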
data.label_distribution()

from tdc.multi_pred import DTI
data = DTI(name='DAVIS')
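# turn continuous Kd affinities into binary labels around the 30 nM threshold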
data.binarize(threshold=30, order='descending')

from tdc.multi_pred import DTI
data = DTI(name='DAVIS')
data.convert_to_log()

from tdc.multi_pred import DDI
from tdc.utils import get_label_map
data = DDI(name='DrugBank')
split = data.get_split()
get_label_map(name='DrugBank', task='DDI')

from tdc.multi_pred import GDA
data = GDA(name='DisGeNET')
data.print_stats()

from tdc.single_pred import HTS