def test_valid_unique(self):
    """Check fraction_valid and fraction_unique on a small fixed sample.

    The same four entries (one of them the unparsable string 'INVALID',
    one a duplicate of 'CCC') are checked twice: first as raw SMILES
    strings, then after conversion with ``Chem.MolFromSmiles``.
    Expected values in both cases: 3/4 valid, 3/4 unique without
    validity checking, and 1.0 unique among the first k=2 entries.
    """
    smiles = ['CCNC', 'CCC', 'INVALID', 'CCC']

    def check(mols):
        # Identical expectations for both input representations.
        assert np.allclose(fraction_valid(mols), 3 / 4), "Failed valid"
        assert np.allclose(fraction_unique(mols, check_validity=False),
                           3 / 4), "Failed unique"
        assert np.allclose(fraction_unique(mols, k=2), 1), "Failed unique"

    disable_rdkit_log()
    try:
        check(smiles)
        check([Chem.MolFromSmiles(x) for x in smiles])
    finally:
        # Re-enable logging even when an assertion fails, so a failure
        # here does not leave logging switched off for later tests.
        enable_rdkit_log()
def get_all_metrics(test, gen, k=None, n_jobs=1, device='cpu',
                    batch_size=512, test_scaffolds=None,
                    ptest=None, ptest_scaffolds=None,
                    pool=None, gpu=None, train=None):
    """
    Computes all available metrics between test (scaffold test)
    and generated sets of SMILES.

    Parameters:
        test: list of test SMILES
        gen: list of generated SMILES
        k: int or list with values for unique@k. Will calculate number of
            unique molecules in the first k molecules. Default [1000, 10000]
        n_jobs: number of workers for parallel processing
        device: 'cpu' or 'cuda:n', where n is GPU device number
        batch_size: batch size for FCD metric
        test_scaffolds: list of scaffold test SMILES
            Will compute only on the general test set if not specified
        ptest: dict with precalculated statistics of the test set
        ptest_scaffolds: dict with precalculated statistics
            of the scaffold test set
        pool: optional multiprocessing pool to use for parallelization.
            A pool passed in by the caller is never closed here.
        gpu: deprecated, use `device`. When given it overrides `device`:
            -1 selects 'cpu', any other value n selects 'cuda:n'.
        train: list of train SMILES. Novelty is computed only when given.

    Returns:
        dict mapping metric name to its value. Keys are 'valid',
        'unique@<k>' for each k, '<FCD|SNN|Frag|Scaf>/Test',
        the same four with the '/TestSF' suffix when scaffold statistics
        are available, 'IntDiv', 'IntDiv2', 'Filters', 'logP', 'SA',
        'QED', 'NP', 'weight' and, if `train` is given, 'Novelty'.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, NP, weight
        * Novelty (molecules not present in train)
    """
    if k is None:
        k = [1000, 10000]
    if not isinstance(k, (list, tuple)):
        k = [k]

    disable_rdkit_log()
    close_pool = False
    try:
        if gpu is not None:
            # stacklevel=2 attributes the warning to the caller's line.
            warnings.warn(
                "parameter `gpu` is deprecated. Use `device`",
                DeprecationWarning,
                stacklevel=2
            )
            if gpu == -1:
                device = 'cpu'
            else:
                device = 'cuda:{}'.format(gpu)

        if pool is None:
            if n_jobs != 1:
                pool = Pool(n_jobs)
                # Only a pool created here is ours to shut down.
                close_pool = True
            else:
                pool = 1

        metrics = {}
        metrics['valid'] = fraction_valid(gen, n_jobs=pool)
        gen = remove_invalid(gen, canonize=True)
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test, n_jobs=n_jobs,
                                                    device=device,
                                                    batch_size=batch_size,
                                                    pool=pool)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds, n_jobs=n_jobs,
                device=device, batch_size=batch_size,
                pool=pool
            )

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'device': device,
                  'batch_size': batch_size}
        # FCD receives the integer n_jobs instead of the pool object.
        # NOTE(review): presumably FCD manages its own workers - confirm.
        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
                      'batch_size': batch_size}

        metrics['FCD/Test'] = FCDMetric(**kwargs_fcd)(gen=gen,
                                                      pref=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  pref=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs_fcd)(
                gen=gen, pref=ptest_scaffolds['FCD']
            )
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['SNN']
            )
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Frag']
            )
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Scaf']
            )

        metrics['IntDiv'] = internal_diversity(mols, pool, device=device)
        metrics['IntDiv2'] = internal_diversity(mols, pool, device=device,
                                                p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA),
                           ('QED', QED), ('NP', NP),
                           ('weight', weight)]:
            metrics[name] = FrechetMetric(func, **kwargs)(gen=mols,
                                                          pref=ptest[name])

        if train is not None:
            metrics['Novelty'] = novelty(mols, train, pool)
        return metrics
    finally:
        # Runs on success and on failure: restore logging and release
        # the worker processes we started.
        enable_rdkit_log()
        if close_pool:
            pool.close()
            pool.join()
help='Path to the config csv with `name` and `path` columns. ' '`name` is a model name, and ' '`path` is a path to generated samples`') parser.add_argument('--n_jobs', type=int, default=1, help='number of processes to use') parser.add_argument('--img_folder', type=str, default='images/', help='Store images in this folder') return parser if __name__ == "__main__": disable_rdkit_log() parser = get_parser() config, unknown = parser.parse_known_args() if len(unknown) != 0: raise ValueError("Unknown argument " + unknown[0]) os.makedirs(config.img_folder, exist_ok=True) generated = OrderedDict( {'MOSES': pd.DataFrame({'SMILES': get_dataset('test')})}) models = pd.read_csv(config.config) for path, name in zip(models['path'], models['name']): generated[name] = pd.read_csv(path) metrics = {'weight': weight, 'logP': logP, 'SA': SA, 'QED': QED}
def get_all_metrics(test, gen, k=None, n_jobs=1, gpu=-1,
                    batch_size=512, test_scaffolds=None,
                    ptest=None, ptest_scaffolds=None):
    '''
    Computes all available metrics between test (scaffold test)
    and generated sets of SMILES.

    Parameters:
        test: list of test SMILES
        gen: list of generated SMILES
        k: int or list with values for unique@k. Will calculate number of
            unique molecules in the first k molecules.
            Default [1000, 10000]
        n_jobs: number of workers for parallel processing
        gpu: index of GPU for FCD metric and internal diversity,
            -1 means use CPU
        batch_size: batch size for FCD metric
        test_scaffolds: list of scaffold test SMILES
            Will compute only on the general test set if not specified
        ptest: dict with precalculated statistics of the test set
        ptest_scaffolds: dict with precalculated statistics
            of the scaffold test set

    Returns:
        dict mapping metric name to its value. Keys are 'valid',
        'unique@<k>' for each k, '<FCD|SNN|Frag|Scaf>/Test',
        the same four with the '/TestSF' suffix when scaffold statistics
        are available, 'IntDiv', 'IntDiv2', 'Filters', 'logP', 'SA',
        'QED', 'NP' and 'weight'.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, NP, weight
    '''
    # None sentinel instead of a list literal in the signature, so no
    # list object is shared between calls.
    if k is None:
        k = [1000, 10000]
    if not isinstance(k, (list, tuple)):
        k = [k]

    disable_rdkit_log()
    pool = None
    try:
        # With a single job the integer 1 is passed where a pool would be.
        pool = Pool(n_jobs) if n_jobs != 1 else 1

        metrics = {}
        # NOTE(review): this call receives n_jobs while every other call
        # below receives the pool - confirm whether that is intended.
        metrics['valid'] = fraction_valid(gen, n_jobs=n_jobs)
        gen = remove_invalid(gen, canonize=True)
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test, n_jobs=n_jobs,
                                                    gpu=gpu,
                                                    batch_size=batch_size)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds, n_jobs=n_jobs,
                gpu=gpu, batch_size=batch_size)

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'gpu': gpu, 'batch_size': batch_size}

        metrics['FCD/Test'] = FCDMetric(**kwargs)(gen=gen,
                                                  ptest=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  ptest=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    ptest=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    ptest=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs)(
                gen=gen, ptest=ptest_scaffolds['FCD'])
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['SNN'])
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['Frag'])
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['Scaf'])

        metrics['IntDiv'] = internal_diversity(mols, pool, gpu=gpu)
        metrics['IntDiv2'] = internal_diversity(mols, pool, gpu=gpu, p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA),
                           ('QED', QED), ('NP', NP),
                           ('weight', weight)]:
            metrics[name] = FrechetMetric(func, **kwargs)(gen=mols,
                                                          ptest=ptest[name])
        return metrics
    finally:
        # Runs on success and on failure: restore logging and stop the
        # worker processes started above.
        enable_rdkit_log()
        if pool is not None and n_jobs != 1:
            pool.terminate()
def get_all_metrics(gen, k=None, n_jobs=1,
                    device='cpu', batch_size=512, pool=None,
                    test=None, test_scaffolds=None,
                    ptest=None, ptest_scaffolds=None,
                    train=None):
    """
    Computes all available metrics between test (scaffold test)
    and generated sets of SMILES.

    Parameters:
        gen: list of generated SMILES
        k: int or list with values for unique@k. Will calculate number of
            unique molecules in the first k molecules. Default [1000, 10000]
        n_jobs: number of workers for parallel processing
        device: 'cpu' or 'cuda:n', where n is GPU device number
        batch_size: batch size for FCD metric
        pool: optional multiprocessing pool to use for parallelization.
            A pool passed in by the caller is never closed here.
        test (None or list): test SMILES. If None, will load
            a default test set
        test_scaffolds (None or list): scaffold test SMILES. If None, will
            load a default scaffold test set
        ptest (None or dict): precalculated statistics of the test set. If
            None, will load default test statistics. If you specified a
            custom test set, default test statistics will be ignored
        ptest_scaffolds (None or dict): precalculated statistics of the
            scaffold test set If None, will load default scaffold test
            statistics. If you specified a custom test set, default test
            statistics will be ignored
        train (None or list): train SMILES. If None, will load a default
            train set

    Returns:
        dict mapping metric name to its value. Keys are 'valid',
        'unique@<k>' for each k, '<FCD|SNN|Frag|Scaf>/Test',
        the same four with the '/TestSF' suffix, 'IntDiv', 'IntDiv2',
        'Filters', 'logP', 'SA', 'QED', 'weight' and 'Novelty'.

    Raises:
        ValueError: if `ptest` is given without `test`, or
            `ptest_scaffolds` is given without `test_scaffolds`.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, weight
        * Novelty (molecules not present in train)
    """
    if test is None:
        if ptest is not None:
            raise ValueError("You cannot specify custom test "
                             "statistics for default test set")
        test = get_dataset('test')
        ptest = get_statistics('test')

    if test_scaffolds is None:
        if ptest_scaffolds is not None:
            raise ValueError("You cannot specify custom scaffold test "
                             "statistics for default scaffold test set")
        test_scaffolds = get_dataset('test_scaffolds')
        ptest_scaffolds = get_statistics('test_scaffolds')

    # Explicit None check: truth-testing `train` raises for multi-element
    # array-likes and would discard an explicitly passed empty train set.
    if train is None:
        train = get_dataset('train')

    if k is None:
        k = [1000, 10000]
    if not isinstance(k, (list, tuple)):
        k = [k]

    disable_rdkit_log()
    close_pool = False
    try:
        if pool is None:
            if n_jobs != 1:
                pool = Pool(n_jobs)
                # Only a pool created here is ours to shut down.
                close_pool = True
            else:
                pool = 1

        metrics = {}
        metrics['valid'] = fraction_valid(gen, n_jobs=pool)
        gen = remove_invalid(gen, canonize=True)
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test, n_jobs=n_jobs,
                                                    device=device,
                                                    batch_size=batch_size,
                                                    pool=pool)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds, n_jobs=n_jobs,
                device=device, batch_size=batch_size,
                pool=pool)

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'device': device,
                  'batch_size': batch_size}
        # FCD receives the integer n_jobs instead of the pool object.
        # NOTE(review): presumably FCD manages its own workers - confirm.
        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
                      'batch_size': batch_size}

        metrics['FCD/Test'] = FCDMetric(**kwargs_fcd)(gen=gen,
                                                      pref=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  pref=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs_fcd)(
                gen=gen, pref=ptest_scaffolds['FCD'])
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['SNN'])
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Frag'])
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Scaf'])

        metrics['IntDiv'] = internal_diversity(mols, pool, device=device)
        metrics['IntDiv2'] = internal_diversity(mols, pool, device=device,
                                                p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA),
                           ('QED', QED),
                           ('weight', weight)]:
            metrics[name] = WassersteinMetric(func, **kwargs)(
                gen=mols, pref=ptest[name])

        if train is not None:
            metrics['Novelty'] = novelty(mols, train, pool)
        return metrics
    finally:
        # Runs on success and on failure: restore logging and release
        # the worker processes we started.
        enable_rdkit_log()
        if close_pool:
            pool.close()
            pool.join()