Beispiel #1
0
 def test_valid_unique(self):
     """Check fraction_valid and fraction_unique on a small fixed sample.

     The sample has four entries: 'CCC' appears twice and one entry is
     the string 'INVALID'. The same expectations are asserted twice:
     first on the raw SMILES strings, then on the objects returned by
     Chem.MolFromSmiles for those strings.
     """
     # RDKit logging is switched off for the duration of the test
     # (presumably to silence the parse error for 'INVALID' -- confirm).
     disable_rdkit_log()
     try:
         mols = ['CCNC', 'CCC', 'INVALID', 'CCC']
         assert np.allclose(fraction_valid(mols), 3 / 4), "Failed valid"
         assert np.allclose(fraction_unique(mols, check_validity=False),
                            3 / 4), "Failed unique"
         # Only the first two entries are considered here; both differ.
         assert np.allclose(fraction_unique(mols, k=2), 1), "Failed unique"

         # Repeat the same checks on pre-parsed molecules.
         mols = [Chem.MolFromSmiles(x) for x in mols]
         assert np.allclose(fraction_valid(mols), 3 / 4), "Failed valid"
         assert np.allclose(fraction_unique(mols, check_validity=False),
                            3 / 4), "Failed unique"
         assert np.allclose(fraction_unique(mols, k=2), 1), "Failed unique"
     finally:
         # Re-enable logging even when an assertion fails, so that a
         # failure here does not leave logging disabled for later tests.
         enable_rdkit_log()
Beispiel #2
0
def get_all_metrics(test,
                    gen,
                    k=(1000, 10000),
                    n_jobs=1,
                    gpu=-1,
                    batch_size=512,
                    test_scaffolds=None,
                    ptest=None,
                    ptest_scaffolds=None):
    '''
    Computes all available metrics between test (scaffold test) and
    generated sets of SMILES.

    Parameters:
        test: list of test SMILES
        gen: list of generated SMILES
        k: a single value or a list/tuple of values for unique@k.
            Will calculate number of unique molecules in the first k
            molecules. A single value is wrapped into a one-element list.
        n_jobs: number of workers for parallel processing; a
            multiprocessing pool is created when n_jobs != 1
        gpu: index of GPU for FCD metric and internal diversity,
            -1 means use CPU
        batch_size: batch size for FCD metric
        test_scaffolds: list of scaffold test SMILES
            Will compute only on the general test set if not specified
        ptest: dict with precalculated statistics of the test set;
            computed from `test` when None
        ptest_scaffolds: dict with precalculated statistics of the
            scaffold test set; computed from `test_scaffolds` when None
            and `test_scaffolds` is given

    Returns:
        dict mapping metric names to values. Keys: 'valid', 'unique@<k>'
        for every requested k, 'FCD/Test', 'SNN/Test', 'Frag/Test',
        'Scaf/Test' (plus the matching '.../TestSF' keys when scaffold
        statistics are available), 'IntDiv', 'IntDiv2', 'Filters',
        'logP', 'SA', 'QED', 'NP' and 'weight'.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, NP, weight
    '''
    disable_rdkit_log()
    owned_pool = None
    try:
        # `pool` is either a real multiprocessing pool or the integer 1;
        # the helpers below accept either form.
        if n_jobs != 1:
            owned_pool = Pool(n_jobs)
            pool = owned_pool
        else:
            pool = 1

        metrics = {}
        metrics['valid'] = fraction_valid(gen, n_jobs=n_jobs)
        # All remaining metrics are computed on the valid, canonized subset.
        gen = remove_invalid(gen, canonize=True)
        if not isinstance(k, (list, tuple)):
            k = [k]
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test,
                                                    n_jobs=n_jobs,
                                                    gpu=gpu,
                                                    batch_size=batch_size)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds, n_jobs=n_jobs, gpu=gpu,
                batch_size=batch_size)

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'gpu': gpu, 'batch_size': batch_size}
        # FCD is fed the SMILES list; the other metrics get `mols`.
        metrics['FCD/Test'] = FCDMetric(**kwargs)(gen=gen,
                                                  ptest=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  ptest=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    ptest=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    ptest=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs)(
                gen=gen, ptest=ptest_scaffolds['FCD'])
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['SNN'])
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['Frag'])
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, ptest=ptest_scaffolds['Scaf'])

        metrics['IntDiv'] = internal_diversity(mols, pool, gpu=gpu)
        metrics['IntDiv2'] = internal_diversity(mols, pool, gpu=gpu, p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA), ('QED', QED),
                           ('NP', NP), ('weight', weight)]:
            metrics[name] = FrechetMetric(func, **kwargs)(gen=mols,
                                                          ptest=ptest[name])
        return metrics
    finally:
        # Runs on success and on failure, so a raising metric neither
        # leaves RDKit logging disabled nor leaks worker processes.
        enable_rdkit_log()
        if owned_pool is not None:
            owned_pool.terminate()
Beispiel #3
0
def get_all_metrics(test, gen, k=None, n_jobs=1, device='cpu',
                    batch_size=512, test_scaffolds=None,
                    ptest=None, ptest_scaffolds=None,
                    pool=None, gpu=None, train=None):
    """
    Computes all available metrics between test (scaffold test)
    and generated sets of SMILES.

    Parameters:
        test: list of test SMILES
        gen: list of generated SMILES
        k: int or list with values for unique@k. Will calculate number of
            unique molecules in the first k molecules. Default [1000, 10000]
        n_jobs: number of workers for parallel processing
        device: 'cpu' or 'cuda:n', where n is GPU device number
        batch_size: batch size for FCD metric
        test_scaffolds: list of scaffold test SMILES
            Will compute only on the general test set if not specified
        ptest: dict with precalculated statistics of the test set
        ptest_scaffolds: dict with precalculated statistics
            of the scaffold test set
        pool: optional multiprocessing pool to use for parallelization.
            A pool passed in by the caller is never closed here; a pool
            created internally (pool is None and n_jobs != 1) is closed
            and joined before returning.
        gpu: deprecated, use `device`. When given, it overrides `device`
            (-1 maps to 'cpu', n maps to 'cuda:n').
        train: list of train SMILES; Novelty is computed only when given

    Returns:
        dict mapping metric names to values. Keys: 'valid', 'unique@<k>'
        for every requested k, 'FCD/Test', 'SNN/Test', 'Frag/Test',
        'Scaf/Test' (plus the matching '.../TestSF' keys when scaffold
        statistics are available), 'IntDiv', 'IntDiv2', 'Filters',
        'logP', 'SA', 'QED', 'NP', 'weight' and, if `train` is given,
        'Novelty'.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, NP, weight
        * Novelty (molecules not present in train)
    """
    if k is None:
        k = [1000, 10000]
    elif not isinstance(k, (list, tuple)):
        k = [k]

    # Resolve the deprecated `gpu` argument before touching any global
    # state, so a warning promoted to an error cannot leave logging off.
    if gpu is not None:
        warnings.warn(
            "parameter `gpu` is deprecated. Use `device`",
            DeprecationWarning,
            stacklevel=2  # attribute the warning to the caller's line
        )
        device = 'cpu' if gpu == -1 else 'cuda:{}'.format(gpu)

    disable_rdkit_log()
    owned_pool = None
    try:
        # `pool` ends up as a caller-supplied pool, a pool created here,
        # or the integer 1; the helpers below accept either form.
        if pool is None:
            if n_jobs != 1:
                owned_pool = Pool(n_jobs)
                pool = owned_pool
            else:
                pool = 1

        metrics = {}
        metrics['valid'] = fraction_valid(gen, n_jobs=pool)
        # All remaining metrics are computed on the valid, canonized subset.
        gen = remove_invalid(gen, canonize=True)
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test, n_jobs=n_jobs,
                                                    device=device,
                                                    batch_size=batch_size,
                                                    pool=pool)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds, n_jobs=n_jobs,
                device=device, batch_size=batch_size,
                pool=pool
            )

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'device': device,
                  'batch_size': batch_size}
        # FCD receives the plain worker count rather than the pool object
        # and is fed the SMILES list instead of `mols`.
        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
                      'batch_size': batch_size}
        metrics['FCD/Test'] = FCDMetric(**kwargs_fcd)(gen=gen,
                                                      pref=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  pref=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs_fcd)(
                gen=gen, pref=ptest_scaffolds['FCD']
            )
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['SNN']
            )
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Frag']
            )
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Scaf']
            )

        metrics['IntDiv'] = internal_diversity(mols, pool, device=device)
        metrics['IntDiv2'] = internal_diversity(mols, pool, device=device,
                                                p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA),
                           ('QED', QED), ('NP', NP),
                           ('weight', weight)]:
            metrics[name] = FrechetMetric(func, **kwargs)(gen=mols,
                                                          pref=ptest[name])

        if train is not None:
            metrics['Novelty'] = novelty(mols, train, pool)
        return metrics
    finally:
        # Runs on success and on failure, so a raising metric neither
        # leaves RDKit logging disabled nor leaks worker processes.
        enable_rdkit_log()
        if owned_pool is not None:
            owned_pool.close()
            owned_pool.join()
Beispiel #4
0
def get_all_metrics(gen,
                    k=None,
                    n_jobs=1,
                    device='cpu',
                    batch_size=512,
                    pool=None,
                    test=None,
                    test_scaffolds=None,
                    ptest=None,
                    ptest_scaffolds=None,
                    train=None):
    """
    Computes all available metrics between test (scaffold test)
    and generated sets of SMILES.

    Parameters:
        gen: list of generated SMILES
        k: int or list with values for unique@k. Will calculate number of
            unique molecules in the first k molecules. Default [1000, 10000]
        n_jobs: number of workers for parallel processing
        device: 'cpu' or 'cuda:n', where n is GPU device number
        batch_size: batch size for FCD metric
        pool: optional multiprocessing pool to use for parallelization.
            A pool passed in by the caller is never closed here; a pool
            created internally (pool is None and n_jobs != 1) is closed
            and joined before returning.

        test (None or list): test SMILES. If None, will load
            a default test set
        test_scaffolds (None or list): scaffold test SMILES. If None, will
            load a default scaffold test set
        ptest (None or dict): precalculated statistics of the test set. If
            None, will load default test statistics. If you specified a custom
            test set, default test statistics will be ignored
        ptest_scaffolds (None or dict): precalculated statistics of the
            scaffold test set. If None, will load default scaffold test
            statistics. If you specified a custom test set, default test
            statistics will be ignored
        train (None or list): train SMILES. If None, will load a default
            train set. An explicitly passed empty list is used as is.

    Returns:
        dict mapping metric names to values. Keys: 'valid', 'unique@<k>'
        for every requested k, 'FCD/Test', 'SNN/Test', 'Frag/Test',
        'Scaf/Test' (plus the matching '.../TestSF' keys when scaffold
        statistics are available), 'IntDiv', 'IntDiv2', 'Filters',
        'logP', 'SA', 'QED', 'weight' and 'Novelty'.

    Raises:
        ValueError: if `ptest` is given without `test`, or
            `ptest_scaffolds` is given without `test_scaffolds`.

    Available metrics:
        * %valid
        * %unique@k
        * Frechet ChemNet Distance (FCD)
        * Fragment similarity (Frag)
        * Scaffold similarity (Scaf)
        * Similarity to nearest neighbour (SNN)
        * Internal diversity (IntDiv)
        * Internal diversity 2: using square root of mean squared
            Tanimoto similarity (IntDiv2)
        * %passes filters (Filters)
        * Distribution difference for logP, SA, QED, weight
        * Novelty (molecules not present in train)
    """
    if test is None:
        if ptest is not None:
            raise ValueError("You cannot specify custom test "
                             "statistics for default test set")
        test = get_dataset('test')
        ptest = get_statistics('test')

    if test_scaffolds is None:
        if ptest_scaffolds is not None:
            raise ValueError("You cannot specify custom scaffold test "
                             "statistics for default scaffold test set")
        test_scaffolds = get_dataset('test_scaffolds')
        ptest_scaffolds = get_statistics('test_scaffolds')

    # Identity check rather than truthiness: an empty list must not be
    # replaced by the default set, and array-like inputs have no
    # unambiguous truth value.
    if train is None:
        train = get_dataset('train')

    if k is None:
        k = [1000, 10000]
    elif not isinstance(k, (list, tuple)):
        k = [k]

    disable_rdkit_log()
    owned_pool = None
    try:
        # `pool` ends up as a caller-supplied pool, a pool created here,
        # or the integer 1; the helpers below accept either form.
        if pool is None:
            if n_jobs != 1:
                owned_pool = Pool(n_jobs)
                pool = owned_pool
            else:
                pool = 1

        metrics = {}
        metrics['valid'] = fraction_valid(gen, n_jobs=pool)
        # All remaining metrics are computed on the valid, canonized subset.
        gen = remove_invalid(gen, canonize=True)
        for _k in k:
            metrics['unique@{}'.format(_k)] = fraction_unique(gen, _k, pool)

        if ptest is None:
            ptest = compute_intermediate_statistics(test,
                                                    n_jobs=n_jobs,
                                                    device=device,
                                                    batch_size=batch_size,
                                                    pool=pool)
        if test_scaffolds is not None and ptest_scaffolds is None:
            ptest_scaffolds = compute_intermediate_statistics(
                test_scaffolds,
                n_jobs=n_jobs,
                device=device,
                batch_size=batch_size,
                pool=pool)

        mols = mapper(pool)(get_mol, gen)
        kwargs = {'n_jobs': pool, 'device': device,
                  'batch_size': batch_size}
        # FCD receives the plain worker count rather than the pool object
        # and is fed the SMILES list instead of `mols`.
        kwargs_fcd = {'n_jobs': n_jobs, 'device': device,
                      'batch_size': batch_size}
        metrics['FCD/Test'] = FCDMetric(**kwargs_fcd)(gen=gen,
                                                      pref=ptest['FCD'])
        metrics['SNN/Test'] = SNNMetric(**kwargs)(gen=mols,
                                                  pref=ptest['SNN'])
        metrics['Frag/Test'] = FragMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Frag'])
        metrics['Scaf/Test'] = ScafMetric(**kwargs)(gen=mols,
                                                    pref=ptest['Scaf'])
        if ptest_scaffolds is not None:
            metrics['FCD/TestSF'] = FCDMetric(**kwargs_fcd)(
                gen=gen, pref=ptest_scaffolds['FCD'])
            metrics['SNN/TestSF'] = SNNMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['SNN'])
            metrics['Frag/TestSF'] = FragMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Frag'])
            metrics['Scaf/TestSF'] = ScafMetric(**kwargs)(
                gen=mols, pref=ptest_scaffolds['Scaf'])

        metrics['IntDiv'] = internal_diversity(mols, pool, device=device)
        metrics['IntDiv2'] = internal_diversity(mols, pool, device=device,
                                                p=2)
        metrics['Filters'] = fraction_passes_filters(mols, pool)

        # Properties
        for name, func in [('logP', logP), ('SA', SA), ('QED', QED),
                           ('weight', weight)]:
            metrics[name] = WassersteinMetric(func, **kwargs)(
                gen=mols, pref=ptest[name])

        if train is not None:
            metrics['Novelty'] = novelty(mols, train, pool)
        return metrics
    finally:
        # Runs on success and on failure, so a raising metric neither
        # leaves RDKit logging disabled nor leaks worker processes.
        enable_rdkit_log()
        if owned_pool is not None:
            owned_pool.close()
            owned_pool.join()