Esempio n. 1
0
def remove_invalid(gen, canonize=True, n_jobs=1):
    """
    Drop the entries of `gen` for which molecule parsing yields None.

    Parameters:
        gen: collection of SMILES; it is iterated a second time when
            `canonize` is False, so it should be re-iterable
        canonize: if True, return the non-None results of `canonic_smiles`;
            if False, return the original entries whose `get_mol` result
            is not None
        n_jobs: forwarded to `mapper`

    Returns:
        list with the invalid entries removed
    """
    run = mapper(n_jobs)
    if canonize:
        canonical = run(canonic_smiles, gen)
        return [smi for smi in canonical if smi is not None]
    parsed = run(get_mol, gen)
    return [entry for entry, mol in zip(gen, parsed) if mol is not None]
Esempio n. 2
0
def compute_intermediate_statistics(smiles,
                                    n_jobs=1,
                                    device='cpu',
                                    batch_size=512,
                                    pool=None):
    """
    Precompute the SNN, Frag and Scaf reference statistics for `smiles`.

    It is useful to compute the statistics for test and scaffold test sets
    once to speed up later metrics calculation.

    NOTE: FCD and the property-distribution statistics (logP, SA, QED,
    weight) are not computed by this function.

    Parameters:
        smiles: collection of SMILES strings
        n_jobs: number of workers; a temporary `Pool(n_jobs)` is created
            when `pool` is None and `n_jobs != 1`
        device: passed through to the metric constructors
        batch_size: passed through to the metric constructors
        pool: optional existing pool to reuse; it is never closed here

    Returns:
        dict with the keys 'SNN', 'Frag' and 'Scaf', each holding the
        result of the corresponding metric's `precalc`
    """
    close_pool = False
    if pool is None:
        if n_jobs != 1:
            pool = Pool(n_jobs)
            close_pool = True
        else:
            pool = 1
    try:
        mols = mapper(pool)(get_mol, smiles)
        kwargs = {'n_jobs': pool, 'device': device, 'batch_size': batch_size}
        statistics = {
            'SNN': SNNMetric(**kwargs).precalc(mols),
            'Frag': FragMetric(**kwargs).precalc(mols),
            'Scaf': ScafMetric(**kwargs).precalc(mols),
        }
    finally:
        # Only tear down a pool created here, and do so even when one of
        # the precalc steps raises, so worker processes are not leaked.
        if close_pool:
            pool.terminate()
    return statistics
Esempio n. 3
0
def novelty(gen, train, n_jobs=1):
    """
    Computes the fraction of unique generated molecules that do not occur
    in the training set.

    Parameters:
        gen: sequence of generated SMILES or RDKit molecules; the type of
            the first element decides whether canonization is performed
        train: iterable of training SMILES
        n_jobs: forwarded to `mapper` when molecules must be canonized

    Returns:
        float in [0, 1]; 0.0 when `gen` is empty or holds no valid entry
    """
    # Nothing generated: avoid the IndexError on gen[0] below.
    if len(gen) == 0:
        return 0.0
    if isinstance(gen[0], rdkit.Chem.rdchem.Mol):
        gen_smiles = mapper(n_jobs)(canonic_smiles, gen)
    else:
        gen_smiles = gen
    gen_smiles_set = set(gen_smiles) - {None}
    # Every entry was None: avoid dividing by zero.
    if not gen_smiles_set:
        return 0.0
    train_set = set(train)
    return len(gen_smiles_set - train_set) / len(gen_smiles_set)
Esempio n. 4
0
def fraction_valid(gen, n_jobs=1):
    """
    Computes the fraction of valid molecules

    Parameters:
        gen: list of SMILES
        n_jobs: number of threads for calculation

    Returns:
        share of entries for which `get_mol` does not return None;
        0.0 when `gen` is empty
    """
    mols = mapper(n_jobs)(get_mol, gen)
    # An empty input has no valid molecules; avoid dividing by zero.
    if len(mols) == 0:
        return 0.0
    return 1 - mols.count(None) / len(mols)
Esempio n. 5
0
def compute_fragments(mol_list, n_jobs=1):
    """
    Tally the fragments that `fragmenter` yields over all molecules
    (BRICS fragmentation, per the original description).

    Parameters:
        mol_list: collection of molecules handed to `fragmenter`
        n_jobs: forwarded to `mapper`

    Returns:
        Counter of fragments, with the None key excluded
    """
    counts = Counter()
    per_molecule = mapper(n_jobs)(fragmenter, mol_list)
    for frags in per_molecule:
        counts.update(frags)
    # Discard the None entry if one was counted.
    counts.pop(None, None)
    return counts
Esempio n. 6
0
def fraction_passes_filters(gen, n_jobs=1):
    """
    Share of molecules accepted by `mol_passes_filters`.

    According to the original description the filters are:
    * MCF
    * PAINS
    * Only allowed atoms ('C','N','S','O','F','Cl','Br','H')
    * No charges

    Parameters:
        gen: collection of molecules to check
        n_jobs: forwarded to `mapper`

    Returns:
        mean of the per-molecule filter results
    """
    run = mapper(n_jobs)
    verdicts = run(mol_passes_filters, gen)
    return np.mean(verdicts)
Esempio n. 7
0
def compute_scaffolds(mol_list, n_jobs=1, min_rings=2):
    """
    Count the scaffolds that `compute_scaffold` returns for the molecules.

    Parameters:
        mol_list: collection of molecules
        n_jobs: forwarded to `mapper`
        min_rings: forwarded to `compute_scaffold`

    Returns:
        Counter keyed by scaffold, with the None key excluded
    """
    scaffold_of = partial(compute_scaffold, min_rings=min_rings)
    counts = Counter(mapper(n_jobs)(scaffold_of, mol_list))
    # Discard the None entry if one was counted.
    counts.pop(None, None)
    return counts
Esempio n. 8
0
def fingerprints(smiles_mols_array,
                 n_jobs=1,
                 already_unique=False,
                 *args,
                 **kwargs):
    '''
    Computes fingerprints of smiles np.array/list/pd.Series with n_jobs workers
    e.g. fingerprints(smiles_mols_array, fp_type='morgan', n_jobs=10)
    Inserts np.nan to rows corresponding to incorrect smiles.
    IMPORTANT: if there is at least one np.nan, the dtype would be float

    Parameters:
        smiles_mols_array: non-empty list/array/pd.Series of smiles or
            already computed RDKit molecules (the first element is inspected
            to tell the two apart)
        n_jobs: number of parallel workers to execute
        already_unique: flag for performance reasons, if smiles array is big
            and already unique. Its value is set to True if smiles_mols_array
            contain RDKit molecules already.
        *args, **kwargs: forwarded to `fingerprint`

    Returns:
        stacked fingerprints, one row per input entry: a CSR matrix when the
        first valid fingerprint is sparse, otherwise a numpy array. If no
        fingerprint could be computed, every row is a single np.nan.
    '''
    if isinstance(smiles_mols_array, pd.Series):
        smiles_mols_array = smiles_mols_array.values
    else:
        smiles_mols_array = np.asarray(smiles_mols_array)
    if not isinstance(smiles_mols_array[0], str):
        already_unique = True

    if not already_unique:
        # Compute each distinct smiles once; inv_index restores input order.
        smiles_mols_array, inv_index = np.unique(smiles_mols_array,
                                                 return_inverse=True)

    fps = mapper(n_jobs)(partial(fingerprint, *args, **kwargs),
                         smiles_mols_array)

    # The first valid fingerprint fixes the row width and the container type.
    # It stays None when every entry failed, which previously left the name
    # unbound and crashed on the sparse check below.
    first_fp = next((fp for fp in fps if fp is not None), None)
    length = 1 if first_fp is None else first_fp.shape[-1]
    # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
    fps = [
        fp if fp is not None else np.full((1, length), np.nan)
        for fp in fps
    ]
    if first_fp is not None and scipy.sparse.issparse(first_fp):
        fps = scipy.sparse.vstack(fps).tocsr()
    else:
        fps = np.vstack(fps)
    if not already_unique:
        return fps[inv_index]
    return fps
Esempio n. 9
0
def fraction_unique(gen, k=None, n_jobs=1, check_validity=True):
    """
    Computes the fraction of unique molecules

    Parameters:
        gen: list of SMILES
        k: compute unique@k
        n_jobs: number of threads for calculation
        check_validity: raises ValueError if invalid molecules are present

    Returns:
        number of distinct canonical SMILES divided by the number of
        molecules considered; 0.0 when there are no molecules to consider

    Raises:
        ValueError: if `check_validity` is True and a molecule cannot be
            canonized
    """
    if k is not None:
        if len(gen) < k:
            # Single format string so the two sentences are separated by a
            # space (the concatenated original ran them together).
            warnings.warn(
                "Can't compute unique@{}. gen contains only {} molecules"
                .format(k, len(gen)))
        gen = gen[:k]
    # Nothing to evaluate (empty input or k == 0): avoid dividing by zero.
    if len(gen) == 0:
        return 0.0
    canonic = set(mapper(n_jobs)(canonic_smiles, gen))
    if None in canonic and check_validity:
        raise ValueError("Invalid molecule passed to unique@k")
    return len(canonic) / len(gen)
Esempio n. 10
0
def numpy_fps_to_bitvectors(fps, n_jobs=1):
    """
    Convert fingerprints with `numpy_fp_to_bitvector` and return the
    distinct results.

    Parameters:
        fps: collection of fingerprints handed to `numpy_fp_to_bitvector`
        n_jobs: forwarded to `mapper`

    Returns:
        list of the distinct converted values, None excluded; the order is
        that of set iteration and therefore unspecified
    """
    converted = mapper(n_jobs)(numpy_fp_to_bitvector, fps)
    distinct = set(converted).difference({None})
    return list(distinct)
Esempio n. 11
0
 def precalc(self, mols):
     """
     Build the precalculated representation of `mols`.

     When `self.func` is set it is applied to every molecule through
     `mapper(self.n_jobs)`; otherwise `mols` is passed through unchanged.

     Returns:
         dict with the single key 'values'
     """
     if self.func is None:
         return {'values': mols}
     return {'values': mapper(self.n_jobs)(self.func, mols)}
Esempio n. 12
0
    def calculate(self,
                  gen,
                  calc_valid=False,
                  calc_unique=False,
                  unique_k=None,
                  se_k=None):
        """
        Compute the metrics for a collection of generated molecules.

        Invalid entries are removed and the rest canonized first; all
        metrics after the uniqueness ones are computed on the set of
        distinct valid SMILES.

        Parameters:
            gen: collection of generated SMILES
            calc_valid: if True, also report 'Validity'
            calc_unique: if True, also report 'Uniqueness' and, when
                `unique_k` is given, 'Unique@<k>k'
            unique_k: number of leading molecules used for unique@k
            se_k: if given, also report 'SEDiv@<k>k' via `se_diversity`

        Returns:
            dict mapping metric name to value. Reference-based metrics
            (FCD_*, SNN_*, Frag_*, Scaf_*) appear only for the reference
            sets configured on this object.
        """
        metrics = {}
        metrics['#'] = len(gen)

        # Calculate validity
        if calc_valid:
            metrics['Validity'] = fraction_valid(gen, self.pool)

        # Reuse the shared pool, like the other parallel helpers here.
        gen = remove_invalid(gen, canonize=True, n_jobs=self.pool)
        metrics['# valid'] = len(gen)

        # Calculate Uniqueness
        if calc_unique:
            metrics['Uniqueness'] = fraction_unique(gen=gen,
                                                    k=None,
                                                    n_jobs=self.pool)
            if unique_k is not None:
                metrics[f'Unique@{unique_k/1000:.0f}k'] = fraction_unique(
                    gen=gen, k=unique_k, n_jobs=self.pool)

        # Now subset only unique molecules. Molecules are parsed once, here:
        # nothing above needs them, so parsing before deduplication would
        # only repeat the work.
        gen = list(set(gen))
        mols = mapper(self.pool)(get_mol, gen)
        # Precalculate some things
        mol_fps = fingerprints(mols,
                               self.pool,
                               already_unique=True,
                               fp_type='morgan')
        scaffs = compute_scaffolds(mols, n_jobs=self.pool)
        scaff_gen = list(scaffs.keys())
        scaff_mols = mapper(self.pool)(get_mol, scaff_gen)
        metrics['# valid & unique'] = len(gen)

        # Calculate diversity related metrics
        if self.train is not None:
            metrics['Novelty'] = novelty(gen, self.train, self.pool)
        metrics['IntDiv1'] = internal_diversity(gen=mol_fps,
                                                n_jobs=self.pool,
                                                device=self.device)
        metrics['IntDiv2'] = internal_diversity(gen=mol_fps,
                                                n_jobs=self.pool,
                                                device=self.device,
                                                p=2)
        metrics['SEDiv'] = se_diversity(gen=mols, n_jobs=self.pool)
        if se_k is not None:
            metrics[f'SEDiv@{se_k/1000:.0f}k'] = se_diversity(gen=mols,
                                                              k=se_k,
                                                              n_jobs=self.pool,
                                                              normalize=True)
        metrics['ScaffDiv'] = internal_diversity(gen=scaff_mols,
                                                 n_jobs=self.pool,
                                                 device=self.device,
                                                 fp_type='morgan')
        metrics['Scaff uniqueness'] = len(scaff_gen) / len(gen)
        # Calculate % pass filters
        metrics['Filters'] = fraction_passes_filters(mols, self.pool)

        # Calculate FCD against every precomputed reference that is set
        pgen = FCDMetric(**self.kwargs_fcd).precalc(gen)
        fcd_references = (
            ('FCD_train', self.ptrain),
            ('FCD_test', self.ptest),
            ('FCD_testSF', self.ptest_scaffolds),
            ('FCD_target', self.ptarget),
        )
        for key, pref in fcd_references:
            if pref:
                metrics[key] = FCDMetric(**self.kwargs_fcd)(pgen=pgen,
                                                            pref=pref)

        # Test, test-scaffold and target metrics share one recipe; only the
        # key suffix and the precomputed reference statistics differ.
        reference_sets = (
            ('test', self.test_int),
            ('testSF', self.test_scaffolds_int),
            ('target', self.target_int),
        )
        for suffix, ref in reference_sets:
            if ref is None:
                continue
            metrics[f'SNN_{suffix}'] = SNNMetric(**self.kwargs)(
                pgen={'fps': mol_fps}, pref=ref['SNN'])
            metrics[f'Frag_{suffix}'] = FragMetric(**self.kwargs)(
                gen=mols, pref=ref['Frag'])
            metrics[f'Scaf_{suffix}'] = ScafMetric(**self.kwargs)(
                pgen={'scaf': scaffs}, pref=ref['Scaf'])

        return metrics