def remove_invalid(gen, canonize=True, n_jobs=1):
    """
    Drops the entries of gen that do not yield a molecule.

    Parameters:
        gen: collection of molecules to filter; it is traversed twice
            when canonize is False, so it must be re-iterable
        canonize: if True, return the non-None results of canonic_smiles;
            if False, return the original entries whose get_mol result
            is not None
        n_jobs: forwarded to mapper
    Returns:
        list with the retained entries
    """
    run = mapper(n_jobs)
    if canonize:
        return [smi for smi in run(canonic_smiles, gen) if smi is not None]
    parsed = run(get_mol, gen)
    return [entry for entry, mol in zip(gen, parsed) if mol is not None]
def compute_intermediate_statistics(smiles, n_jobs=1, device='cpu',
                                    batch_size=512, pool=None):
    """
    Precomputes the reference statistics of the SNN, Frag and Scaf metrics.

    It is useful to compute the statistics for test and scaffold test sets
    to speedup metrics calculation.

    Parameters:
        smiles: molecules to precompute the statistics for
        n_jobs: number of workers; only consulted when pool is None
        device: forwarded to the metric constructors
        batch_size: forwarded to the metric constructors
        pool: value handed to mapper and to the metrics as n_jobs. When
            None, a Pool(n_jobs) is created here (and terminated before
            returning) if n_jobs != 1, otherwise 1 is used.
    Returns:
        dict with keys 'SNN', 'Frag' and 'Scaf', each holding the
        corresponding metric's precalc() result
    """
    owns_pool = False
    if pool is None:
        if n_jobs != 1:
            pool = Pool(n_jobs)
            owns_pool = True
        else:
            pool = 1
    try:
        mols = mapper(pool)(get_mol, smiles)
        kwargs = {'n_jobs': pool, 'device': device, 'batch_size': batch_size}
        # NOTE: FCD and the logP / SA / QED / weight Wasserstein statistics
        # are currently disabled and therefore not part of the result.
        statistics = {}
        statistics['SNN'] = SNNMetric(**kwargs).precalc(mols)
        statistics['Frag'] = FragMetric(**kwargs).precalc(mols)
        statistics['Scaf'] = ScafMetric(**kwargs).precalc(mols)
    finally:
        # Only tear down a pool created here, and do it even when one of
        # the precalc calls raises so worker processes are not leaked.
        if owns_pool:
            pool.terminate()
    return statistics
def novelty(gen, train, n_jobs=1):
    """
    Computes the fraction of distinct generated molecules absent from train.

    Parameters:
        gen: non-empty sequence of generated molecules; if the first
            element is an RDKit Mol, every element is converted with
            canonic_smiles, otherwise the elements are used as they are
        train: collection of training molecules to compare against
        n_jobs: forwarded to mapper
    Returns:
        len(distinct gen not in train) / len(distinct gen), where None
        entries are not counted
    """
    if isinstance(gen[0], rdkit.Chem.rdchem.Mol):
        candidates = set(mapper(n_jobs)(canonic_smiles, gen))
    else:
        candidates = set(gen)
    # None marks a molecule that could not be converted
    candidates.discard(None)
    unseen = candidates - set(train)
    return len(unseen) / len(candidates)
def fraction_valid(gen, n_jobs=1):
    """
    Computes the fraction of valid molecules

    Parameters:
        gen: list of SMILES
        n_jobs: number of threads for calculation
    Returns:
        1 - (number of entries for which get_mol returns None) / len(gen)
    """
    parsed = mapper(n_jobs)(get_mol, gen)
    n_invalid = parsed.count(None)
    return 1 - n_invalid / len(parsed)
def compute_fragments(mol_list, n_jobs=1):
    """
    fragment list of mols using BRICS and return smiles list

    Parameters:
        mol_list: molecules handed one by one to fragmenter
        n_jobs: forwarded to mapper
    Returns:
        Counter accumulated from the per-molecule fragmenter results,
        without a None key
    """
    totals = Counter()
    for per_mol in mapper(n_jobs)(fragmenter, mol_list):
        totals.update(per_mol)
    # drop the None bucket if one was collected
    totals.pop(None, None)
    return totals
def fraction_passes_filters(gen, n_jobs=1):
    """
    Computes the fraction of molecules that pass filters:
    * MCF
    * PAINS
    * Only allowed atoms ('C','N','S','O','F','Cl','Br','H')
    * No charges

    Parameters:
        gen: molecules checked one by one with mol_passes_filters
        n_jobs: forwarded to mapper
    Returns:
        mean of the per-molecule mol_passes_filters results
    """
    verdicts = mapper(n_jobs)(mol_passes_filters, gen)
    return np.mean(verdicts)
def compute_scaffolds(mol_list, n_jobs=1, min_rings=2):
    """
    Extracts a scaffold from a molecule in a form of a canonic SMILES

    Parameters:
        mol_list: molecules handed one by one to compute_scaffold
        n_jobs: forwarded to mapper
        min_rings: forwarded to compute_scaffold
    Returns:
        Counter of the compute_scaffold results, without a None key
    """
    scaffold_of = partial(compute_scaffold, min_rings=min_rings)
    counts = Counter(mapper(n_jobs)(scaffold_of, mol_list))
    # None results are not counted as a scaffold
    counts.pop(None, None)
    return counts
def fingerprints(smiles_mols_array, n_jobs=1, already_unique=False, *args,
                 **kwargs):
    '''
    Computes fingerprints of smiles np.array/list/pd.Series with n_jobs workers
    e.g.fingerprints(smiles_mols_array, fp_type='morgan', n_jobs=10)
    Inserts np.nan to rows corresponding to incorrect smiles.
    IMPORTANT: if there is at least one np.nan, the dtype would be float
    Parameters:
        smiles_mols_array: list/array/pd.Series of smiles or already
            computed RDKit molecules
        n_jobs: number of parallel workers to execute
        already_unique: flag for performance reasons, if smiles array is
            big and already unique. Its value is set to True if
            smiles_mols_array contain RDKit molecules already.
        *args, **kwargs: forwarded to fingerprint
    Returns:
        Stacked fingerprints, one row per input entry (CSR matrix if the
        first valid fingerprint is sparse, np.ndarray otherwise). If no
        entry yields a fingerprint, every row is a single np.nan.
    '''
    if isinstance(smiles_mols_array, pd.Series):
        smiles_mols_array = smiles_mols_array.values
    else:
        smiles_mols_array = np.asarray(smiles_mols_array)
    if not isinstance(smiles_mols_array[0], str):
        already_unique = True

    if not already_unique:
        # Fingerprint each distinct entry once; inv_index maps the results
        # back onto the original order at the end.
        smiles_mols_array, inv_index = np.unique(smiles_mols_array,
                                                 return_inverse=True)

    fps = mapper(n_jobs)(
        partial(fingerprint, *args, **kwargs), smiles_mols_array
    )

    # The first valid fingerprint fixes the row width and the sparse/dense
    # choice. It stays None when nothing is valid, so the code below never
    # touches an unbound name.
    first_fp = next((fp for fp in fps if fp is not None), None)
    length = 1 if first_fp is None else first_fp.shape[-1]

    # np.nan rather than np.NaN: the capitalised alias no longer exists in
    # NumPy 2.0.
    fps = [
        fp if fp is not None else np.full((1, length), np.nan)
        for fp in fps
    ]
    if scipy.sparse.issparse(first_fp):
        fps = scipy.sparse.vstack(fps).tocsr()
    else:
        fps = np.vstack(fps)
    if not already_unique:
        return fps[inv_index]
    return fps
def fraction_unique(gen, k=None, n_jobs=1, check_validity=True):
    """
    Computes the fraction of unique molecules

    Parameters:
        gen: list of SMILES
        k: compute unique@k; only the first k entries of gen are used.
            A warning is issued when gen holds fewer than k entries.
        n_jobs: number of threads for calculation
        check_validity: raises ValueError if invalid molecules are present
    Returns:
        number of distinct canonic_smiles results divided by the number
        of molecules considered
    Raises:
        ValueError: if check_validity is set and canonic_smiles returns
            None for any considered molecule
    """
    if k is not None:
        if len(gen) < k:
            # The two sentences used to be joined without a separating
            # space ("unique@5.gen contains ...").
            warnings.warn(
                "Can't compute unique@{}. "
                "gen contains only {} molecules".format(k, len(gen))
            )
        gen = gen[:k]
    canonic = set(mapper(n_jobs)(canonic_smiles, gen))
    if None in canonic and check_validity:
        raise ValueError("Invalid molecule passed to unique@k")
    return len(canonic) / len(gen)
def numpy_fps_to_bitvectors(fps, n_jobs=1):
    """
    Converts each fingerprint with numpy_fp_to_bitvector and deduplicates.

    Parameters:
        fps: fingerprints handed one by one to numpy_fp_to_bitvector
        n_jobs: forwarded to mapper
    Returns:
        list of the distinct conversion results, None excluded
    """
    convert = mapper(n_jobs)
    distinct = set(convert(numpy_fp_to_bitvector, fps))
    return list(distinct - {None})
def precalc(self, mols):
    """
    Wraps the per-molecule values of mols in a dict.

    Parameters:
        mols: molecules to evaluate
    Returns:
        {'values': ...} holding self.func mapped over mols with
        self.n_jobs, or mols itself when self.func is None
    """
    if self.func is None:
        return {'values': mols}
    return {'values': mapper(self.n_jobs)(self.func, mols)}
def calculate(self, gen, calc_valid=False, calc_unique=False, unique_k=None, se_k=None):
    """
    Computes the metrics dictionary for a set of generated molecules.

    The input is first reduced to its valid entries (remove_invalid with
    canonize=True) and then to its distinct entries; every metric after
    '# valid' / the uniqueness block is computed on that distinct subset.

    Parameters:
        gen: generated molecules
        calc_valid: if True, add 'Validity' (computed on the raw input)
        calc_unique: if True, add 'Uniqueness' and, when unique_k is
            given, 'Unique@<unique_k/1000>k'
        unique_k: k forwarded to fraction_unique for the unique@k entry
        se_k: if not None, add 'SEDiv@<se_k/1000>k' computed with
            k=se_k and normalize=True
    Returns:
        dict mapping metric name to value. Always present: '#', '# valid',
        '# valid & unique', 'IntDiv1', 'IntDiv2', 'SEDiv', 'ScaffDiv',
        'Scaff uniqueness', 'Filters'. The 'Novelty', 'FCD_*', '*_test',
        '*_testSF' and '*_target' entries depend on which reference data
        is set on self.
    """
    metrics = {}
    metrics['#'] = len(gen)

    # Calculate validity
    if calc_valid:
        metrics['Validity'] = fraction_valid(gen, self.pool)

    gen = remove_invalid(gen, canonize=True)
    # NOTE(review): this mols is reassigned below before it is read, so
    # this call looks redundant -- confirm before removing.
    mols = mapper(self.pool)(get_mol, gen)
    metrics['# valid'] = len(gen)

    # Calculate Uniqueness (on the valid molecules, duplicates included)
    if calc_unique:
        metrics['Uniqueness'] = fraction_unique(gen=gen, k=None, n_jobs=self.pool)
        if unique_k is not None:
            metrics[f'Unique@{unique_k/1000:.0f}k'] = fraction_unique(
                gen=gen, k=unique_k, n_jobs=self.pool)

    # Now subset only unique molecules
    gen = list(set(gen))
    mols = mapper(self.pool)(get_mol, gen)

    # Precalculate some things shared by several metrics below
    mol_fps = fingerprints(mols, self.pool, already_unique=True, fp_type='morgan')
    # NOTE(review): self.n_jobs is used here while every other call
    # passes self.pool -- confirm this is intended.
    scaffs = compute_scaffolds(mols, n_jobs=self.n_jobs)
    scaff_gen = list(scaffs.keys())
    scaff_mols = mapper(self.pool)(get_mol, scaff_gen)
    metrics['# valid & unique'] = len(gen)

    # Calculate diversity related metrics
    if self.train is not None:
        metrics['Novelty'] = novelty(gen, self.train, self.pool)
    metrics['IntDiv1'] = internal_diversity(gen=mol_fps, n_jobs=self.pool, device=self.device)
    metrics['IntDiv2'] = internal_diversity(gen=mol_fps, n_jobs=self.pool, device=self.device, p=2)
    metrics['SEDiv'] = se_diversity(gen=mols, n_jobs=self.pool)
    if se_k is not None:
        metrics[f'SEDiv@{se_k/1000:.0f}k'] = se_diversity(gen=mols, k=se_k, n_jobs=self.pool, normalize=True)
    metrics['ScaffDiv'] = internal_diversity(gen=scaff_mols, n_jobs=self.pool, device=self.device, fp_type='morgan')
    # Distinct scaffolds per distinct molecule; divides by len(gen), so an
    # empty gen reaching this line raises ZeroDivisionError.
    metrics['Scaff uniqueness'] = len(scaff_gen) / len(gen)

    # Calculate % pass filters
    metrics['Filters'] = fraction_passes_filters(mols, self.pool)

    # Calculate FCD: precalc the generated set once, then compare it with
    # each reference that is set.
    # NOTE(review): these are truthiness checks, unlike the
    # `is not None` checks used for the *_int references below.
    pgen = FCDMetric(**self.kwargs_fcd).precalc(gen)
    if self.ptrain:
        metrics['FCD_train'] = FCDMetric(**self.kwargs_fcd)(
            pgen=pgen, pref=self.ptrain)
    if self.ptest:
        metrics['FCD_test'] = FCDMetric(**self.kwargs_fcd)(pgen=pgen, pref=self.ptest)
    if self.ptest_scaffolds:
        metrics['FCD_testSF'] = FCDMetric(**self.kwargs_fcd)(
            pgen=pgen, pref=self.ptest_scaffolds)
    if self.ptarget:
        metrics['FCD_target'] = FCDMetric(**self.kwargs_fcd)(
            pgen=pgen, pref=self.ptarget)

    # Test metrics
    if self.test_int is not None:
        metrics['SNN_test'] = SNNMetric(**self.kwargs)(
            pgen={
                'fps': mol_fps
            }, pref=self.test_int['SNN'])
        metrics['Frag_test'] = FragMetric(**self.kwargs)(
            gen=mols, pref=self.test_int['Frag'])
        metrics['Scaf_test'] = ScafMetric(**self.kwargs)(
            pgen={
                'scaf': scaffs
            }, pref=self.test_int['Scaf'])

    # Test scaff metrics
    if self.test_scaffolds_int is not None:
        metrics['SNN_testSF'] = SNNMetric(**self.kwargs)(
            pgen={
                'fps': mol_fps
            }, pref=self.test_scaffolds_int['SNN'])
        metrics['Frag_testSF'] = FragMetric(**self.kwargs)(
            gen=mols, pref=self.test_scaffolds_int['Frag'])
        metrics['Scaf_testSF'] = ScafMetric(**self.kwargs)(
            pgen={
                'scaf': scaffs
            }, pref=self.test_scaffolds_int['Scaf'])

    # Target metrics
    if self.target_int is not None:
        metrics['SNN_target'] = SNNMetric(**self.kwargs)(
            pgen={
                'fps': mol_fps
            }, pref=self.target_int['SNN'])
        metrics['Frag_target'] = FragMetric(**self.kwargs)(
            gen=mols, pref=self.target_int['Frag'])
        metrics['Scaf_target'] = ScafMetric(**self.kwargs)(
            pgen={
                'scaf': scaffs
            }, pref=self.target_int['Scaf'])

    return metrics