def test_to_smarts():
    """Check `dm.to_smarts` on a fixed molecule for both `keep_hs` values and on `None`."""
    expected_smarts = "[CH3]-[C](=[O])-[O]-[c]1:[cH]:[cH]:[cH]:[cH]:[c]:1-[C](=[O])-[OH]"
    mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O")

    # The same SMARTS string is expected whether or not `keep_hs` is set.
    for keep_hs in (True, False):
        assert dm.to_smarts(mol, keep_hs=keep_hs) == expected_smarts

    # A missing molecule must map to None.
    assert dm.to_smarts(None) is None
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffolds with enforceable groups that must stay in the core.

    Molecules are first grouped by their generic Murcko scaffold. For each
    group, the maximum common substructure (MCS) is computed and used as the
    core of an R-group decomposition. Side chains are then trimmed from every
    molecule, except the ones matching `enforce_subs`, which are kept in full.

    Side effect: `RemoveAllConformers()` is called on the input molecules that
    go through R-group decomposition, so the inputs are modified in place.

    NOTE(hadim): consider parallelize this (if possible).

    Args:
        mols: List of all molecules.
        enforce_subs: List of substructures to enforce on the scaffold.
            NOTE(review): the annotation says `List[str]`, but each entry is
            passed directly to `GetSubstructMatch` - confirm whether callers
            are expected to pass molecule objects.
        n_atom_cuttoff: Minimum number of atoms a core should have. Groups
            whose MCS is smaller are skipped.
        additional_templates: Additional templates to use to generate
            scaffolds. Each one is passed through `dm.to_mol`.
        ignore_non_ring: Whether to ignore atoms not in the Murcko ring
            system, even if they are in the framework.
        mcs_params: Extra keyword arguments forwarded to `rdFMCS.FindMCS`.

    Returns:
        A tuple `(all_scaffolds, scf2infos, scf2groups)`:

        - all_scaffolds: set of SMILES of every scaffold found.
        - scf2infos: dict of dict keyed by generic scaffold. `["mols"]` is the
          list of molecules grouped under that scaffold. `["smarts"]` is the
          SMARTS of the MCS core and is only set when the core has at least
          `n_atom_cuttoff` atoms.
        - scf2groups: dict mapping each retained generic scaffold to its
          R-group decomposition rows (empty list if the decomposition failed).
    """
    if enforce_subs is None:
        enforce_subs = []
    if additional_templates is None:
        additional_templates = []
    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # Group molecules by their generic Murcko scaffold, allowing
    # side chains that contain cycles (might be a bad idea).
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set()

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        generic_scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            generic_scf = MurckoScaffold.MakeScaffoldGeneric(generic_scf)
        except Exception:
            # Best effort: fall back to the non-generic Murcko scaffold.
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(generic_scf)
            # Remove from the highest index down so the remaining indices stay valid.
            non_ring_idx = sorted(
                (a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()),
                reverse=True,
            )
            for idx in non_ring_idx:
                rw_scf.RemoveAtom(idx)
            # NOTE(review): with `asMols=False` this appears to yield tuples of
            # atom indices rather than SMILES, so the grouping keys differ in
            # kind from the `else` branch - confirm this is intended.
            keys = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            keys = [dm.to_smiles(generic_scf)]

        # Add template scaffolds that this molecule matches, if any.
        for tmp in additional_templates:
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(dm.to_mol(tmp))
            if generic_m.HasSubstructMatch(tmp_scf):
                keys.append(dm.to_smiles(tmp_scf))

        for key in keys:
            scf2infos[key].setdefault("mols", []).append(m)

    for scf in scf2infos:
        cluster = scf2infos[scf]["mols"]

        # FindMCS needs at least two molecules: cheat by adding the Murcko
        # scaffold as the last mol. Build a new list so the stored group in
        # `scf2infos` is left untouched.
        popout = len(cluster) < 2
        if popout:
            cluster = cluster + [MurckoScaffold.GetScaffoldForMol(cluster[0])]

        # Compute the MCS of the cluster.
        mcs = rdFMCS.FindMCS(
            cluster,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )
        mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
        mcs_mol.UpdatePropertyCache(False)
        Chem.SetHybridization(mcs_mol)

        if mcs_mol.GetNumAtoms() < n_atom_cuttoff:
            continue

        scf2infos[scf]["smarts"] = dm.to_smarts(mcs_mol)
        if popout:
            cluster = cluster[:-1]

        # Generate R-groups based on the MCS core.
        core_groups = []
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcs_mol, rg_params)
            for i, analog in enumerate(cluster):
                analog.RemoveAllConformers()
                # A negative return value means the molecule was not added.
                if rg.Add(analog) >= 0:
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            # Best effort: a failed decomposition leaves `core_groups` empty,
            # so no scaffold is produced for this cluster.
            pass

        cluster = [cluster[i] for i in success_mols]
        scf2groups[scf] = core_groups

        for mol, gp in zip(cluster, core_groups):
            core = gp["Core"]
            # Only R-groups attached to non-ring core atoms may be trimmed.
            acceptable_groups = [
                a.GetAtomMapNum()
                for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]
            rgroups = [gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp]

            if enforce_subs:
                # Keep (i.e. do not trim) any R-group matching an enforced substructure.
                rgroups = [
                    rgp
                    for rgp in rgroups
                    if not any(len(rgp.GetSubstructMatch(frag)) > 0 for frag in enforce_subs)
                ]

            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param), rgroups
                )
            except Exception:
                # Skip molecules whose side chains cannot be trimmed.
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups
def _preprocess(i, row):
    """Clean one row's SMILES and attach stereoisomer-level descriptors.

    The SMILES read from `row[smiles_column]` is converted to a cleaned
    molecule with datamol (fix, sanitize, standardize). Its stereoisomers are
    enumerated (unique, at most 20, fixed seed), and every isomer is cleaned
    again and fingerprinted twice with Morgan fingerprints (radius 2, 8192
    bits): once with chirality and once without. Only the on-bit indices of
    each fingerprint are stored.

    `smiles_column` is not a parameter; it is read from the enclosing scope.

    Args:
        i: Row index. Unused here.
        row: Mapping-like row; it is modified in place and returned.

    Returns:
        The same `row` with the keys `standard_smiles`, `smarts`, `selfies`,
        `enumerated_smiles`, `achiral_fp` and `chiral_fp` set. If a
        `ValueError` is raised anywhere in the pipeline, these keys hold
        `'dropped'` placeholders instead.
    """

    def _clean(smiles):
        # Same cleaning pipeline for the input SMILES and for every isomer.
        cleaned = dm.to_mol(smiles, ordered=True)
        cleaned = dm.fix_mol(cleaned)
        cleaned = dm.sanitize_mol(cleaned, sanifix=True, charge_neutral=False)
        return dm.standardize_mol(
            cleaned,
            disconnect_metals=False,
            normalize=True,
            reionize=True,
            uncharge=False,
            stereo=True,
        )

    # Fingerprint settings do not depend on the isomer, so build them once.
    shared_pars = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useBondTypes": True,
        "useFeatures": False,
    }
    chiral_pars = {**shared_pars, "useChirality": True}
    achiral_pars = {**shared_pars, "useChirality": False}

    try:
        mol = _clean(str(row[smiles_column]))

        opts = StereoEnumerationOptions(unique=True, maxIsomers=20, rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        enum_smiles = sorted(Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)

        fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
        smiles_list = []
        achiral_fp_lis = []
        chiral_fp_lis = []
        for smi in enum_smiles:
            # `mol` is deliberately rebound: the row-level fields below are
            # computed from the last isomer, as in the original code.
            # NOTE(review): confirm this is intended rather than the input molecule.
            mol = _clean(smi)
            chiral_fp = fingerprint_function(mol, **chiral_pars)
            achiral_fp = fingerprint_function(mol, **achiral_pars)
            smiles_list.append(dm.standardize_smiles(smi))
            achiral_fp_lis.append(list(achiral_fp.GetOnBits()))
            chiral_fp_lis.append(list(chiral_fp.GetOnBits()))

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["smarts"] = dm.to_smarts(mol)
        row["selfies"] = dm.to_selfies(mol)
        row["enumerated_smiles"] = smiles_list
        row["achiral_fp"] = achiral_fp_lis
        row["chiral_fp"] = chiral_fp_lis
        return row
    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["smarts"] = 'dropped'
        row["selfies"] = 'dropped'
        # NOTE(review): list('dropped') is a list of single characters
        # (['d', 'r', 'o', ...]), not ['dropped']. Kept as-is because
        # downstream code may compare against this exact value - confirm.
        row["enumerated_smiles"] = list('dropped')
        row["achiral_fp"] = list('dropped')
        row["chiral_fp"] = list('dropped')
        return row