def RetrieveRGroups(Mols):
    """Run R group decomposition on molecules and collect the R groups.

    The core scaffolds and decomposition parameters are obtained from
    SetupCoreScaffolds and SetupRGroupDecompositionParams. Each molecule is
    added to the decomposition in input order.

    Returns a tuple containing the R groups retrieved column-wise as SMILES
    and the list of indices of molecules for which Add() returned a negative
    status.
    """

    Cores = SetupCoreScaffolds(Mols)
    Params = SetupRGroupDecompositionParams()
    Decomposer = rgd.RGroupDecomposition(Cores, Params)

    MiscUtil.PrintInfo("\nPerforming R group decomposition...")

    # A negative status from Add() marks the molecule as unmatched.
    UnmatchedMolIndices = [
        Index for Index, Mol in enumerate(Mols) if Decomposer.Add(Mol) < 0
    ]

    Processed = Decomposer.Process()
    if not Processed:
        MiscUtil.PrintWarning(
            "R group decomposition failed to match any molecule to core scaffold(s)..."
        )

    return (Decomposer.GetRGroupsAsColumns(asSmiles=True), UnmatchedMolIndices)
Example #2
0
def FWDecompose(
        scaffolds,
        mols,
        scores,
        decomp_params=default_decomp_params) -> FreeWilsonDecomposition:
    """
    Perform a Free-Wilson analysis.

    The molecules are decomposed into R groups around the scaffold(s), every
    distinct R group SMILES becomes one column of a 0/1 descriptor matrix, and
    a Ridge regression is fitted from those descriptors to the scores of the
    molecules that matched a scaffold.

        : param scaffolds : scaffold or list of scaffolds to use for the rgroup decomposition
        : param mols : molecules to decompose (must support len())
        : param scores : list of floating point numbers for the regression,
                         one per molecule (you may need to convert these to
                         their logs in some cases)
        : param decomp_params : RGroupDecompositionParameters, default [
                                    default_decomp_params = rdkit.Chem.rdRGroupDecomposition.RGroupDecompositionParameters()
                                    default_decomp_params.matchingStrategy = rgd.GA
                                    default_decomp_params.onlyMatchAtRGroups = False
                                ]
                                If you only want to decompose on specific group locations
                                set onlyMatchAtRGroups to True
        : return : a FreeWilsonDecomposition built from the R groups, the
                   SMILES -> descriptor index map, the fitted model, its R2
                   on the training data and the descriptors; None (after
                   logging an error) when no molecule matched a scaffold
        : raises ValueError : if len(mols) != len(scores)
        : raises AssertionError : if the number of decomposition rows differs
                                  from the number of matched scores

        >>> from rdkit import Chem
        >>> from freewilson import FWBuild, FWDecompose
        >>> from rdkit.Chem import Descriptors
        >>> scaffold = Chem.MolFromSmiles("c1cccnc1")
        >>> mols = [Chem.MolFromSmiles("c1cccnc1"+"C"*(i+1)) for i in range(100)]
        >>> scores = [Descriptors.MolLogP(m) for m in mols]
        >>> fw = FWDecompose(scaffold, mols, scores)
        >>> for pred in FWBuild(fw):
        ...   print(pred)

    For an easy way to report predictions see

        >>> import sys
        >>> predictions_to_csv(sys.stdout, fw, FWBuild(fw))

    See FWBuild docs to see how to filter predictions, molecular weight or molecular properties.
    """
    if len(mols) != len(scores):
        raise ValueError(
            f"The number of molecules must match the number of scores #mols {len(mols)} #scores {len(scores)}"
        )

    # decompose the rgroups
    logger.info(f"Decomposing {len(mols)} molecules...")
    decomposer = rgd.RGroupDecomposition(scaffolds, decomp_params)
    matched_scores = []  # scores from the matching molecules
    # zip() has no length, so pass the total explicitly for the progress bar
    for mol, score in tqdm(zip(mols, scores), total=len(mols)):
        if decomposer.Add(mol) >= 0:
            matched_scores.append(score)
    decomposer.Process()
    logger.info(f"Matched {len(matched_scores)} out of {len(mols)}")
    if not matched_scores:
        logger.error("No scaffolds matched the input molecules")
        return None

    decomposition = decomposer.GetRGroupsAsRows(asSmiles=True)

    logger.info("Get unique rgroups...")
    rgroup_idx = {}  # rgroup index into descriptor { smiles: idx }
    rgroups = defaultdict(list)  # final list of rgroups/sidechains per label
    rgroup_counts = defaultdict(int)  # number of rows each smiles occurs in
    for row in decomposition:
        for label, smiles in row.items():
            rgroup_counts[smiles] += 1
            if smiles not in rgroup_idx:
                rgroup_idx[smiles] = len(rgroup_idx)
                # count and coefficient are filled in after the regression
                rgroups[label].append(RGroup(smiles, label, 0, 0))

    n_features = len(rgroup_idx)
    logger.info(f"Descriptor size {n_features}")

    # get the descriptors list, one-hot encoding per rgroup:
    # descriptors are 1/0 if a sidechain is present, one row per matched molecule
    descriptors = []
    for row in decomposition:
        descriptor = [0] * n_features
        # every smiles in the decomposition was indexed in the loop above
        for smiles in row.values():
            descriptor[rgroup_idx[smiles]] = 1
        descriptors.append(descriptor)

    # Explicit check (instead of an assert statement) so that it still runs
    # under `python -O`; the exception type is unchanged.
    if len(descriptors) != len(matched_scores):
        raise AssertionError(
            f"Number of descriptors({len(descriptors)}) doesn't match number of matched scores({len(matched_scores)})"
        )

    # Perform the Ridge Regression
    logger.info("Ridge Regressing...")
    lm = Ridge()
    lm.fit(descriptors, matched_scores)
    preds = lm.predict(descriptors)
    r2 = r2_score(matched_scores, preds)
    logger.info(f"R2 {r2}")
    logger.info(f"Intercept = {lm.intercept_:.2f}")

    # attach occurrence count, regression coefficient and column index
    for sidechains in rgroups.values():
        for rgroup in sidechains:
            idx = rgroup_idx[rgroup.smiles]
            rgroup.count = rgroup_counts[rgroup.smiles]
            rgroup.coefficient = lm.coef_[idx]
            rgroup.idx = idx

    return FreeWilsonDecomposition(rgroups, rgroup_idx, lm, r2, descriptors)
Example #3
0
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffolds with enforceable groups that need to appear
    in the core, forcing to keep the full side chain if required.

    Molecules are first grouped by their generic Murcko scaffold. For each
    group an MCS is computed and used as the core of an R-group
    decomposition; side chains attached to non-ring core atoms are then
    trimmed from each molecule to produce its scaffold.

    NOTE(hadim): consider parallelizing this (if possible).

    NOTE: conformers are removed in place from the input molecules
    (`RemoveAllConformers` is called on each molecule passed to the R-group
    decomposition).

    Args:
        mols: List of all molecules.
        enforce_subs: List of substructures to enforce on the scaffold.
            R-groups matching any of them are not trimmed.
            NOTE(review): annotated as a list of str but each entry is passed
            directly as a substructure query — confirm whether callers pass
            molecule objects.
        n_atom_cuttoff: Minimum number of atoms a core should have. Groups
            whose MCS is smaller are skipped.
        additional_templates: Additional templates to use to generate scaffolds.
        ignore_non_ring: Whether to ignore atoms not in the Murcko ring system,
            even if they are in the framework.
        mcs_params: Extra keyword arguments forwarded to `rdFMCS.FindMCS`.

    Returns:
        scaffolds: set
            All found scaffolds in the molecules as smiles.
        scaffold_infos: dict of dict
            Infos on the scaffold mapping, ignoring any side chain that had to be enforced.
            Key corresponds to generic scaffold smiles.
            Values at ['smarts'] correspond to the smarts representation of the
            true scaffold (from MCS); only set when the MCS has at least
            `n_atom_cuttoff` atoms.
            Values at ['mols'] correspond to the list of molecules grouped
            under the scaffold.
        scaffold_to_group: dict of list
            Map between each generic scaffold and the R-groups decomposition rows.
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by their generic Murcko scaffold, allowing
    # side chains that contain cycles (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set()

    # The generic scaffold of a template does not depend on the molecule
    # being processed, so compute it once instead of once per molecule.
    template_scaffolds = [
        MurckoScaffold.MakeScaffoldGeneric(dm.to_mol(tmp))
        for tmp in additional_templates
    ]

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except Exception:
            # best effort: fall back to the non-generic Murcko scaffold
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(scf)
            # remove from the highest index down so that the remaining
            # indices stay valid while atoms are deleted
            non_ring_atoms = sorted(
                (a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()),
                reverse=True,
            )
            for atom_idx in non_ring_atoms:
                rw_scf.RemoveAtom(atom_idx)
            # NOTE(review): with asMols=False the fragments are presumably
            # tuples of atom indices, not smiles, unlike the keys produced by
            # the other branch — confirm this is intended.
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add template scaffolds found in the (generic) molecule, if any
        for tmp_scf in template_scaffolds:
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for scf in scfs:
            scf2infos[scf].setdefault("mols", []).append(m)

    for scf, infos in scf2infos.items():
        cluster_mols = infos["mols"]

        # cheat by adding murcko as last mol always: FindMCS is given at
        # least two molecules, the extra one is dropped again below
        popout = len(cluster_mols) < 2
        if popout:
            cluster_mols = cluster_mols + [
                MurckoScaffold.GetScaffoldForMol(cluster_mols[0])
            ]

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            cluster_mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        infos["smarts"] = dm.to_smarts(mcsM)
        if popout:
            cluster_mols = cluster_mols[:-1]

        # generate rgroups based on the mcs core
        core_groups = []
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(cluster_mols):
                analog.RemoveAllConformers()
                if rg.Add(analog) >= 0:
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            # best effort: a failed decomposition leaves this scaffold
            # without R-group rows
            pass

        matched_mols = [cluster_mols[i] for i in success_mols]
        scf2groups[scf] = core_groups
        for mol, gp in zip(matched_mols, core_groups):
            core = gp["Core"]
            # only R-groups attached to non-ring core atoms may be trimmed
            acceptable_groups = [
                a.GetAtomMapNum() for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]

            rgroups = [
                gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp
            ]
            if enforce_subs:
                # keep (do not trim) any R-group matching an enforced substructure
                rgroups = [
                    rgp for rgp in rgroups
                    if not any(
                        rgp.HasSubstructMatch(frag) for frag in enforce_subs)
                ]
            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param),
                    rgroups)
            except Exception:
                # skip molecules whose side chains cannot be trimmed
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups