Ejemplo n.º 1
0
def test_sanitize():
    """Exercise `dm.sanitize_mol` / `dm.sanitize_smiles` on valid, fixable and broken inputs."""
    aspirin = "CC(=O)Oc1ccccc1C(=O)O"

    # An already valid molecule must come out of sanitization unchanged.
    unsanitized = dm.to_mol(aspirin, sanitize=False)
    sanitized = dm.sanitize_mol(unsanitized, charge_neutral=True)
    assert dm.to_smiles(sanitized) == "CC(=O)Oc1ccccc1C(=O)O"

    # `None` in, `None` out.
    assert dm.sanitize_mol(None, charge_neutral=True) is None

    # The trailing labels are the original author's expectation for each entry.
    candidates = (
        "CC.[H][N:1]1(C)=CC(O)=CC2CCCCC12",  # broken
        "O=c1ccc2ccccc2n1",  # sanitize
        "Cc1nnnn1C",  # none
        "CCc1ccc2nc(=O)c(cc2c1)Cc1nnnn1C1CCCCC1",  # sanitize
        "c1cnc2cc3ccnc3cc12",  # none
        "c1cc2cc3ccnc3cc2n1",  # none
        "O=c1ccnc(c1)-c1cnc2cc3ccnc3cc12",  # sanitize
        "O=c1ccnc(c1)-c1cc1",  # broken
    )
    broken, fixable, valid = candidates[0], candidates[1], candidates[2]

    # Plain parsing: the fixable entry is rejected, the valid one is accepted.
    assert dm.to_mol(fixable) is None
    assert dm.to_mol(valid) is not None

    # sanitize_mol: None passes through, the broken entry stays unusable,
    # the fixable and valid entries both yield a molecule.
    assert dm.sanitize_mol(None) is None
    assert dm.sanitize_mol(dm.to_mol(broken, sanitize=False)) is None
    assert dm.sanitize_mol(dm.to_mol(fixable, sanitize=False)) is not None
    assert dm.sanitize_mol(dm.to_mol(valid, sanitize=False)) is not None

    # The repaired molecule matches the explicit [nH] form.
    repaired = dm.sanitize_mol(dm.to_mol(fixable, sanitize=False))
    assert dm.to_smiles(repaired) == dm.sanitize_smiles("O=c1ccc2ccccc2[nH]1")

    # Across the whole list, exactly six entries survive sanitize_smiles.
    outcomes = [dm.sanitize_smiles(entry) for entry in candidates]
    assert sum(1 for outcome in outcomes if outcome is not None) == 6
Ejemplo n.º 2
0
def test_to_image():
    """Render molecules with `dm.viz.to_image` and check the resulting pixel arrays."""

    # Build molecules from the FreeSolv dataframe and keep the first eight.
    freesolv_df = dm.data.freesolv()
    molecules = dm.from_df(freesolv_df)  # type: ignore
    molecules = molecules[:8]

    # Grid rendering: 8 molecules on 4 columns, each cell 200x200 pixels.
    grid_legends = [dm.to_smiles(m) for m in molecules]
    grid = dm.viz.to_image(
        molecules,
        legends=grid_legends,
        n_cols=4,
        mol_size=(200, 200),
    )
    grid_pixels = np.array(grid)

    assert grid_pixels.dtype == np.uint8
    assert grid_pixels.shape == (400, 800, 3)
    assert grid_pixels.shape[1] == 200 * 4

    # Single-molecule rendering, with the legend given as a plain string.
    first = molecules[0]
    single_legend = dm.to_smiles(first)
    single = dm.viz.to_image(first, legends=single_legend, mol_size=(200, 200))
    single_pixels = np.array(single)

    assert single_pixels.dtype == np.uint8
    assert single_pixels.shape == (200, 200, 3)

    # Smoke test only: atom indices shown and an integer `mol_size`.
    dm.viz.to_image(first, indices=True, mol_size=400)
Ejemplo n.º 3
0
def test_to_smiles_fail():
    """Check how `dm.to_smiles` reacts to a non-molecule input (the integer 55)."""
    # With `allow_to_fail=False` the test expects `None` rather than an exception.
    smiles = dm.to_smiles(55, allow_to_fail=False)
    assert smiles is None

    # With `allow_to_fail=True` the underlying error must propagate.
    # NOTE(hadim): ideally you want to catch only `Boost.Python.ArgumentError` here.
    with pytest.raises(Exception):
        dm.to_smiles(55, allow_to_fail=True)
Ejemplo n.º 4
0
def test_standardize_mol():
    """Compare `dm.standardize_smiles` with `dm.standardize_mol` on a sodium-containing input."""
    raw_smiles = "[Na]OC1=CC2CCCCC2N=C1"

    # NOTE(review): `Chem.MolToSmiles` returns a string, and presumably so does
    # `dm.standardize_smiles`; both strings are then passed to `dm.to_smiles`.
    # Confirm this is intended and that the comparison below is not vacuous.
    via_smiles = dm.to_smiles(dm.standardize_smiles(raw_smiles))

    standardized = dm.standardize_mol(
        dm.to_mol(raw_smiles),
        disconnect_metals=True,
        uncharge=True,
    )
    via_mol = dm.to_smiles(Chem.MolToSmiles(standardized))

    assert via_smiles == via_mol
Ejemplo n.º 5
0
def test_to_sdf_mols(datadir, tmp_path):
    """Round-trip a list of molecules through `dm.to_sdf` and `dm.read_sdf`."""
    source_sdf = datadir / "TUBB3-observations.sdf.gz"
    original_mols = dm.read_sdf(source_sdf, as_df=False)

    # Write to a fresh SDF file, then load it back.
    written_sdf = tmp_path / "mols.sdf"
    dm.to_sdf(original_mols, written_sdf)
    reloaded_mols = dm.read_sdf(written_sdf, as_df=False)

    # The SMILES must be identical, in the same order.
    expected = [dm.to_smiles(m) for m in original_mols]
    actual = [dm.to_smiles(m) for m in reloaded_mols]
    assert expected == actual
Ejemplo n.º 6
0
def test_to_sdf_single_mol(tmp_path):
    """Round-trip a single molecule (not wrapped in a list) through an SDF file."""
    target = tmp_path / "test.sdf"

    original = dm.to_mol(
        "CC1(C2C(C3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O)O"
    )
    dm.to_sdf(original, target)

    # Reading back gives a sequence; its first entry must match what was written.
    reloaded = dm.read_sdf(target)
    assert dm.to_smiles(original) == dm.to_smiles(reloaded[0])
Ejemplo n.º 7
0
def mmpa_cut(mol: Chem.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Fragment a molecule into `smiles,core,chains` records for a later MMPA analysis.

    Args:
        mol: Molecule to fragment. `None` is returned unchanged.
        rdkit_pattern: When True, `mmpa_frag` is called without an explicit
            pattern (i.e. with its own default) and `max_cut=3`. Otherwise the
            heavy-atom pattern `[!#1]!@!=!#[!#1]` is used with `max_cut=4` and
            `max_cut=3`, and both results are merged.

    Returns:
        A set of newline-terminated strings of the form 'smiles,core,chains',
        or `None` when `mol` is `None`.
    """

    if mol is None:
        return None

    # Heavy-atom cuts, reported against the SMILES of the input molecule.
    parent_smiles = dm.to_smiles(mol)
    if rdkit_pattern:
        heavy_frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        heavy_pattern = "[!#1]!@!=!#[!#1]"
        heavy_frags = mmpa_frag(mol,
                                pattern=heavy_pattern,
                                max_cut=4,
                                max_bond_cut=30)
        heavy_frags.update(
            mmpa_frag(mol,
                      pattern=heavy_pattern,
                      max_cut=3,
                      max_bond_cut=30))

    records = {
        f"{parent_smiles},{core},{chains}\n"
        for core, chains in set(heavy_frags)
    }

    # Hydrogen splitting, reported against the SMILES of the H-added molecule.
    mol_with_hs = Chem.AddHs(mol)
    smiles_with_hs = dm.to_smiles(mol_with_hs)

    # Only attempted for molecules with fewer than 60 heavy atoms.
    if mol_with_hs.GetNumHeavyAtoms() < 60:
        hydrogen_frags = mmpa_frag(mol_with_hs,
                                   pattern=None,
                                   max_cut=1,
                                   max_bond_cut=100,
                                   h_split=True)
        records.update(f"{smiles_with_hs},{core},{chains}\n"
                       for core, chains in hydrogen_frags)

    return records
Ejemplo n.º 8
0
def all_transform_apply(
    mol,
    rxns,
    max_num_action=float("Inf"),
    asMols=True,
    **kwargs,
):
    """
    Apply each single-reactant reaction of a set to the input molecule.

    Every reaction must take exactly one reactant. For each reaction outcome
    only the first product is kept; it is sanitized and de-duplicated by SMILES.

    Arguments
    ----------
        mol: <Chem.Mol>
            Input molecule
        rxns: list
            list of reactions / reaction smarts (strings are converted with
            `AllChem.ReactionFromSmarts`)
        max_num_action: int, optional
            Maximum number of results to return
            (Default: inf)
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        **kwargs:
            Accepted for signature compatibility; not used here.

    Returns
    -------
        Products obtained from applying the chemical reactions
    """

    product_smiles = set()
    with dm.without_rdkit_log():
        for rxn in rxns:
            if len(product_smiles) >= max_num_action:
                break
            if isinstance(rxn, str):
                rxn = AllChem.ReactionFromSmarts(rxn)
            try:
                pcdts = [products[0] for products in rxn.RunReactants([mol])]
                pcdts = [dm.sanitize_mol(x) for x in pcdts]
                product_smiles.update([dm.to_smiles(x) for x in pcdts if x])
            except Exception:
                # Best effort: a reaction that cannot be applied to this
                # molecule is skipped. Unlike a bare `except`, this lets
                # KeyboardInterrupt / SystemExit propagate.
                continue

    results = [x for x in product_smiles if x is not None]
    if np.isfinite(max_num_action):
        # A finite limit may arrive as a float (e.g. 5.0); slice bounds must be ints.
        results = results[:int(max_num_action)]

    results = [dm.to_mol(x) for x in results]
    if not asMols:
        results = [dm.to_smiles(x) for x in results if x is not None]
    return results
Ejemplo n.º 9
0
def smiles_to_fingerprint(smiles):
    """Return datamol's SMILES for the input and the on-bits of its Morgan fingerprint.

    Args:
        smiles: Input SMILES; it is converted with `str()` before parsing.

    Returns:
        A tuple `(standard_smiles, achiral_fp)`. `standard_smiles` is the output
        of `dm.to_smiles`. `achiral_fp` is the list of set-bit indices of an
        8192-bit, radius-2 Morgan fingerprint computed with `useChirality=False`.
    """
    mol = dm.to_mol(str(smiles), ordered=True)

    # Chirality is ignored here, hence the "achiral" name of the result.
    morgan_kwargs = dict(
        radius=2,
        nBits=8192,
        invariants=[],
        fromAtoms=[],
        useChirality=False,
        useBondTypes=True,
        useFeatures=False,
    )
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **morgan_kwargs)

    standard_smiles = dm.to_smiles(mol)
    achiral_fp = list(bit_vector.GetOnBits())
    return standard_smiles, achiral_fp
Ejemplo n.º 10
0
def _compute_fragment_join(
    mol,
    fragment,
    mol_atom_count,
    bond_between_rings=True,
    asMols=True,
):
    """Yield every sanitizable way of attaching `fragment` to `mol`.

    Each atom of `mol` is paired with each atom of `fragment`; pairs where either
    atom has an implicit valence of 0 are skipped. Candidates come from
    `_all_atom_join` and only those accepted by `dm.sanitize_mol` are yielded.

    Args:
        mol: Molecule receiving the fragment.
        fragment: Fragment to attach (a copy is used).
        mol_atom_count: Offset added to fragment atom indices to locate them in
            the combined molecule.
        bond_between_rings: When False, pairs where both atoms are in a ring
            are skipped.
        asMols: Yield molecules when True, SMILES otherwise.
    """
    # Work on a copy of the fragment; the original author notes that
    # `copy.copy` is faster than the alternatives.
    fragment = copy.copy(fragment)

    with dm.without_rdkit_log():
        combined = Chem.CombineMols(mol, fragment)

        for mol_idx in range(mol.GetNumAtoms()):
            mol_atom = combined.GetAtomWithIdx(mol_idx)
            if mol_atom.GetImplicitValence() == 0:
                continue

            for frag_idx in range(fragment.GetNumAtoms()):
                frag_atom = combined.GetAtomWithIdx(frag_idx + mol_atom_count)
                if frag_atom.GetImplicitValence() == 0:
                    continue

                # Optionally refuse bonds between two ring atoms.
                both_in_rings = mol_atom.IsInRing() and frag_atom.IsInRing()
                if both_in_rings and not bond_between_rings:
                    continue

                for candidate in _all_atom_join(combined, mol_atom, frag_atom):
                    candidate = dm.sanitize_mol(candidate)
                    if candidate is None:
                        continue
                    yield candidate if asMols else dm.to_smiles(candidate)
Ejemplo n.º 11
0
def _preprocess(i, row):
    """Standardize the molecule of one row and attach its identifiers and fingerprint.

    Args:
        i: Row index (not used by this function).
        row: Mapping-like record; the SMILES is read from `row[smiles_column]`,
            where `smiles_column` is defined outside this function.

    Returns:
        The same `row`, with `standard_smiles`, `selfies`, `inchi`, `inchikey`
        and `onbits_fp` set.
    """
    # Parse, fix, sanitize and standardize, in that order.
    parsed = dm.to_mol(str(row[smiles_column]), ordered=True)
    fixed = dm.fix_mol(parsed)
    sanitized = dm.sanitize_mol(fixed, sanifix=True, charge_neutral=False)
    mol = dm.standardize_mol(
        sanitized,
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    # 8192-bit, radius-2 Morgan fingerprint, chirality included.
    morgan_kwargs = dict(
        radius=2,
        nBits=8192,
        invariants=[],
        fromAtoms=[],
        useChirality=True,
        useBondTypes=True,
        useFeatures=False,
    )
    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, **morgan_kwargs)

    row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
    row["selfies"] = dm.to_selfies(mol)
    row["inchi"] = dm.to_inchi(mol)
    row["inchikey"] = dm.to_inchikey(mol)
    row["onbits_fp"] = list(bit_vector.GetOnBits())

    return row
Ejemplo n.º 12
0
def test_to_from_text(tmp_path):
    """Round-trip SMILES through `dm.to_smi` / `dm.read_smi`, on disk and in memory."""

    smi_path = tmp_path / "mols.smi"

    expected_smiles = [
        "Cn1c(=S)ccc2nc[nH]c21",
        "Clc1n[nH]c2c1=[NH+]C(c1ccc[nH+]c1)C[NH+]=2",
        "Fc1ccsc1",
        "N#Cc1cc2c(o1)[NH2+]CCN2Cn1cnc2c1CSCC2",
        "O=CN1CCC2NC=CC2C1",
        "Oc1[nH]nc2c1-n1ncnc1C2",
        "OC1=NNC2(OC=CCO2)C2(C3CCCc4nonc43)NN=NN12",
        "[NH-]Sc1cc2nc[nH+]cc2o1",
        "[NH3+]C12CNCCOC1(N1CCCCC1)C=C(F)NC2",
    ]
    molecules = [dm.to_mol(entry) for entry in expected_smiles]

    # File on disk: what is written must be read back unchanged.
    dm.to_smi(molecules, smi_path)
    reloaded = dm.read_smi(smi_path)
    assert [dm.to_smiles(m) for m in reloaded] == expected_smiles

    # An empty list is an error when `error_if_empty=True`.
    with pytest.raises(ValueError):
        dm.to_smi([], smi_path, error_if_empty=True)

    smi_path.unlink()

    # A file-like object is accepted as well: one SMILES per line.
    buffer = io.StringIO()
    dm.to_smi(molecules, buffer)
    written_lines = buffer.getvalue().strip().split("\n")
    assert written_lines == expected_smiles
Ejemplo n.º 13
0
def all_mmpa_assemble(molist, max_num_action=float("Inf"), asMols=True, **kwargs):
    """Enumerate all mmpa assemblies of the molecules in molist

    Each molecule is fragmented with `mmpa_frag`; every fragmentation string is
    split on "." into a core and a side chain, and all cores and side chains
    are handed to `_compute_mmpa_assembly`.

    Arguments
    ----------
        molist: list of <Chem.Mol>
            List of molecules to fragment and reconstruct
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        max_num_action: int, optional
            Maximum number of assemblies
            (Default: inf)
        **kwargs:
            Accepted for signature compatibility; not used here.

    Returns
    -------
        res: list of <Chem.Mol>
            Molecules obtained by merging core and side_chains
    """

    def _as_labelled_mol(fragment_smiles):
        # Rewrite the attachment point from atom-map form to isotope-label form.
        return Chem.MolFromSmiles(fragment_smiles.replace("[*:1]", "[1*]"))

    cores = []
    side_chains = []
    for mol in molist:
        cuts = mmpa_frag(mol, max_bond_cut=30)
        if not cuts:
            continue
        # Keep only the second element of each pair: the "core.sidechain" string.
        _, fragment_strings = map(list, zip(*cuts))
        for fragment_string in fragment_strings:
            core_smiles, sidechain_smiles = fragment_string.split(".")
            cores.append(_as_labelled_mol(core_smiles))
            side_chains.append(_as_labelled_mol(sidechain_smiles))

    assembled = _compute_mmpa_assembly(cores, side_chains, max_num_action=max_num_action)
    if asMols:
        return assembled
    return [dm.to_smiles(x) for x in assembled if x]
Ejemplo n.º 14
0
def test_enumerate_tautomers():
    """`dm.enumerate_tautomers` must yield exactly the two expected tautomers."""
    parent = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")

    tautomers = dm.enumerate_tautomers(parent, n_variants=10)

    # Compare as sets: enumeration order is not part of the check.
    found = set(map(dm.to_smiles, tautomers))
    expected = {"O=C1C=[N:1]C2CCCCC2C1", "OC1=CC2CCCCC2[N:1]=C1"}
    assert found == expected
Ejemplo n.º 15
0
def test_from_selfies():
    """Decode a SELFIES string with `dm.from_selfies`, both as SMILES and as a molecule."""
    aspirin_selfies = (
        "[C][C][Branch1_2][C][=O][O][C][=C][C][=C][C][=C][Ring1][Branch1_2][C][Branch1_2][C][=O][O]"
    )

    # As a string, the result is the kekulized SMILES.
    decoded_smiles = dm.from_selfies(aspirin_selfies, as_mol=False)
    assert decoded_smiles == "CC(=O)OC1=CC=CC=C1C(=O)O"

    # As a molecule, converting back gives the aromatic SMILES.
    decoded_mol = dm.from_selfies(aspirin_selfies, as_mol=True)
    assert dm.to_smiles(decoded_mol) == "CC(=O)Oc1ccccc1C(=O)O"
Ejemplo n.º 16
0
def test_to_neutral():
    """`dm.to_neutral` must remove formal charges from cations and anions."""

    # Ammonium: the charge is dropped and the four hydrogens are kept.
    ammonium = dm.to_mol("[NH4+]", add_hs=False, explicit_only=False)
    assert dm.to_smiles(dm.to_neutral(ammonium)) == "[NH4]"

    # Benzoate: after neutralization the formal charges sum to zero.
    benzoate = dm.to_mol("O=C(c1ccccc1)[O-]", add_hs=False, explicit_only=False)
    neutral = dm.to_neutral(benzoate)
    total_charge = sum(atom.GetFormalCharge() for atom in neutral.GetAtoms())
    assert total_charge == 0
Ejemplo n.º 17
0
def test_enumerate_stereo():
    """`dm.enumerate_stereoisomers` must yield the four expected stereoisomers."""
    parent = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")

    isomers = dm.enumerate_stereoisomers(parent, n_variants=10)

    # Compare as sets: enumeration order is not part of the check.
    found = set(map(dm.to_smiles, isomers))
    expected = {
        "OC1=C[C@@H]2CCCC[C@@H]2[N:1]=C1",
        "OC1=C[C@@H]2CCCC[C@H]2[N:1]=C1",
        "OC1=C[C@H]2CCCC[C@@H]2[N:1]=C1",
        "OC1=C[C@H]2CCCC[C@H]2[N:1]=C1",
    }
    assert found == expected
Ejemplo n.º 18
0
def mmpa_fragment_exchange(mol1, mol2, return_all=False, **kwargs):
    """Perform a fragment exchange between two molecules using mmpa rules

    Arguments
    ----------
        mol1: <Chem.Mol>
            input molecule 1
        mol2: <Chem.Mol>
            input molecule 2
        return_all: bool, optional
            Whether to return the list of all new molecules
        **kwargs:
            Accepted for signature compatibility; not used here.

    Returns
    -------
        If `return_all` is True, the list of all new molecules.
        Otherwise a pair `(modified_mol1, modified_mol2)` drawn at random from
        the new molecules. `mol1` and `mol2` fill any missing slot, so they
        are returned as-is when no new molecule is found.

    """

    # The parents themselves, and failed conversions (None), are not results.
    unwanted = {dm.to_smiles(m) for m in (mol1, mol2)} | {None}

    # De-duplicate the assemblies by SMILES and drop the unwanted entries.
    assembled = all_mmpa_assemble([mol1, mol2])
    candidates = list({dm.to_smiles(m) for m in assembled} - unwanted)

    out = []
    for sm in candidates:
        try:
            r = dm.to_mol(sm, sanitize=True)
        except Exception:
            # Skip SMILES that cannot be converted back. Unlike a bare
            # `except`, KeyboardInterrupt / SystemExit still propagate.
            continue
        if r is not None:
            out.append(r)

    if return_all:
        return out

    # Random pick; the parents are appended so that two entries always exist.
    random.shuffle(out)
    out.extend([mol1, mol2])
    return out[0], out[1]
Ejemplo n.º 19
0
def all_fragment_on_bond(mol,
                         asMols=False,
                         max_num_action=float("Inf"),
                         break_aromatic=True):
    """Fragment all possible bonds in a molecule and return the set of resulting fragments
    This is similar to `random_bond_cut`, but is not stochastic as it does not return a random fragment
    but all the fragments resulting from all potential bond breaks in the molecule.

    The input molecule is left untouched: kekulization (needed when
    `break_aromatic` is True) is applied to an internal copy.

    .. note::
        This will always be a subset of all_bond_remove, the main difference being that all_bond_remove, allow decreasing
        bond count, while this one will always break a molecule into two.

    Args:
        mol: <Chem.Mol>
            input molecule
        asMols: bool, optional
            Whether to return results as mols or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity. The search stops as
            soon as the number of collected fragments exceeds this value.
        break_aromatic: bool, optional
            Whether to attempt to break even aromatic bonds
            (Default: True)

    Returns:
        set of fragments

    """
    # Kept from the original implementation; the returned rings are not used.
    mol.GetRingInfo().AtomRings()
    fragment_set = set()

    if break_aromatic:
        # `Chem.Kekulize` modifies its argument in place; use a copy so the
        # caller's molecule keeps its aromatic flags.
        work_mol = Chem.Mol(mol)
    else:
        work_mol = mol

    bonds = list(work_mol.GetBonds())
    if not bonds:
        return fragment_set

    if break_aromatic:
        Chem.Kekulize(work_mol, clearAromaticFlags=True)

    for bond in bonds:
        if not break_aromatic and bond.GetIsAromatic():
            continue

        truncate = Chem.FragmentOnBonds(work_mol, [bond.GetIdx()],
                                        addDummies=False)
        truncate = dm.sanitize_mol(truncate)
        if truncate is None:
            continue

        for frag in rdmolops.GetMolFrags(truncate, asMols=True):
            frag = dm.sanitize_mol(frag)
            if frag:
                fragment_set.add(frag if asMols else dm.to_smiles(frag))
            # Stop everything once the budget is exceeded.
            if len(fragment_set) > max_num_action:
                return fragment_set

    return fragment_set
def _preprocess(i, row):
    """Standardize the molecule of one row and attach identifiers and stereoisomers.

    Args:
        i: Row index (not used by this function).
        row: Mapping-like record; the SMILES is read from `row[smiles_column]`,
            where `smiles_column` is defined outside this function.

    Returns:
        The same `row`, with `standard_smiles`, `selfies`, `inchi`, `inchikey`
        and `enumerated_smiles` set. When a `ValueError` is raised during
        processing, the four string fields are set to 'dropped' and
        `enumerated_smiles` to `['dropped']`.
    """
    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(mol,
                                 disconnect_metals=False,
                                 normalize=True,
                                 reionize=True,
                                 uncharge=False,
                                 stereo=True)

        # Enumerate at most 20 unique stereoisomers, with a fixed seed.
        opts = StereoEnumerationOptions(unique=True,
                                        maxIsomers=20,
                                        rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        # `sorted` already returns a new list; no extra copy is needed.
        enumerated_smiles = sorted(
            Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["selfies"] = dm.to_selfies(mol)
        row["inchi"] = dm.to_inchi(mol)
        row["inchikey"] = dm.to_inchikey(mol)
        row["enumerated_smiles"] = enumerated_smiles

        return row

    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["selfies"] = 'dropped'
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        # A one-element list, consistent with the list-of-SMILES success value
        # (`list('dropped')` would split the marker into single characters).
        row["enumerated_smiles"] = ['dropped']
        return row
Ejemplo n.º 21
0
def all_atom_add(
        mol,
        atom_types=None,
        asMols=True,
        max_num_action=float("Inf"),
        **kwargs,
):
    """Add a new atom on the mol, by considering all bond type

    .. warning::
        This is computationally expensive

    Args:
        mol: <Chem.Mol>
            Input molecule
        atom_types: list, optional
            List of atom symbols to add
            (Default: ["C", "N", "O", "F", "Cl", "Br"], used when None)
        asMols: bool, optional
            Whether to return output as molecule or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity. Enumeration stops
            once the number of candidates exceeds this value.
        **kwargs:
            Accepted for signature compatibility; not used here.
    Returns:
        All possible molecules with one additional atom added

    """
    # Build the default per call instead of sharing one mutable list object
    # across calls through the signature.
    if atom_types is None:
        atom_types = ["C", "N", "O", "F", "Cl", "Br"]

    new_mols = []
    stop = False
    with dm.without_rdkit_log():
        for atom in mol.GetAtoms():
            if stop:
                break
            # Atoms with an implicit valence of 0 are not used as attachment points.
            if atom.GetImplicitValence() == 0:
                continue
            for atom_symb in atom_types:
                emol = Chem.RWMol(mol)
                new_index = emol.AddAtom(Chem.Atom(atom_symb))
                emol.UpdatePropertyCache(strict=False)
                new_mols.extend(
                    _all_atom_join(emol, atom,
                                   emol.GetMol().GetAtomWithIdx(new_index)))
                if len(new_mols) > max_num_action:
                    stop = True
                    break

        # Sanitize and drop the candidates that cannot be sanitized.
        new_mols = [dm.sanitize_mol(candidate) for candidate in new_mols]
        new_mols = [candidate for candidate in new_mols if candidate is not None]
        if not asMols:
            return [dm.to_smiles(x) for x in new_mols if x]
    return new_mols
Ejemplo n.º 22
0
def test_inchi():
    """Check InChI / InChIKey generation, the InChI round-trip and `None` handling."""
    aspirin = "CC(=O)Oc1ccccc1C(=O)O"
    mol = dm.to_mol(aspirin)

    expected_inchi = "InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)"
    inchi = dm.to_inchi(mol)
    assert inchi == expected_inchi

    assert dm.to_inchikey(mol) == "BSYNRYMUTXBXSQ-UHFFFAOYSA-N"

    # Going back from the InChI must give the starting SMILES.
    round_tripped = dm.from_inchi(inchi)
    assert dm.to_smiles(round_tripped) == aspirin

    # Every converter maps `None` to `None`.
    for converter in (dm.to_inchi, dm.to_inchikey, dm.from_inchi):
        assert converter(None) is None
Ejemplo n.º 23
0
def all_atom_replace(mol,
                     atom_types=None,
                     asMols=True,
                     max_num_action=float("Inf"),
                     **kwargs):
    """Replace all non-hydrogen atoms by other possibilities.

    .. warning::
        This is computationally expensive

    Args:
        mol: <Chem.Mol>
            Input molecule
        atom_types: list, optional
            List of atom symbols to use as replacement
            (Default: ['C', 'N', 'S', 'O'], used when None)
        asMols: bool, optional
            Whether to return output as molecule or smiles
        max_num_action: float, optional
            Maximum number of action to reduce complexity. Enumeration stops
            once the number of candidates exceeds this value.
        **kwargs:
            Accepted for signature compatibility; not used here.

    Returns:
        All possible molecules with atoms replaced

    """
    # Build the default per call instead of sharing one mutable list object
    # across calls through the signature.
    if atom_types is None:
        atom_types = ["C", "N", "S", "O"]

    new_mols = []
    stop = False
    with dm.without_rdkit_log():
        for atom in mol.GetAtoms():
            if stop:
                break
            # Only atoms with an atomic number above 1 are replaced.
            if atom.GetAtomicNum() > 1:
                for atom_symb in atom_types:
                    emol = Chem.RWMol(mol)
                    emol.ReplaceAtom(atom.GetIdx(), Chem.Atom(atom_symb))
                    new_mols.append(emol)
                    if len(new_mols) > max_num_action:
                        stop = True
                        break

        # Sanitize and remove bad molecules
        new_mols = [dm.sanitize_mol(candidate) for candidate in new_mols]
        new_mols = [candidate for candidate in new_mols if candidate is not None]

    if not asMols:  # Return SMILES
        return [dm.to_smiles(x) for x in new_mols]
    return new_mols
Ejemplo n.º 24
0
def all_fragment_assemble(
    fragmentlist,
    max_num_action=float("Inf"),
    asMols=True,
    seen=None,
    **kwargs,
):
    """Assemble a set of fragments into new molecules

    The molecules come from `dm.assemble.assemble_brics_order`, called with
    `allow_incomplete=False`.

    .. warning::
        This is computationally expensive

    Arguments
    ----------
        fragmentlist: list
            List of blocks to assemble
        max_num_action: float, optional
            Maximum number of action to reduce complexity. No limit by default
        asMols: bool, optional
            Whether to return mols (True) or smiles (False)
        seen: list, optional
            List of initial molecules, forwarded to the assembler
        **kwargs:
            Accepted for signature compatibility; not used here.

    Returns
    -------
        reconstructed molecules

    """
    assembler = dm.assemble.assemble_brics_order(
        fragmentlist,
        seen=seen,
        allow_incomplete=False,
        max_n_mols=max_num_action,
    )

    collected = []
    for candidate in assembler:
        # Stop pulling from the assembler once the budget is exceeded.
        if len(collected) > max_num_action:
            break
        collected.append(candidate)

    if asMols:
        return collected
    return [dm.to_smiles(x) for x in collected if x is not None]
Ejemplo n.º 25
0
def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Takes SMILES string and returns its sanitized version.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        The sanitized smiles, or `None` when the input cannot be parsed,
        sanitized or converted back to SMILES.
    """
    try:
        # Parse without sanitization, then sanitize explicitly.
        mol = dm.to_mol(smiles, sanitize=False)
        # NOTE(review): `False` is the second positional argument of
        # `dm.sanitize_mol`, presumably `charge_neutral`; confirm.
        mol = dm.sanitize_mol(mol, False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        return dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # Same policy as the parsing step above; unlike a bare `except`,
        # KeyboardInterrupt / SystemExit still propagate.
        return None
Ejemplo n.º 26
0
def test_to_smiles():
    """Check the output of `dm.to_smiles` under its main formatting options."""

    aspirin_input = "O=C(C)Oc1ccccc1C(=O)O"
    mol = dm.to_mol(aspirin_input)

    # Ordered atoms, implicit bonds and hydrogens.
    ordered_smiles = dm.to_smiles(
        mol,
        isomeric=True,
        ordered=True,
        explicit_bonds=False,
        explicit_hs=False,
    )
    assert ordered_smiles == "CC(=O)Oc1ccccc1C(=O)O"

    # Every bond written out explicitly.
    explicit_bond_smiles = dm.to_smiles(
        mol,
        isomeric=True,
        ordered=False,
        explicit_bonds=True,
        explicit_hs=False,
    )
    assert explicit_bond_smiles == "C-C(=O)-O-c1:c:c:c:c:c:1-C(=O)-O"

    # Every atom bracketed, with its hydrogen count.
    explicit_h_smiles = dm.to_smiles(
        mol,
        isomeric=True,
        ordered=False,
        explicit_bonds=False,
        explicit_hs=True,
    )
    assert explicit_h_smiles == "[CH3][C](=[O])[O][c]1[cH][cH][cH][cH][c]1[C](=[O])[OH]"

    # A randomized SMILES must describe the same molecule as the default output.
    mol = dm.to_mol(aspirin_input)
    shuffled_smiles = dm.to_smiles(mol, randomize=True)
    shuffled_mol = dm.to_mol(shuffled_smiles)

    assert dm.to_smiles(shuffled_mol) == dm.to_smiles(mol)
Ejemplo n.º 27
0
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffold with enforceable group that needs to appear
    in the core, forcing to keep the full side chain if required.

    The molecules are first grouped by generic Murcko scaffold. For each group
    the MCS is computed and used as the core of an R-group decomposition, and
    the side chains are trimmed with `trim_side_chain` to build the scaffolds.

    NOTE(hadim): consider parallelize this (if possible).

    Args:
        mols: List of all molecules
        enforce_subs: List of substructure to enforce on the scaffold.
            NOTE(review): the items are passed as-is to `GetSubstructMatch`;
            confirm whether molecules are expected rather than strings.
        n_atom_cuttoff: Minimum number of atom a core should have.
        additional_templates: Additional template to use to generate scaffolds.
            Each one is converted with `dm.to_mol`.
        ignore_non_ring: Whether to ignore atom no in murcko ring system, even if they are in the framework.
        mcs_params: Arguments of MCS algorithm, forwarded to `rdFMCS.FindMCS`.

    Returns:
        A tuple `(scaffolds, scaffold_infos, scaffold_to_group)`:

        scaffolds: set
            All found scaffolds in the molecules as valid smiles
        scaffold_infos: dict of dict
            Infos on the scaffold mapping, ignoring any side chain that had to be enforced.
            Key corresponds to generic scaffold smiles
            Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
            Values at ['mols'] corresponds to list of molecules matching the scaffold
        scaffold_to_group: dict of list
            Map between each generic scaffold and the R-groups decomposition row
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    # R-group decomposition settings, shared by every scaffold group.
    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    # Query adjustments applied to each core before trimming side chains.
    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by they generic Murcko scaffold, allowing
    # side chain that contains cycle (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set([])

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except Exception:
            # Best effort: keep the non-generic scaffold when it cannot be
            # made generic. KeyboardInterrupt / SystemExit still propagate.
            pass

        if ignore_non_ring:
            # Remove non-ring atoms from highest index to lowest, so that the
            # remaining indices stay valid while deleting.
            rw_scf = Chem.RWMol(scf)
            atms = [a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()]
            atms.sort(reverse=True)
            for a in atms:
                rw_scf.RemoveAtom(a)
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add templates mols if exists:
        for tmp in additional_templates:
            tmp = dm.to_mol(tmp)
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(tmp)
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for scf in scfs:
            if scf2infos[scf].get("mols"):
                scf2infos[scf]["mols"].append(m)
            else:
                scf2infos[scf]["mols"] = [m]

    for scf in scf2infos:
        # cheat by adding murcko as last mol always
        popout = False
        mols = scf2infos[scf]["mols"]
        if len(mols) < 2:
            # The MCS needs at least two molecules: pair a lone molecule with
            # its own Murcko scaffold (removed again below).
            mols = mols + [MurckoScaffold.GetScaffoldForMol(mols[0])]
            popout = True

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        # Cores smaller than the cutoff are ignored.
        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        scf2infos[scf]["smarts"] = dm.to_smarts(mcsM)
        if popout:
            mols = mols[:-1]

        core_groups = []
        # generate rgroups based on the mcs core
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(mols):
                analog.RemoveAllConformers()
                res = rg.Add(analog)
                # A negative result means the analog was not added.
                if not (res < 0):
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            pass

        # Keep only the molecules that were added to the decomposition.
        mols = [mols[i] for i in success_mols]
        scf2groups[scf] = core_groups
        for mol, gp in zip(mols, core_groups):
            core = gp["Core"]
            # R-group labels carried by non-ring atoms of the core.
            acceptable_groups = [
                a.GetAtomMapNum() for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]

            rgroups = [
                gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp.keys()
            ]
            if enforce_subs:
                # R-groups matching an enforced substructure are excluded
                # from the groups handed to `trim_side_chain`.
                rgroups = [
                    rgp for rgp in rgroups if not any([
                        len(rgp.GetSubstructMatch(frag)) > 0
                        for frag in enforce_subs
                    ])
                ]
            try:
                scaff = trim_side_chain(
                    mol, AdjustQueryProperties(core, core_query_param),
                    rgroups)
            except Exception:
                # Best effort: skip molecules whose side chains cannot be
                # trimmed. KeyboardInterrupt / SystemExit still propagate.
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups
Ejemplo n.º 28
0
def test_to_cxsmiles():
    """With `cxsmiles=True`, the atom-map number must show up in the CXSMILES extension."""
    mapped_mol = dm.to_mol("OC1=CC2CCCCC2[N:1]=C1")
    cxsmiles = dm.to_smiles(mapped_mol, cxsmiles=True)
    expected = "OC1=CC2CCCCC2[N:1]=C1 |atomProp:9.molAtomMapNumber.1|"
    assert cxsmiles == expected
Ejemplo n.º 29
0
def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Convert a list of mols to a dataframe using each mol properties
    as a column.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. If None, no SMILES column
            is added.
        mol_column: Name of the molecule column. If not None, the molecules
            themselves are added to the dataframe under that name.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to render the molecule in the dataframe to images.
            Only applies when `mol_column` is not None.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.
            Only applies when `render_df_mol` is True and `mol_column` is not None.

    Returns:
        A dataframe with the optional SMILES and molecule columns first,
        followed by one column per molecule property.
    """

    # Init a dataframe
    df = pd.DataFrame()

    # Feed it with smiles
    if smiles_column is not None:
        df[smiles_column] = [dm.to_smiles(mol) for mol in mols]

    # Add a mol column
    if mol_column is not None:
        df[mol_column] = mols

    # Add any other properties present in the molecule
    props = [
        mol.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        )
        for mol in mols
    ]
    props_df = pd.DataFrame(props)

    # The concatenation below does not rename clashing columns, so warn.
    if smiles_column is not None and smiles_column in props_df.columns:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. The returned dataframe will have two columns with the same name."
        )

    # Concat the df with the properties df
    df = pd.concat([df, props_df], axis=1)

    # Render mol column to images
    if render_df_mol is True and mol_column is not None:
        # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(df)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return df
Ejemplo n.º 30
0
def test_sanitize_first():
    """`dm.sanitize_first` must skip the unparsable entry and return the valid molecule."""

    # The first entry is not a valid SMILES; the second one is aspirin.
    inputs = ["fake_smiles", "CC(=O)Oc1ccccc1C(=O)O"]
    parsed = [dm.to_mol(entry) for entry in inputs]

    picked = dm.sanitize_first(parsed)
    assert dm.to_smiles(picked) == "CC(=O)Oc1ccccc1C(=O)O"