Beispiel #1
0
def decode_stereo(smiles2D):
    """Enumerate the stereoisomers of a SMILES string without stereo information.

    Args:
        smiles2D (str): SMILES string without isomeric information

    Returns:
        list: isomeric SMILES of every enumerated stereoisomer. When any
            nitrogen atom carries a chiral tag, the list additionally
            contains the SMILES of each isomer with those nitrogen centres
            reset to unspecified (duplicates are not removed). Empty when
            the enumeration produced no isomer.

    Raises:
        ValueError: if ``smiles2D`` cannot be parsed into a molecule.
    """
    parsed = Chem.MolFromSmiles(smiles2D)
    if parsed is None:
        # MolFromSmiles signals a parse failure with None; fail here with a
        # clear message instead of deep inside the enumeration.
        raise ValueError(f"Could not parse SMILES: {smiles2D!r}")

    # generate all possible stereoisomers from the 2D SMILES
    opts = StereoEnumerationOptions(tryEmbedding=True, unique=True)
    dec_isomers = tuple(EnumerateStereoisomers(parsed, options=opts))
    if not dec_isomers:
        # With tryEmbedding=True the enumeration may yield nothing; there is
        # then no first isomer to inspect below.
        return []

    # get isomeric SMILES strings
    smiles3D = [
        Chem.MolToSmiles(isomer, isomericSmiles=True)
        for isomer in dec_isomers
    ]

    # get indices of chiral Ns (looked up on the first isomer only)
    unspecified = Chem.rdchem.ChiralType.CHI_UNSPECIFIED
    chiralN = [
        atom.GetIdx() for atom in dec_isomers[0].GetAtoms()
        if atom.GetChiralTag() != unspecified and atom.GetSymbol() == "N"
    ]

    # set chiral Ns to unspecified and record the resulting SMILES as well
    if chiralN:
        for isomer in dec_isomers:
            for idx in chiralN:
                isomer.GetAtomWithIdx(idx).SetChiralTag(unspecified)
            smiles3D.append(Chem.MolToSmiles(isomer, isomericSmiles=True))

    return smiles3D
Beispiel #2
0
def atom_mapper3D(reactant, products):
    '''
    Written by Mads Koerstz

    Label and re-index the reactant and every product, fix the reactant to
    the first stereoisomer produced by the enumeration, and build the
    product isomers with ``set_chirality``.

    Returns a tuple ``(reactant, new_products)`` where ``new_products`` holds
    one ``set_chirality`` result per entry of ``products``.
    '''
    stereo_kwargs = dict(cleanIt=True,
                         flagPossibleStereoCenters=True,
                         force=True)
    enum_opts = StereoEnumerationOptions(onlyUnassigned=False, unique=False)

    # Prepare reactant: label, flag stereo centres, keep the first isomer.
    reactant = label_atoms(reactant)
    rdmolops.AssignStereochemistry(reactant, **stereo_kwargs)
    reactant = next(EnumerateStereoisomers(reactant, options=enum_opts))

    # Makes Graph atom idx = SMILES atom mapped idx.
    reactant = reassign_atom_idx(reactant)
    # Assigns _CIPCode.
    rdmolops.AssignStereochemistry(reactant, **stereo_kwargs)

    # Prepare products: same labelling / re-indexing, then derive isomers.
    new_products = [
        set_chirality(reassign_atom_idx(label_atoms(product)), reactant)
        for product in products
    ]

    return reactant, new_products
Beispiel #3
0
def get_stereoisomers(smiles):
    """
    Input: SMILES string for a molecule
    Output: set of isomeric SMILES strings, one per enumerated stereoisomer
    """
    parsed = Chem.rdmolfiles.MolFromSmiles(smiles)
    enum_opts = StereoEnumerationOptions(tryEmbedding=True, unique=True)

    unique_smiles = set()
    for isomer in EnumerateStereoisomers(parsed, options=enum_opts):
        unique_smiles.add(
            Chem.rdmolfiles.MolToSmiles(isomer, isomericSmiles=True))
    return unique_smiles
Beispiel #4
0
    def enumerate_stereoisomers(mol, to_use):
        """Return the unique stereoisomers of ``mol`` as a tuple.

        When ``to_use`` is falsy the feature is switched off and the input
        molecule is passed through unchanged as a one-element tuple.
        """
        if to_use:
            unique_only = StereoEnumerationOptions(unique=True)
            return tuple(EnumerateStereoisomers(mol, options=unique_only))

        # Easy pass-through for callers that do not use this feature.
        return (mol, )
Beispiel #5
0
def set_chirality(product, reactant):
    """ Written by Mads Koerstz
    Produce all combinations of isomers (R/S and cis/trans). But force
    product atoms with unchanged neighbors to the same label chirality as
    the reactant.

    Args:
        product: product molecule; re-indexed with ``reassign_atom_idx`` and
            sanitized before use.
        reactant: reactant molecule; re-indexed and sanitized the same way.
            Its atoms are expected to carry ``_CIPCode`` where they are
            stereo centres.

    Returns:
        list: product isomer molecules, one per distinct SMILES, in the
        order they were first produced.
    """

    # TODO move these somewhere it makes more sense.
    product = reassign_atom_idx(product)
    reactant = reassign_atom_idx(reactant)

    Chem.SanitizeMol(product)
    Chem.SanitizeMol(reactant)

    # Find chiral atoms - including label chirality
    chiral_atoms_product = Chem.FindMolChiralCenters(product,
                                                     includeUnassigned=True)

    # A centre counts as unchanged when its neighbour indices are the same
    # in product and reactant.
    unchanged_atoms = []
    for atom_idx, _ in chiral_atoms_product:
        product_neighbors = sorted(
            a.GetIdx()
            for a in product.GetAtomWithIdx(atom_idx).GetNeighbors())
        reactant_neighbors = sorted(
            a.GetIdx()
            for a in reactant.GetAtomWithIdx(atom_idx).GetNeighbors())

        if product_neighbors == reactant_neighbors:
            unchanged_atoms.append(atom_idx)

    # make combinations of isomers.
    opts = StereoEnumerationOptions(onlyUnassigned=False, unique=False)
    rdmolops.AssignStereochemistry(product,
                                   cleanIt=True,
                                   flagPossibleStereoCenters=True,
                                   force=True)

    seen_smiles = set()
    product_isomers_mols = []
    for product_isomer in EnumerateStereoisomers(product, options=opts):
        rdmolops.AssignStereochemistry(product_isomer, force=True)
        for atom_idx in unchanged_atoms:
            reactant_atom = reactant.GetAtomWithIdx(atom_idx)
            product_atom = product_isomer.GetAtomWithIdx(atom_idx)

            # A centre may lack a CIP label on either side (e.g. it is only
            # a stereo centre in the product). GetProp would raise KeyError
            # there, and there is no reactant label to copy, so leave it.
            if not (reactant_atom.HasProp('_CIPCode')
                    and product_atom.HasProp('_CIPCode')):
                continue

            # TODO make sure that the _CIPRank is the same for atom in reactant and product.
            if (reactant_atom.GetProp('_CIPCode')
                    != product_atom.GetProp('_CIPCode')):
                product_atom.InvertChirality()

        # Several enumerated isomers can collapse onto the same structure
        # after the inversions above; keep the first of each.
        isomer_smiles = Chem.MolToSmiles(product_isomer)
        if isomer_smiles not in seen_smiles:
            seen_smiles.add(isomer_smiles)
            product_isomers_mols.append(product_isomer)

    return product_isomers_mols
Beispiel #6
0
def enumerate_molecule(s: str):
    """Return the sorted list of isomeric SMILES for every stereoisomer of `s`.

    Stereo centres are enumerated whether or not they are already assigned
    (``onlyUnassigned=False``) and duplicates are dropped (``unique=True``).
    """
    enum_opts = StereoEnumerationOptions(unique=True, onlyUnassigned=False)
    #enum_opts = StereoEnumerationOptions(tryEmbedding=True, unique=True, onlyUnassigned=False)
    parsed = Chem.MolFromSmiles(s)
    all_isomers = tuple(EnumerateStereoisomers(parsed, options=enum_opts))
    return sorted(
        Chem.MolToSmiles(isomer, isomericSmiles=True)
        for isomer in all_isomers)
Beispiel #7
0
def enumerate_stereoisomers(
    mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationally intensive.

    Args:
        mol: The molecule whose state we should enumerate. The function works
            on the result of `copy_mol(mol)`, not on `mol` itself.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.

    Returns:
        A list of the enumerated isomer molecules, each with its
        stereochemistry re-assigned. The list is empty when the enumeration
        itself raised an exception.
    """
    from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
    from rdkit.Chem.EnumerateStereoisomers import StereoEnumerationOptions

    # safety first
    mol = copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=True)  # type: ignore
    Chem.FindPotentialStereoBonds(mol, cleanIt=True)  # type: ignore

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
    )

    try:
        isomers = tuple(EnumerateStereoisomers(mol, options=stereo_opts))
    except Exception:
        # Best effort: an enumeration failure yields no variants. `Exception`
        # (rather than a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        # NOTE(hadim): often got "Stereo atoms should be specified before specifying CIS/TRANS bond stereochemistry"
        # for the ligand of reference (coming from the PDB). Not sure how to handle that.
        isomers = []

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=True)  # type: ignore
        variants.append(isomer)

    return variants
def _preprocess(i, row):
    """Clean one row's SMILES with datamol and attach derived columns.

    The SMILES is read from ``row[smiles_column]`` (a module-level name),
    converted to a molecule, fixed, sanitized and standardized. Up to 20
    unique stereoisomers are then enumerated with a fixed random seed.

    Args:
        i: row index; unused, kept for the caller's signature.
        row: mapping-like row that is updated in place.

    Returns:
        The same ``row`` with ``standard_smiles``, ``selfies``, ``inchi``,
        ``inchikey`` and ``enumerated_smiles`` (sorted list of isomeric
        SMILES) filled in. If any step raises ``ValueError`` the string
        columns are set to ``'dropped'`` and ``enumerated_smiles`` to
        ``['dropped']``.
    """
    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(mol,
                                 disconnect_metals=False,
                                 normalize=True,
                                 reionize=True,
                                 uncharge=False,
                                 stereo=True)
        opts = StereoEnumerationOptions(unique=True,
                                        maxIsomers=20,
                                        rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        # sorted() already returns a fresh list, no copy loop needed
        smiles_list = sorted(
            Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["selfies"] = dm.to_selfies(mol)
        row["inchi"] = dm.to_inchi(mol)
        row["inchikey"] = dm.to_inchikey(mol)
        row["enumerated_smiles"] = smiles_list

        return row

    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["selfies"] = 'dropped'
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        # One-element list: list('dropped') would split the marker into
        # single characters.
        row["enumerated_smiles"] = ['dropped']
        return row
def enumerate_smiles_stereoisomers(
        job_count,
        record_batch_list,
        output_dir='/data/dopamine_3_results/enumerated_smiles'):
    """Enumerate stereoisomers for every record batch and write them to feather.

    For each batch the ``smiles`` and ``canonical_id`` columns are read, up
    to 20 stereoisomers per molecule are enumerated with a fixed random
    seed, and the result is written to
    ``<output_dir>/enumerated_stereoisomers_<batch index>_<job_count>.feather``.
    Each isomer gets the id ``<canonical_id>_<n>``, where ``n`` is its
    position among that molecule's sorted isomeric SMILES.

    Args:
        job_count: job identifier, used in the output file name and messages.
        record_batch_list: iterable of record batches exposing
            ``column('smiles')`` and ``column('canonical_id')``.
        output_dir: directory that receives the feather files; defaults to
            the location that was previously hard-coded.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # The options do not depend on the molecule, so build them once.
    opts = StereoEnumerationOptions(maxIsomers=20, rand=0xf00d)

    for batch_idx, record_batch in enumerate(record_batch_list):

        smiles = list(record_batch.column('smiles'))
        canonical_id = list(record_batch.column('canonical_id'))

        print(f'the number of smiles in the record batch is {len(smiles)}')

        canonical_id_list = []
        smiles_list = []

        for raw_smi, raw_id in zip(smiles, canonical_id):
            clean_smi = str(raw_smi)
            canonical_id_prefix = str(raw_id)

            mol = Chem.MolFromSmiles(clean_smi)
            if mol is None:
                # An unparsable SMILES would otherwise abort the whole
                # batch; report it and carry on with the next molecule.
                print(f'molecule failed: {clean_smi}')
                continue

            isomers = EnumerateStereoisomers(mol, options=opts)
            enum_smiles = sorted(
                Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)
            for isomer_num, isomer_smi in enumerate(enum_smiles):
                canonical_id_list.append(
                    f'{canonical_id_prefix}_{isomer_num}')
                smiles_list.append(isomer_smi)

        name = os.path.join(
            output_dir,
            'enumerated_stereoisomers_' + str(batch_idx) + '_' +
            str(job_count))

        out_batch = namesdict_to_arrow_batch(canonical_id_list, smiles_list)

        df = out_batch.to_pandas()
        print(df.shape)

        feather.write_feather(df, f'{name}.feather')

        print(f'Job number {job_count} sub-batch {batch_idx} complete.')
        print(f'Job contained {len(smiles)} smiles strings')
        print(
            f'Job generated a total of {len(smiles_list)} smiles after enumeration'
        )
Beispiel #10
0
def Stereoisomer(inp):
    """Expand one input record into one record per stereoisomer.

    ``inp`` is unpacked as ``(smiles, name, dgunsat, formula, molwt)``.
    Each unique stereoisomer is written as isomeric SMILES, passed through
    ``MolStandardize.canonicalize_tautomer_smiles`` and returned as
    ``[smiles, name_<k>, dgunsat, formula, molwt]`` with ``k`` counting
    from 1.
    """
    smiles, name, dgunsat, formula, molwt = inp

    parsed = Chem.MolFromSmiles(smiles)
    enum_opts = StereoEnumerationOptions(tryEmbedding=False, unique=True)

    # Keep the three stages separate: enumerate, serialise, canonicalise.
    isomer_mols = tuple(EnumerateStereoisomers(parsed, options=enum_opts))
    isomer_smiles = [
        Chem.MolToSmiles(isomer, isomericSmiles=True)
        for isomer in isomer_mols
    ]
    tautomer_smiles = [
        MolStandardize.canonicalize_tautomer_smiles(entry)
        for entry in isomer_smiles
    ]

    return [[taut, name + '_' + str(rank), dgunsat, formula, molwt]
            for rank, taut in enumerate(tautomer_smiles, start=1)]
Beispiel #11
0
def enumerate_stereo_isomers(mol, max_stereo_isomers):
    """
    Enumerate the unique stereo isomers of a molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        An RDKit molecule.

    max_stereo_isomers: int
        Maximal number of stereo isomers to generate.

    Returns
    -------
    isomers : tuple
        A tuple of enumerated RDKit molecules.

    """
    enumeration_settings = StereoEnumerationOptions(
        tryEmbedding=True,
        unique=True,
        maxIsomers=max_stereo_isomers,
    )
    return tuple(EnumerateStereoisomers(mol, options=enumeration_settings))
def PerformEnumeration():
    """Enumerate stereoisomers.

    Reads molecules from the configured input file, enumerates the
    stereoisomers of every non-empty molecule using the options found in
    ``OptionsInfo`` and writes each isomer, named ``<mol name>_Isomer<n>``,
    to the configured output file. Summary counts are printed at the end.
    """

    InputPath = OptionsInfo["Infile"]
    OutputPath = OptionsInfo["Outfile"]

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % InputPath)
    Mols = RDKitUtil.ReadMolecules(InputPath, **OptionsInfo["InfileParams"])

    # Set up a molecule writer...
    Writer = RDKitUtil.MoleculesWriter(OutputPath,
                                       **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " %
                            OutputPath)
    MiscUtil.PrintInfo("Generating file %s...\n" % OutputPath)

    # Setup stereo enumeration options...
    StereoOptions = StereoEnumerationOptions(
        tryEmbedding=OptionsInfo["DiscardNonPhysical"],
        onlyUnassigned=OptionsInfo["UnassignedOnly"],
        maxIsomers=OptionsInfo["MaxIsomers"])

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    # Process molecules; both counters stay 0 when the reader is empty.
    MolCount = 0
    ValidMolCount = 0

    for MolCount, Mol in enumerate(Mols, start=1):
        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" %
                                  RDKitUtil.GetMolName(Mol, MolCount))
            continue

        ValidMolCount += 1
        MolName = RDKitUtil.GetMolName(Mol, MolCount)

        # Generate and process stereoisomers...
        IsomerCount = 0
        for IsomerCount, IsomerMol in enumerate(
                EnumerateStereoisomers(Mol, options=StereoOptions), start=1):
            # Set isomer mol name...
            IsomerMol.SetProp("_Name", "%s_Isomer%d" % (MolName, IsomerCount))

            if Compute2DCoords:
                AllChem.Compute2DCoords(IsomerMol)

            Writer.write(IsomerMol)

        MiscUtil.PrintInfo("Number of stereoisomers written for %s: %d" %
                           (MolName, IsomerCount))

    if Writer is not None:
        Writer.close()

    IgnoredMolCount = MolCount - ValidMolCount
    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % IgnoredMolCount)
Beispiel #13
0
 def enumerate_stereoisomers(mol):
     """Return a tuple of the unique stereoisomers enumerated for ``mol``.

     Stereoisomers are kept distinct downstream, so they are expanded here.
     """
     unique_only = StereoEnumerationOptions(unique=True)
     isomer_iter = EnumerateStereoisomers(mol, options=unique_only)
     return tuple(isomer_iter)
Beispiel #14
0
    return centers


#------------------------------------------------

# Main loop: for every molecule read from `ifs`, enumerate its stereoisomers
# and label in `s_type` how the stereochemistry was obtained.
# NOTE(review): `ifs`, `maxcenters` and `flip_only_unasigned` are defined
# outside this excerpt, and the loop body may continue past it.
for mol in ifs:
    #print (mol.GetProp("_Name"))
    # Author's note: because the molecule is constrained, not all enumerated
    # isomers can actually exist. Checking that during enumeration is
    # computationally quite expensive, so it is left for a later step:
    #opts = StereoEnumerationOptions(tryEmbedding=True,unique=True)
    # Author's note: maxIsomers is only used if not all centers are assigned.

    #opts = StereoEnumerationOptions(unique=True,maxIsomers=maxcenters*2,onlyUnassigned=flip_only_unasigned,tryEmbedding=True)
    # Author's note: embedding is really very slow; it is about 50% faster to
    # test it later and just kick out the isomers that failed.

    opts = StereoEnumerationOptions(unique=True,
                                    maxIsomers=maxcenters * 2,
                                    onlyUnassigned=flip_only_unasigned)

    isomers = tuple(EnumerateStereoisomers(mol, options=opts))

    # Number of chiral centres with assigned stereochemistry ...
    n_chiral = len(Chem.FindMolChiralCenters(mol, includeUnassigned=False))
    #print (Chem.FindMolChiralCenters(mol))
    # ... and number of all chiral centres, assigned or not.
    n_chiral_all = len(Chem.FindMolChiralCenters(mol, includeUnassigned=True))
    #print (n_chiral, n_chiral_all, maxcenters)

    # Classify the result: a single isomer is labelled 'specified'; more
    # unassigned centres than `maxcenters` gets the 'exceeded max centres'
    # label; everything else is 'guessed'.
    if len(isomers) == 1:
        s_type = 'specified'
    elif (n_chiral_all - n_chiral) > maxcenters:
        s_type = 'exceeded max centres, random selection'
    else:
        s_type = 'guessed'
import rdkit.Chem as Chem
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions, GetStereoisomerCount
enumeration_options = StereoEnumerationOptions(tryEmbedding=True,unique=True)


def get_individual_smiles_lists(untokenized_sequence,is_predictions):
    """
    Input: SMILES sequence of molecules separated by "."
    Output: 
        if is_predictions:
            List of lists of SMILES strings for all unique stereoisomers in the sequence
            e.g. if the top 5 predictions are "CO.CCO.FC(O)(C)Br.CCCO.CCCCO", the output should be
            [["CO"],["CCO"],["F[C@@](O)(C)Br","F[C@](O)(C)Br"],["CCCO"],["CCCCO"]]
            This way, both stereoisomers are included in the 3rd prediction
        else:
            List of SMILES strings for all unique stereoisomers in the products
            e.g. the above example returns ["CO","CCO","F[C@@](O)(C)Br","F[C@](O)(C)Br","CCCO","CCCCO"]

        This facilitates easier top-k coverage comparisons
    """
    individual_smiles_list = []
    original_individual_smiles_list = untokenized_sequence.split(".")
    for individual_smiles in original_individual_smiles_list:
        molecule = Chem.rdmolfiles.MolFromSmiles(individual_smiles)
        try:
            num_stereoisomers = GetStereoisomerCount(molecule)
            if num_stereoisomers > 1:
                isomers = tuple(EnumerateStereoisomers(molecule,options=enumeration_options))
                isomer_smiles_list = sorted(Chem.rdmolfiles.MolToSmiles(isomer,isomericSmiles=True) for isomer in isomers)
            else:
                isomer_smiles_list = [Chem.rdmolfiles.MolToSmiles(molecule,isomericSmiles=True)]
Beispiel #16
0
def _preprocess(i, row):
    '''Clean one row's SMILES with datamol and attach derived columns.

    The SMILES is read from ``row[smiles_column]`` (a module-level name),
    converted to a molecule, fixed, sanitized and standardized. Up to 20
    unique stereoisomers are enumerated with a fixed random seed. Each
    enumerated SMILES is cleaned the same way and gets two Morgan
    fingerprints (radius 2, 8192 bits): one with chirality and one without.

    Args:
        i: row index; unused, kept for the caller's signature.
        row: mapping-like row that is updated in place.

    Returns:
        The same ``row`` with these columns filled in:
        ``standard_smiles``, ``smarts``, ``selfies`` (all computed from the
        input molecule), ``enumerated_smiles`` (standardized SMILES per
        isomer), and ``achiral_fp`` / ``chiral_fp`` (lists of on-bit indices
        per isomer). If any step raises ``ValueError`` the string columns
        are set to ``'dropped'`` and the list columns to ``['dropped']``.
    '''
    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)
        opts = StereoEnumerationOptions(unique=True, maxIsomers=20, rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        enum_smiles = sorted(Chem.MolToSmiles(y, isomericSmiles=True) for y in isomers)

        # Fingerprint settings do not change per isomer, so build them once.
        fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
        base_pars = {
            "radius": 2,
            "nBits": 8192,
            "invariants": [],
            "fromAtoms": [],
            "useBondTypes": True,
            "useFeatures": False,
        }
        chiral_pars = {**base_pars, "useChirality": True}
        achiral_pars = {**base_pars, "useChirality": False}

        smiles_list = []
        achiral_fp_lis = []
        chiral_fp_lis = []

        for smi in enum_smiles:
            # Separate name on purpose: reusing `mol` here made the
            # row-level columns below describe the last enumerated isomer
            # instead of the input molecule.
            isomer_mol = dm.to_mol(smi, ordered=True)
            isomer_mol = dm.fix_mol(isomer_mol)
            isomer_mol = dm.sanitize_mol(isomer_mol, sanifix=True, charge_neutral=False)
            isomer_mol = dm.standardize_mol(isomer_mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)

            chiral_fp = fingerprint_function(isomer_mol, **chiral_pars)
            achiral_fp = fingerprint_function(isomer_mol, **achiral_pars)

            smiles_list.append(dm.standardize_smiles(smi))
            achiral_fp_lis.append(list(achiral_fp.GetOnBits()))
            chiral_fp_lis.append(list(chiral_fp.GetOnBits()))

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["smarts"] = dm.to_smarts(mol)
        row["selfies"] = dm.to_selfies(mol)
        row["enumerated_smiles"] = smiles_list
        row["achiral_fp"] = achiral_fp_lis
        row["chiral_fp"] = chiral_fp_lis

        return row

    except ValueError:
        row["standard_smiles"] = 'dropped'
        row["smarts"] = 'dropped'
        row["selfies"] = 'dropped'
        # One-element lists: list('dropped') would split the marker into
        # single characters.
        row["enumerated_smiles"] = ['dropped']
        row["achiral_fp"] = ['dropped']
        row["chiral_fp"] = ['dropped']
        return row