Ejemplo n.º 1
0
def generateMoleculeHierarchyTask(structure, debug=False):
    """Create or update the hierarchy record of ``structure.molecule``.

    The structure's molfile is parsed and salt-stripped.  When stripping
    removes no atoms the molecule is recorded as its own parent; otherwise
    the parent is resolved from the stripped mol block through
    ``getParentMolregnoFromBase``.  The active molecule is set to the
    parent and the record is saved.

    With ``debug`` set, a pydevd trace session is attached first and an
    ``IntegrityError`` from ``save()`` is printed instead of re-raised.
    """
    if debug:
        pydevd.settrace('localhost', port=6901,
                        stdoutToServer=True, stderrToServer=True)

    molecule = structure.molecule
    hierarchy = molecule.moleculeHierarchy
    if not hierarchy:
        hierarchy = MoleculeHierarchy(molecule=molecule)

    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = SaltRemover().StripMol(mol)

    # Equal atom counts mean the salt remover took nothing away.
    nothing_stripped = mol.GetNumAtoms() == base.GetNumAtoms()
    if nothing_stripped:
        hierarchy.parent_molecule = molecule
    else:
        base_block = MolToMolBlock(base)
        hierarchy.parent_molecule = getParentMolregnoFromBase(base_block)

    hierarchy.active_molecule = hierarchy.parent_molecule

    try:
        hierarchy.save()
    except IntegrityError as e:
        if not debug:
            raise e
        print(e.message)
Ejemplo n.º 2
0
def preprocess_smi(smi):
    """Return a cleaned SMILES string for ``smi``, or ``None`` if unparsable.

    The input goes through four filters, each operating on the SMILES
    produced by the previous one:

    1. conversion to canonical SMILES,
    2. salt removal (never removing every fragment),
    3. charge neutralisation,
    4. tautomer canonicalisation.

    Parameters
    ----------
    smi : str
        Input SMILES.

    Returns
    -------
    str or None
        The cleaned SMILES, or ``None`` when ``smi`` cannot be parsed or
        canonicalised.
    """
    # Filter 1 - Convert to canonical SMILES.
    # ``except Exception`` (instead of a bare ``except``) keeps the
    # "any failure means None" contract without also swallowing
    # KeyboardInterrupt / SystemExit.
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # RDKit signals an invalid SMILES by returning None.
            return None
        can_smi = Chem.MolToSmiles(mol, True)
    except Exception:
        return None

    # Filter 2 - Remove salt
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(can_smi)
    res, _ = remover.StripMolWithDeleted(mol, dontRemoveEverything=True)
    removed_salt_smi = Chem.MolToSmiles(res)

    # Filter 3 - Remove charge
    uncharger = rdMolStandardize.Uncharger()
    m = Chem.MolFromSmiles(removed_salt_smi)
    p = uncharger.uncharge(m)
    uncharged_smi = Chem.MolToSmiles(p)

    # Filter 4 - Standardize the tautomer
    clean_smi = MolStandardize.canonicalize_tautomer_smiles(uncharged_smi)

    return clean_smi
Ejemplo n.º 3
0
 def test_withSmiles(self):
     # Two SMILES-format definitions: with both fragments of sodium
     # acetate listed as salts, stripping leaves no atoms behind.
     definitions = "[Na+]\nCC(=O)O"
     remover = SaltRemover(defnData=definitions,
                           defnFormat=InputFormat.SMILES)
     self.assertEqual(len(remover.salts), 2)
     stripped = remover.StripMol(Chem.MolFromSmiles('CC(=O)O.[Na+]'))
     self.assertEqual(stripped.GetNumAtoms(), 0)
Ejemplo n.º 4
0
def standardizeSMILES(smiIn):
    """Normalize a SMILES string and strip salts / listed fragments from it.

    The molecule is normalized through ``timeFunction(normalize, mol)``,
    passed through the default ``SaltRemover`` and then a second remover
    built from ``LSALT``.  If several dot-separated fragments remain,
    duplicates and every entry of ``LSMILESREMOVE`` are dropped; the result
    is accepted only when exactly one fragment is left.

    Parameters
    ----------
    smiIn : str
        Input SMILES.

    Returns
    -------
    str
        The prepared SMILES on success.
    int
        ``1`` when the SMILES is empty after preparation (including the
        case where several distinct fragments remain).
    None
        When normalization failed; a message is printed in that case.
    """
    mol = Chem.MolFromSmiles(smiIn)

    # Explicit sentinel instead of probing ``locals()`` for the name.
    molstandardized = None
    try:
        out = timeFunction(normalize, mol)
    except Exception:
        print("Normalize SMILES: ERROR INPUT SMI")
    else:
        if out == "ERROR":
            print("Normalize SMILES: ERROR DURING THE PROCESS")
        else:
            molstandardized = out

    if molstandardized is None:
        return None

    smilestandadized = Chem.MolToSmiles(molstandardized)

    # remove salt
    # 1. default definitions
    remover = SaltRemover()
    mol = Chem.MolFromSmiles(smilestandadized)
    molcleandefault = remover(mol)
    # 2. personal remover
    homeremover = SaltRemover(defnData=LSALT)
    molclean = homeremover(molcleandefault)
    smilesclean = Chem.MolToSmiles(molclean)

    # 3. remove other manual salts + fragments; identical fragments
    #    collapse to one.
    lelem = smilesclean.split(".")
    if len(lelem) > 1:
        fragments = set(lelem)  # several copies of a salt count once
        fragments.difference_update(LSMILESREMOVE)
        fragments.discard("")  # case of bad SMILES
        if len(fragments) == 1:
            smilesclean = str(fragments.pop())
        else:
            # 4. Fragments: reported so they can be checked afterwards.
            print("Fragments after standardization: " + smilesclean + "\n")
            smilesclean = ""

    if smilesclean == "":
        print("SMILES empty after preparation\n")
        return 1

    print("Prepared SMI :" + str(smilesclean) + "\n")
    return smilesclean
Ejemplo n.º 5
0
 def test_withDontRemoveEverything(self):
   # With dontRemoveEverything=True, toluene yields an empty deleted list
   # and the returned molecule compares equal to the input.
   here = os.path.dirname(os.path.abspath(__file__))
   salts_path = os.sep.join([here, 'test_data', 'witch-salts.sdf'])
   remover = SaltRemover(defnFilename=salts_path, defnFormat=InputFormat.MOL)
   m = Chem.MolFromSmiles('Cc1ccccc1')
   outcome = remover.StripMolWithDeleted(m, dontRemoveEverything=True)
   mol, deleted = outcome
   # List should be empty
   self.assertFalse(deleted)
   self.assertEqual(m, mol)
Ejemplo n.º 6
0
def remove_salts(mol, dictionary=True, *args, **kwargs):
    """Remove salts from a molecule.

    By default every entry of the salts dictionary is tried in turn: a
    ``SaltRemover`` is built for that single entry and its result is kept
    whenever it changes the molecule's SMILES.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        The molecule to be modified.
    dictionary: bool, optional
        True (default): use the salt dictionary.
        False: use the standard StripMol functionality provided by
        rdkit.Chem.SaltRemover.
    *args, **kwargs:
        Only used when ``dictionary`` is false; forwarded unchanged to
        ``SaltRemover`` (for example ``defnData`` to supply custom
        definitions).  Without them ``SaltRemover`` is built with its own
        defaults.

    Returns
    -------
    mol: rdkit.Chem.Mol
        The molecule with salts removed.

    Notes
    -----
    The Salts Dictionary
        The dictionary used is a derived version of the ChEMBL salt
        dictionary, created for the standardiser application by Francis
        Atkinson. The salts are stored as a list of (neutral) SMILES.

    Calling this function sets the RDKit logger level to ERROR.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    if not dictionary:
        return SaltRemover(*args, **kwargs).StripMol(mol)

    salts = _extract_row_from_csv(0)
    salt_names = _extract_row_from_csv(1)

    # SMILES of the current molecule; regenerated only when a salt was
    # actually stripped instead of on every iteration.
    current_smiles = None
    for salt, salt_name in zip(salts, salt_names):
        if current_smiles is None:
            current_smiles = Chem.MolToSmiles(mol)
        stripped_mol = SaltRemover(defnData=salt).StripMol(mol)
        stripped_smiles = Chem.MolToSmiles(stripped_mol)
        if stripped_smiles != current_smiles:
            logging.debug("Following salt was stripped: %s", salt_name)
            mol = stripped_mol
            current_smiles = stripped_smiles

    return mol
Ejemplo n.º 7
0
 def test_withSdfFile(self):
   # Definitions come from an SDF file; for this input only the sodium
   # ion is reported as deleted.
   here = os.path.dirname(os.path.abspath(__file__))
   sdf_path = os.sep.join([here, 'test_data', 'witch-salts.sdf'])
   remover = SaltRemover(defnFilename=sdf_path, defnFormat=InputFormat.MOL)
   self.assertEqual(len(remover.salts), 240)
   query = Chem.MolFromSmiles("Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O.[Na+]")
   outcome = remover.StripMolWithDeleted(query)
   kept_smiles = Chem.MolToSmiles(outcome.mol)
   self.assertEqual(kept_smiles, 'Cc1onc(-c2ccccc2)c1C([O-])=NC1C(=O)N2C1SC(C)(C)C2C(=O)O.O')
   self.assertEqual(len(outcome.deleted), 1)
   first_deleted = outcome.deleted[0]
   self.assertEqual(Chem.MolToSmiles(first_deleted), '[Na+]')
Ejemplo n.º 8
0
 def __init__(self,
              f=None,
              mol_format=None,
              remove_hydrogens=False,
              remove_salts=True,
              compute_2d_coords=True):
     """
     Store the reader options and build a salt remover when requested.

     ``f`` and ``mol_format`` are forwarded to the parent initialiser.
     A warning is issued when salts are to be removed while hydrogens
     are to be kept.
     """
     if remove_salts and not remove_hydrogens:
         warnings.warn('Compounds with salts will have hydrogens removed')
     super(MolReader, self).__init__(f, mol_format)
     self.compute_2d_coords = compute_2d_coords
     self.remove_hydrogens = remove_hydrogens
     self.remove_salts = remove_salts
     # The remover attribute only exists when salt removal is enabled.
     if self.remove_salts:
         self.salt_remover = SaltRemover()
Ejemplo n.º 9
0
def parse_smiles(smiles):
    """Sanity check and normalization for drugs.

    Keeps the first whitespace-separated token of ``smiles``, strips salts
    from it and returns the resulting SMILES.  On any failure the value of
    ``smiles`` at that point is returned instead: the first token if it was
    already extracted, otherwise the untouched input.
    """
    try:
        # Remove salts
        smiles = smiles.split()[0]
        stripped = SaltRemover().StripMol(Chem.MolFromSmiles(smiles))
        return Chem.MolToSmiles(stripped)
    except Exception:
        # Best effort: fall back to whatever ``smiles`` currently holds.
        return smiles
Ejemplo n.º 10
0
 def __init__(self, DataFrame, threshold = None, set_threshold = False, standardise = True, process = True):
     """
     Initialiser.

     Stores the arguments on the instance, creates a multiprocessing pool
     and, when ``standardise`` is true, sets up the standardisation helpers.

     : DataFrame (str/pd.DataFrame): stored as ``self.DataFrame``; also
       stored as ``self.name`` when both ``standardise`` and ``process``
       are true (which of the two types ``open_file`` expects is not
       visible here -- TODO confirm).
     : threshold (int): stored as ``self.threshold``.
     : set_threshold (bool): stored as ``self.set_threshold``.
     : standardise (bool): when true, ``self.standardiser``,
       ``self.salt_remover`` and ``self.accepted_atoms`` are created.
     : process (bool): when true (and ``standardise`` is true),
       ``open_file()`` and ``filter_data()`` are called.

     """
     
     self.threshold = threshold
     self.set_threshold = set_threshold
     self.process = process
     self.standardise = standardise
     self.DataFrame = DataFrame
     
     # path with stored datasets
     self.path = '/projects/../../datasets/'
     # One worker process per CPU reported by mp.cpu_count().
     # NOTE(review): the pool is created unconditionally and is not closed
     # anywhere in this method.
     self.pool = mp.Pool(processes = mp.cpu_count())   
     
     # Everything below is skipped entirely when standardise is false, so
     # the three helper attributes do not exist in that case.
     if self.standardise:
         if self.process:
             self.name = DataFrame
             # Preparing data for preprocessing  
             self.open_file()
             self.filter_data()
         
         self.standardiser = mv.Standardizer()
         self.salt_remover = SaltRemover()
         self.accepted_atoms = ['H','C','N','O','F','S','Cl','Br']
Ejemplo n.º 11
0
def prepSMI(SMIin, defnFilename, removeMetal=1):
    """Standardize a SMILES and reduce it to a single salt-free fragment.

    Parameters
    ----------
    SMIin : str
        Input SMILES.
    defnFilename : str
        Salt definition file for ``SaltRemover``; an empty string selects
        the remover's default definitions.
    removeMetal : int, optional
        When ``1`` (default), fragments for which ``is_metalorion`` does
        not return ``0`` are discarded.

    Returns
    -------
    str
        The cleaned SMILES when exactly one fragment remains, otherwise a
        message starting with ``"Error: "``.
    """
    mol = Chem.MolFromSmiles(SMIin)
    s = Standardizer()

    try:
        molstandardized = s.standardize(mol)
        # The SMILES itself is not needed; generating it here keeps a
        # molecule that cannot be written out on the failure path.
        Chem.MolToSmiles(molstandardized)
    except Exception:
        return "Error: Standardization Fail"

    # remove salt
    # 1. default definitions, or the caller-supplied file
    if defnFilename != "":
        remover = SaltRemover(defnFilename=defnFilename)
    else:
        remover = SaltRemover()
    molclean = remover(molstandardized)
    smilesclean = Chem.MolToSmiles(molclean)

    # 2. split into fragments; identical fragments (several copies of the
    #    same compound or salt) collapse to one, empty pieces are dropped
    fragments = set(smilesclean.split("."))
    fragments.discard("")

    # remove metal
    if removeMetal == 1:
        lelem = [elem for elem in fragments if is_metalorion(elem) == 0]
    else:
        lelem = list(fragments)

    if len(lelem) == 1:
        return str(lelem[0])
    elif len(lelem) > 1:
        return "Error: Mixture or fragment ot check: " + smilesclean
    elif smilesclean == "":
        return "Error: SMILES empty after preparation"
    else:
        return "Error: No identified"
Ejemplo n.º 12
0
 def test_withSmiFile(self):
     # Loading the SMILES definition file must yield 216 salt patterns.
     here = os.path.dirname(os.path.abspath(__file__))
     smi_path = os.sep.join([here, 'test_data', 'c6h6-cdk.smi'])
     remover = SaltRemover(defnFormat=InputFormat.SMILES,
                           defnFilename=smi_path)
     salt_count = len(remover.salts)
     self.assertEqual(salt_count, 216)
Ejemplo n.º 13
0
def SetupSaltRemover():
    """Set up a salt remover.

    Returns None when OptionsInfo["SaltsByComponentsMode"] is set;
    otherwise a SaltRemover configured from the "SaltsFile" and
    "SaltsSMARTS" options with SMARTS as the definition format.
    """
    if OptionsInfo["SaltsByComponentsMode"]:
        return None

    SaltsFile = OptionsInfo["SaltsFile"]
    SaltsSMARTS = OptionsInfo["SaltsSMARTS"]
    return SaltRemover(defnFilename = SaltsFile, defnData = SaltsSMARTS, defnFormat = InputFormat.SMARTS)
Ejemplo n.º 14
0
    def check_salt(self, molecule: str, subType: str) -> str:
        """
            Report whether salt stripping removes anything from
            ``self.smiles_mol``.

            :param molecule: not used by this method.
            :param subType: prefix used to build the returned label.

            :return salt: ``"<subType>_salt"`` when at least one fragment
                was deleted, otherwise ``None``.
        """
        _, removed = SaltRemover().StripMolWithDeleted(self.smiles_mol)

        if len(removed) >= 1:
            return '_'.join([subType, 'salt'])

        return None
Ejemplo n.º 15
0
def NeutraliseCharges_RemoveSalt(smiles, reactions=None):
    """Strip salts from ``smiles`` and neutralise charged groups.

    ``reactions`` is an iterable of ``(reactant, product)`` pairs; when
    omitted, the module-level cache ``_reactions`` is used and filled on
    first use via ``_InitialiseNeutralisationReactions()``.

    Returns ``(new_smiles, True)`` when at least one replacement was made,
    ``(smiles, False)`` when none was, and ``(None, False)`` when the input
    cannot be parsed.
    """
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions = _InitialiseNeutralisationReactions()
        reactions = _reactions

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return (None, False)

    mol, _ = SaltRemover().StripMolWithDeleted(mol)

    replaced = False
    for reactant, product in reactions:
        # Re-apply the same pattern until it no longer matches.
        while mol.HasSubstructMatch(reactant):
            replaced = True
            mol = AllChem.ReplaceSubstructs(mol, reactant, product)[0]

    if not replaced:
        # NOTE(review): the input SMILES is returned unchanged here, so a
        # salt stripped above is not reflected in the result -- confirm
        # that this is intended.
        return (smiles, False)
    return (Chem.MolToSmiles(mol, True), True)
Ejemplo n.º 16
0
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and save the property record of ``structure.molecule``.

    Descriptors are calculated on the molecule parsed from the structure's
    molfile.  ``mw_freebase`` is calculated on the salt-stripped molecule
    and only when that still contains atoms.

    With ``debug`` set, a pydevd trace session is attached first and an
    ``IntegrityError`` from ``save()`` is printed instead of re-raised.
    """
    if debug:
        pydevd.settrace('localhost', port=6901,
                        stdoutToServer=True, stderrToServer=True)

    molecule = structure.molecule
    prop = molecule.compoundProperty
    if not prop:
        prop = CompoundProperties(molecule=molecule)

    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = SaltRemover().StripMol(mol)

    # Properties calculated on the full (unstripped) molecule.
    full_molecule_fields = (
        ('hbd', Descriptors.CalcNumHBD),
        ('hba', Descriptors.CalcNumHBA),
        ('rtb', Descriptors.CalcNumRotatableBonds),
        ('alogp', Crippen.MolLogP),
        ('psa', Descriptors.CalcTPSA),
        ('full_mwt', NewDescriptors.MolWt),
    )
    for field, calculate in full_molecule_fields:
        setattr(prop, field, calculate(mol))

    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if not debug:
            raise e
        print(e.message)
Ejemplo n.º 17
0
 def test_SmilesVsSmarts(self):
   # SMARTS definitions (the default format)
   smarts_remover = SaltRemover(defnData="[Cl,Br]")
   stripped = smarts_remover.StripMol(Chem.MolFromSmiles('CN(Br)Cl.Cl'))
   self.assertEqual(stripped.GetNumAtoms(), 4)
   self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')
   mixture = Chem.MolFromSmiles('CN(C)C.Cl.Br')
   stripped, removed = smarts_remover.StripMolWithDeleted(mixture)
   self.assertEqual(Chem.MolToSmiles(stripped), 'CN(C)C')
   # Because we read in SMARTS, we should output as well. Otherwise, we will have
   # mismatches
   removed_smarts = [Chem.MolToSmarts(m) for m in removed]
   self.assertListEqual(removed_smarts, ['[Cl,Br]'])
   # SMILES definitions
   smiles_remover = SaltRemover(defnData="Cl", defnFormat=InputFormat.SMILES)
   stripped = smiles_remover.StripMol(Chem.MolFromSmiles('CN(Br)Cl.Cl'))
   self.assertEqual(stripped.GetNumAtoms(), 4)
   self.assertEqual(Chem.MolToSmiles(stripped), 'CN(Cl)Br')
Ejemplo n.º 18
0
    # Could allow only H, C, N, O, S, P, F, Cl, Br, I
    for a in fragment.GetAtoms():
        if a.GetAtomicNum() == 6:
            return False
    return True


def contains_nonorg(fragment):
    """Return "Yes" if any atom of ``fragment`` is not H, C, N, O, F, P, S,
    Cl, Br or I, and "No" otherwise (including for a fragment with no atoms).
    """
    # Atomic numbers of the organic elements listed above.
    organic_numbers = (1, 6, 7, 8, 9, 15, 16, 17, 35, 53)
    has_foreign_atom = any(
        atom.GetAtomicNum() not in organic_numbers
        for atom in fragment.GetAtoms())
    return "Yes" if has_foreign_atom else "No"


# Salt remover created once, with SaltRemover's default arguments.
r = SaltRemover()

molecule_column = input_table['Molecule']  # Input from KNIME table
stand_mol_list = []  # result tuples are appended to this list
errs = []
mixture = "No"

for index, input_cell in molecule_column.iteritems(
):  # iterate through molecule list
    mol = input_cell
    if mol is None:
        stand_mol_list.append(
            ("Got empty molecule", index, mol, "No", None, None))
        continue
    try:
        mol = rdMolStandardize.MetalDisconnector().Disconnect(
Ejemplo n.º 19
0
"""Functions that can be used to preprocess SMILES sequences in the form used in the publication."""
import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit import Chem
from rdkit.Chem import Descriptors
# Module-level salt remover, created once with SaltRemover's default arguments.
REMOVER = SaltRemover()
# Atomic numbers of B, C, N, O, F, P, S, Cl, Br and I.
ORGANIC_ATOM_SET = set([5, 6, 7, 8, 9, 15, 16, 17, 35, 53])


def dataframe_to_tfrecord(df,
                          tfrecord_file_name,
                          random_smiles_key=None,
                          canonical_smiles_key=None,
                          inchi_key=None,
                          mol_feature_keys=None,
                          shuffle_first=False):
    """Function to create a tf-record file to train the translation model from a pandas dataframe.
    Args:
        df: Dataframe with the sequence representations of the molecules.
        tfrecord_file_name: Name/Path of the file to write the tf-record file to.
        random_smiles_key: header of the dataframe row which holds the randomized SMILES sequences.
        canonical_smiles_key: header of the dataframe row which holds the canonicalized SMILES
        sequences.
        inchi_key: header of the dataframe row which holds the InChI sequences.
        mol_feature_keys: header of the dataframe row which holds molecular features.
        shuffle_first: Defines if dataframe is shuffled first before writing to tf-record file.
    Returns:
        None
    """
Ejemplo n.º 20
0
import pyrfume
from pyrfume import odorants
from rickpy import ProgressBar
# -

# Load the per-compound property table, indexed by CID.
file_path = os.path.join(pyrfume.DATA, 'all_cids_properties.csv')
df = pd.read_csv(file_path).set_index('CID')

# ## Make 3D optimized versions of the molecules

# +
# Make basic mol objects
mols = {cid: Chem.MolFromSmiles(smi) for cid, smi in df['IsomericSMILES'].items()}

# Then optimize them
s = SaltRemover()
p = ProgressBar(len(df))
# Values of `mols` are replaced while iterating; the set of keys never
# changes, so the iteration itself stays valid.
for i, (cid, mol) in enumerate(mols.items()):
    p.animate(i, status=cid)
    try:
        # Name each molecule "<cid>: <isomeric SMILES>".
        mol.SetProp("_Name","%d: %s" % (cid, df.loc[cid, 'IsomericSMILES']))
        # Strip salts but never strip the molecule down to nothing.
        mol = s.StripMol(mol, dontRemoveEverything=True)
        mol = Chem.AddHs(mol)
        AllChem.Compute2DCoords(mol)
        AllChem.EmbedMolecule(mol)
        AllChem.UFFOptimizeMolecule(mol) # Is this deterministic?  
    except Exception as e:
        # Any failure in the steps above (for example `mol` being None) is
        # logged and the entry is stored as None.
        p.log('Exception for %d: %s' % (cid, e))
        mols[cid] = None
    else:
        mols[cid] = mol
Ejemplo n.º 21
0
        print('\t\t',dataset_smiles_y.iloc[11]['SMILES'],'\n')
else:
    print('\tCongratulations, your dataset has not incorrect smiles.')
    

##############################################################################
############################ STEP 2: salt elimination ########################
##############################################################################

print('[+] Eliminating salts ')

# The salt definition does not change between molecules, so the remover is
# built once instead of once per SMILES.
remover = SaltRemover(defnData='[Na,Cl,K,O,OH,Fe,F,H,Al,Mg,Co,Ti,NH4,Mn,Si,Ca,Au,I,Hg,Mo,Zn,Br,Ag,Sr,Cu,Bi,S,Li,NH3,He,Y,Ar,Ba,La]')

withoutsalts = []

for smi in df_clean_by_sanit['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    mol = remover.StripMol(mol)
    smiles_new = Chem.MolToSmiles(mol)
    # Textually drop any '[H+]' fragment still present in the SMILES.
    # NOTE(review): the original comment attributed this step to the salt
    # remover not eliminating water, but the code only targets '[H+]' --
    # confirm the intent.
    smiles_new = smiles_new.replace('.[H+]', '').replace('[H+].', '')
    withoutsalts.append(smiles_new)


# Add the salt-free SMILES as a new column at position 2.
df_clean_by_sanit.insert(2,'W/O SALTS',withoutsalts)


prompt = []

for smile_with, smile_without in zip(df_clean_by_sanit['SAN_SMILES'],df_clean_by_sanit['W/O SALTS']):
    if smile_with != smile_without:
Ejemplo n.º 22
0
def main(argv=sys.argv):
    """Strip counter-ions from the molecules of an SDF file.

    ``argv`` has the shape of ``sys.argv``: program name, options, then the
    input and output ``.sdf`` paths.  With fewer than three entries the
    usage text is printed and the process exits with status 1.

    Options (see the usage text): ``--strip``, ``--strip-sdf file``,
    ``--strip-smarts file``, ``--filter-invalid``, ``--addh``, ``--make3d``.

    When ``--strip`` is given, salts are removed with a ``SaltRemover``
    built from the fragments supplied through ``--strip-sdf`` and/or
    ``--strip-smarts``; if neither supplies any definition, the remover's
    default definitions are used.
    """
    valid_elements = ['H', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'Br', 'I']
    valid_atomic_num = [1, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53]

    if len(argv) < 3:
        print("""
OBJ
  to strip self-defined counterions

Usage:
  %s [options] input output

[options]
  --strip            : if given, to run stripping salts/solvents
  --strip-sdf    file: specify the mols/fragments to be removed
  --strip-smarts file: one SMARTS string per line
  --filter-invalid   : if given, to remove molecules containing
                       R group or elements other than
                       %s
  --addh             : if given, to add hydrogens
  --make3d           : if given, 3D coordinates will be generated

Attention
  1. rdkit.Chem.SaltRemover.SaltRemover is called
  2. if neither `--strip-sdf` nor `--strip-smarts` is provided,
     stripping salts will be done according to default salts defined
     in `RDConfig.RDDataDir/Salts.txt`
  3. both `input` and `output` are .sdf
  4. whenever `--make3d` is given, please make sure that
     there is no complex or salts/solvents can be stripped.
     Otherwise, maybe there is something wrong with optimized structure.
""" % (argv[0], str(valid_elements)))
        sys.exit(1)

    options, args = getopt(argv[1:], '', [
        'strip-sdf=', 'strip-smarts=', 'strip', 'filter-invalid', 'addh',
        'make3d'
    ])
    filter_invalid = False
    strip = False
    addh = False
    make3d = False
    strip_sdf = None
    strip_smarts = None
    for opt, val in options:
        if opt == '--strip':
            strip = True
        elif opt == "--strip-sdf":
            strip_sdf = val
        elif opt == '--strip-smarts':
            strip_smarts = val
        elif opt == '--filter-invalid':
            filter_invalid = True
        elif opt == '--addh':
            addh = True
        elif opt == '--make3d':
            make3d = True
        else:
            print("Error: invalid option %s" % opt)
            sys.exit(1)

    # Explicit check instead of ``assert``, which is stripped under -O.
    if len(args) != 2:
        print("Error: expected exactly two arguments (input output), got %d"
              % len(args))
        sys.exit(1)
    infile, outfile = args

    # Collect the user-defined fragment definitions, one SMARTS per line.
    smarts_parts = []
    if strip_sdf is not None:
        print("To load fragments from %s" % strip_sdf)
        for count, m in enumerate(Chem.SDMolSupplier(strip_sdf), 1):
            if m is None:
                print("Warning: failed to read %dth molecule in %s" % (
                    count, strip_sdf))
                continue
            smarts_parts.append(Chem.MolToSmarts(m) + "\n")
    if strip_smarts is not None:
        print("to load fragments from %s" % strip_smarts)
        with open(strip_smarts, 'r') as handle:
            smarts_parts.extend(handle)
    smarts = "".join(smarts_parts)

    if strip:
        # Bug fix: the original test was inverted, so user-supplied
        # definitions were never handed to the remover.
        if smarts:
            remover = SaltRemover(defnData=smarts)
        else:
            remover = SaltRemover()
    else:
        remover = None

    inf = Chem.SDMolSupplier(infile)
    outf = Chem.SDWriter(outfile)
    try:
        for count, m in enumerate(inf, 1):
            if m is None:
                print("Warning: failed to load %dth molecule from %s" % (
                    count, infile))
                continue
            if filter_invalid and any(
                    a.GetAtomicNum() not in valid_atomic_num
                    for a in m.GetAtoms()):
                continue
            if strip:
                m = remover.StripMol(m)
                if m.HasProp("_Name"):
                    name = m.GetProp("_Name")
                else:
                    name = "%s_%d" % (infile, count)
                if num_components(m) > 1:
                    print("Warning: %s still has more than one components!"
                          % name)
            if addh:
                m = AllChem.AddHs(m)
            if make3d:
                AllChem.EmbedMolecule(m)
                AllChem.UFFOptimizeMolecule(m)
            outf.write(m)
    finally:
        # Close the writer even when a molecule fails midway.
        outf.close()
Ejemplo n.º 23
0
def main(filename,remove,select,identity,verbose,check,cpu):

    """ remove or select molecules by using predefined or custom filters

    Molecules are read from ``filename`` (SMILES: .smi/.smiles/.ism, or
    SDF: .sdf/.sd), salt-stripped, and evaluated against every filter entry
    with a pool of ``cpu`` worker processes.  Molecules without any
    rejecting vote are written to ``<prefix>-selected.<ext>``; rejection
    reasons are written to ``<prefix>-rejected.csv``.

    ``remove`` / ``select`` are passed to ``getxmls``.  ``identity`` names
    the property used as molecule id; when empty, ``_Name`` or
    ``guessIdentity`` is used.  ``verbose`` prints each rejection reason.
    ``check`` only runs ``checkPredefined()`` and exits with status 0.
    """

    if check:
        checkPredefined()
        sys.exit(0)

    select_xml= getxmls(select)
    remove_xml= getxmls(remove)
    showFilters(select_xml,remove_xml)
    FS = readFilters(select_xml,remove_xml)

    b= os.path.basename(filename)
    prefix= b.split(".")[0]
    ext= b.split(".")[-1].lower()

    if ext in ("smi", "smiles", "ism"):
        mols = Chem.SmilesMolSupplier(filename,titleLine=False)
        outfile = prefix+"-selected.smi"
        outWriter = Chem.SmilesWriter(outfile,includeHeader=False,delimiter=' ')
    elif ext in ("sdf", "sd"):
        mols = Chem.SDMolSupplier(filename)
        outfile = prefix+"-selected.sdf"
        outWriter = Chem.SDWriter(outfile)
    else:
        # Previously this fell through and failed later with a NameError.
        print("Error: unsupported file extension '%s'" % ext)
        sys.exit(9)

    logfile = prefix+"-rejected.csv"
    f = open(logfile,"w")
    p = Pool(cpu)

    try:
        logWriter= csv.writer(f,delimiter=",",quotechar='"')

        # test molId with the first record
        if identity:
            try:
                Id =  mols[0].GetProp(identity)
                assert Id
            except:
                print("Error: cannot define Id by given 'identity'")
                sys.exit(9)
        else:
            if mols[0].GetProp("_Name"):
                identity= "_Name"
            else:
                identity= guessIdentity(mols)

        # The remover carries no per-molecule state: build it once.
        salt_remover = SaltRemover()

        num = 0            # molecules processed (non-None records)
        num_records= 0     # records read, including unreadable ones
        num_remove = 0
        num_select = 0
        seen_ids = set()   # ids already used, for O(1) duplicate checks
        for m in mols :
            num_records += 1
            if m is None : continue
            Id = m.GetProp(identity)
            if (not Id) or (Id in seen_ids):
                Id = default_molId_prefix+"%04d" % num
            m.SetProp("_Name",Id)
            m = salt_remover.StripMol(m, dontRemoveEverything=True)
            workload = [(m,Id,action,filtername,grp,smarts,lb,ub)
                        for filtername,action,entries in FS
                        for grp,smarts,lb,ub in entries]

            results = list(p.map(worker, workload))

            # Every result that is not a vote to keep the molecule is a
            # reason to reject it.
            reasons= []
            for res in results:
                flag,action,molId,vstr,filtername,entryname,lb,ub = res
                keep = (flag == True  and action == "select") or \
                       (flag == False and action == "remove")
                if not keep:
                    reasons.append(res)

            # verdict
            if reasons:
                num_remove += 1
                for res in reasons:
                    logWriter.writerow(res) # rejected
                    if verbose: print("  %-10s %s %-30s %10s [%s..%s]" % res[2:])
            else:
                num_select += 1
                outWriter.write(m) # passed

            seen_ids.add(Id)
            num += 1
    finally:
        # Release the worker pool and flush/close both outputs, also when
        # processing stops early.
        p.close()
        p.join()
        outWriter.close()
        f.close()

    num_done = num
    print("%d/%d done     <-- %s" % (num_done,num_records,filename))
    print("%d/%d selected --> %s" % (num_select,num_done,outfile))
    print("%d/%d rejected --> %s" % (num_remove,num_done,logfile))
Ejemplo n.º 24
0
class MolReader(MolIO):
    """
    Read molecules from files and file-like objects. Supports SDF, SMILES,
    and RDKit binary format (via pickle).

    Parameters
    ----------
    f : file, optional
        File-like object.
    mol_format : str, optional
        Molecule file format. Currently supports 'sdf', 'smi', and 'pkl'.
    remove_hydrogens : bool, optional (default False)
        Remove hydrogens from molecules.
    remove_salts : bool, optional (default True)
        Remove salts from molecules. Note that this will remove any hydrogens
        present on the molecule.
    compute_2d_coords : bool, optional (default True)
        Compute 2D coordinates when reading SMILES. If molecules are written to
        SDF without 2D coordinates, stereochemistry information will be lost.
    """
    def __init__(self,
                 f=None,
                 mol_format=None,
                 remove_hydrogens=False,
                 remove_salts=True,
                 compute_2d_coords=True):
        if not remove_hydrogens and remove_salts:
            warnings.warn('Compounds with salts will have hydrogens removed')
        super(MolReader, self).__init__(f, mol_format)
        self.remove_hydrogens = remove_hydrogens
        self.remove_salts = remove_salts
        # The remover is only created when salt removal is enabled.
        if remove_salts:
            self.salt_remover = SaltRemover()
        self.compute_2d_coords = compute_2d_coords

    def __iter__(self):
        """
        Iterate over molecules.
        """
        return self.get_mols()

    def get_mols(self):
        """
        Read molecules from a file-like object.

        Molecule conformers are grouped into a single molecule. Two
        molecules are considered conformers of the same molecule if they:
        * Are contiguous in the file
        * Have identical (canonical isomeric) SMILES strings
        * Have identical compound names (if set)

        Returns
        -------
        A generator yielding (possibly multi-conformer) RDKit Mol objects.
        """
        parent = None
        for mol in self._get_mols():
            if parent is None:
                parent = mol
                continue
            if self.are_same_molecule(parent, mol):
                if mol.GetNumConformers():
                    for conf in mol.GetConformers():
                        parent.AddConformer(conf, assignId=True)
                else:
                    continue  # skip duplicate molecules without conformers
            else:
                # A different molecule starts: emit the finished parent.
                parent = self.clean_mol(parent)
                if parent is not None:
                    yield parent
                parent = mol
        # Emit the last molecule, which has no successor to trigger it.
        if parent is not None:
            parent = self.clean_mol(parent)
            if parent is not None:
                yield parent

    def _get_mols(self):
        """
        Read molecules from a file-like object.

        This method returns individual conformers from a file and does not
        attempt to combine them into multiconformer Mol objects.

        Returns
        -------
        A generator yielding RDKit Mol objects.

        Raises
        ------
        NotImplementedError
            If ``self.mol_format`` is not 'sdf', 'smi' or 'pkl'.
        """
        if self.mol_format == 'sdf':
            mols = self._get_mols_from_sdf()
        elif self.mol_format == 'smi':
            mols = self._get_mols_from_smiles()
        elif self.mol_format == 'pkl':
            mols = self._get_mols_from_pickle()
        else:
            raise NotImplementedError('Unrecognized molecule format ' +
                                      '"{}"'.format(self.mol_format))

        # skip read errors
        while True:
            try:
                # Built-in next() works on Python 2 and 3, unlike the
                # Python-2-only ``.next()`` method.
                mol = next(mols)
            except StopIteration:
                break
            except Exception:
                warnings.warn('Skipping molecule.')
                continue
            else:
                if mol is not None:
                    yield mol

    def _get_mols_from_sdf(self):
        """
        Read SDF molecules from a file-like object.
        """
        supplier = Chem.ForwardSDMolSupplier(self.f,
                                             removeHs=self.remove_hydrogens)
        for mol in supplier:
            yield mol

    def _get_mols_from_smiles(self):
        """
        Read SMILES molecules from a file-like object.

        Each non-empty line holds a SMILES string, optionally followed by
        whitespace and a name; everything after the first whitespace run
        is taken as the name.
        """
        for line in self.f.readlines():
            line = line.strip()
            if not line:
                continue
            # Split only once: a line with more than two tokens used to
            # raise while unpacking, which ended this generator and
            # silently dropped every remaining line of the file.
            split_line = line.split(None, 1)
            if len(split_line) > 1:
                smiles, name = split_line
            else:
                smiles, = split_line
                name = None

            # hydrogens are removed by default, which triggers sanitization
            try:
                if self.remove_hydrogens:
                    mol = Chem.MolFromSmiles(smiles)
                else:
                    mol = Chem.MolFromSmiles(smiles, sanitize=False)
                    Chem.SanitizeMol(mol)

                if self.compute_2d_coords:
                    AllChem.Compute2DCoords(mol)
            except Exception:
                warnings.warn('Skipping ' + line)
                continue
            else:
                if name is not None:
                    mol.SetProp('_Name', name)
                yield mol

    def _get_mols_from_pickle(self):
        """
        Read pickled molecules from a file-like object.

        Files that contain multiple pickles are supported by repeated calls
        to load.
        """
        # NOTE(security): unpickling can execute arbitrary code; only read
        # pickle files from trusted sources.
        while True:
            try:
                mols = cPickle.load(self.f)
                for mol in np.atleast_1d(mols):
                    yield mol
            except EOFError:
                break

    def are_same_molecule(self, a, b):
        """
        Test whether two molecules are conformers of the same molecule.

        Test for:
        * Identical (canonical isomeric) SMILES strings
        * Identical compound names (if set)

        Parameters
        ----------
        a, b : RDKit Mol
            Molecules to compare.
        """

        # get names, if available
        a_name = self._get_name(a)
        b_name = self._get_name(b)

        # get canonical isomeric SMILES
        a_smiles = self._get_isomeric_smiles(a)
        b_smiles = self._get_isomeric_smiles(b)
        assert a_smiles and b_smiles

        # test for same molecule
        return a_smiles == b_smiles and a_name == b_name

    def _get_name(self, mol):
        """
        Get molecule name, if available.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('_Name'):
            return mol.GetProp('_Name')
        else:
            return None

    def _get_isomeric_smiles(self, mol):
        """
        Get canonical isomeric SMILES for a molecule. Also sets the
        isomericSmiles property to avoid recomputing.

        Note that stereochemistry is not assigned from 3D coordinates; it
        must be explicitly present in the file or it will not show up in
        the SMILES conversion.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('isomericSmiles'):
            return mol.GetProp('isomericSmiles')
        else:
            smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            mol.SetProp('isomericSmiles', smiles, computed=True)
            return smiles

    def clean_mol(self, mol):
        """
        Clean a molecule.

        When salt removal is enabled, hydrogens are removed and salts are
        stripped; the stripped molecule replaces the input only if it still
        has atoms and differs from the hydrogen-free input.  Returns
        ``None`` (after a warning) when hydrogens cannot be removed.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if self.remove_salts:
            # hydrogens must be removed for pattern matching to work properly
            try:
                mol_no_h = Chem.RemoveHs(mol)
            except ValueError:
                if mol.HasProp('_Name'):
                    name = mol.GetProp('_Name')
                else:
                    name = Chem.MolToSmiles(mol,
                                            isomericSmiles=True,
                                            canonical=True)
                warnings.warn('Skipping ' + name)
                return None
            new = self.salt_remover.StripMol(mol_no_h)
            # only keep if it is valid (# the molecule may _be_ a salt) and has
            # actually been changed
            if new.GetNumAtoms() and mol_no_h.ToBinary() != new.ToBinary():
                mol = new
        return mol
Ejemplo n.º 25
0
    df["STND_SMILES"].values)) == len(df), "Standardize Smiles reduced"

# Positions of the rows whose STND_SMILES is the placeholder "-".
non_cases = np.squeeze(np.argwhere((df["STND_SMILES"] == "-").values))

# Drop those rows and renumber the index from 0.
df = df.drop(non_cases.tolist())
df = df.reset_index(drop=True)

# Show every row whose STND_SMILES occurs more than once (keep=False marks
# all members of a duplicate group).
print(df[df.duplicated("STND_SMILES",
                       False)][["Result", "STND_SMILES"]])  # No Contradict

# With the pandas default keep='first', the first row of each duplicate
# group is kept and the later ones are dropped.
df = df.drop_duplicates("STND_SMILES")  # Drop first duplicated ones
df = df.reset_index(drop=True)

# 4. Remove salts
df = df.reset_index(drop=True)
remover = SaltRemover()
base_dir = "pic_salt"
res_smi_list = []
res_stnd_smi_list = []
for i in range(len(df)):
    smi, stnd_smi = df.loc[i][["SMILES", "STND_SMILES"]]

    # Per-row output directory path; its creation is commented out below.
    current_dir = os.path.join(base_dir, str(i))
    #os.mkdir(current_dir)

    try:
        mol = MolFromSmiles(smi)
        #MolToImage(mol).save(os.path.join(current_dir,"smi.jpeg"))
    except Exception as e:
        # Report the row index together with the error and carry on.
        print(i, e)
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover

# Module-level SaltRemover, constructed with no arguments and reused by
# standardize_smiles() on every call.
remover = SaltRemover()


def standardize_smiles(smiles: str) -> str:
    """Return the SMILES of *smiles* after marker removal and salt stripping.

    The backslash, slash and at-sign characters are deleted from the input
    text, the result is parsed with RDKit, salts are stripped with the
    module-level ``remover`` (``dontRemoveEverything=True``), and the
    remaining molecule is written back out as SMILES.
    """
    # One pass that deletes the same three characters the chained
    # replace() calls would remove.
    flattened = smiles.translate(str.maketrans('', '', '\\/@'))
    parsed = Chem.MolFromSmiles(flattened)
    desalted = remover.StripMol(parsed, dontRemoveEverything=True)
    return Chem.MolToSmiles(desalted)

Ejemplo n.º 27
0
    def prepareChem(self, prSMIclean):
        """
        Load, or compute and store, the cleaned SMILES for this chemical.

        The cleaned SMILES lives in the file ``prSMIclean + self.name + ".smi"``.
        If that file exists, its first line is read back into ``self.smiclean``
        and parsed into ``self.mol``. Otherwise ``self.smi`` is normalized,
        stripped of salts (default ``Salts.txt`` definitions, then ``LSALT``),
        filtered against ``LSMILESREMOVE``, and -- when exactly one fragment
        remains -- written to the file and stored in ``self.smiclean`` /
        ``self.psmiclean``. Every step appends a message to ``self.log``.

        Parameters
        ----------
        prSMIclean : str
            Prefix (typically a directory path ending with a separator) that
            is concatenated with ``self.name`` and ``".smi"``.

        Returns
        -------
        None
            Results are reported through attributes and ``self.log``.
        """
        psmiclean = prSMIclean + self.name + ".smi"

        # Reuse a previously prepared SMILES when one is already on disk.
        if path.exists(psmiclean):
            with open(psmiclean, "r") as fsmiclean:
                smiclean = fsmiclean.readlines()[0].strip()

            self.smiclean = smiclean
            self.mol = Chem.MolFromSmiles(smiclean)
            self.log += "Prep SMI :" + str(self.smi) + "\n"
            self.log += "Prepared SMI :" + str(self.smiclean) + "\n"
            return

        mol = Chem.MolFromSmiles(self.smi)

        # Normalization can fail either by raising or by returning the
        # "ERROR" marker from toolbox.timeFunction; both stop the preparation.
        try:
            out = toolbox.timeFunction(normalize, mol)
        except Exception:
            self.log += "Normalize SMILES: ERROR INPUT SMI\n"
            return
        if out == "ERROR":
            self.log += "Normalize SMILES: ERROR DURING THE PROCESS\n"
            return

        smilestandadized = Chem.MolToSmiles(out)

        # remove salt
        # 1. default definitions
        remover = SaltRemover(defnFilename="Salts.txt")
        mol = Chem.MolFromSmiles(smilestandadized)
        molcleandefault = remover(mol)
        # 2. personal remover
        homeremover = SaltRemover(defnData=LSALT)
        molclean = homeremover(molcleandefault)
        smilesclean = Chem.MolToSmiles(molclean)

        # 3. SMILES-level cleanup: drop manually listed salts and empty
        # pieces; identical fragments collapse to one through the set.
        lelem = smilesclean.split(".")
        if len(lelem) > 1:
            lelem = [
                frag for frag in set(lelem)
                if frag and frag not in LSMILESREMOVE
            ]
            if len(lelem) == 1:
                smilesclean = str(lelem[0])
            else:
                # 4. Several (or zero) fragments left -> record in the log so
                # the case can be checked by hand afterwards.
                self.log += ("Fragments after standardization: "
                             + smilesclean + "\n")
                smilesclean = ""

        if smilesclean == "":
            self.log += "ERROR SMILES: SMILES empty after preparation\n"
            return

        self.log += "Prepared SMI :" + str(smilesclean) + "\n"

        with open(psmiclean, "w") as fsmiclean:
            fsmiclean.write(smilesclean)

        # NOTE(review): unlike the cached branch above, this branch does not
        # set self.mol -- confirm whether callers expect it here.
        self.smiclean = smilesclean
        self.psmiclean = psmiclean
Ejemplo n.º 28
0
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdmolops import RemoveStereochemistry
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.Descriptors import HeavyAtomMolWt
from rdkit.Chem.SaltRemover import SaltRemover


# Salt definition used for stripping. Putting "O" in here will unify hydrates,
# like morphine and morphine monohydrate (called morphine by mesh!)
remover = SaltRemover(defnData='[Cl,Br,K,I,Na,O]')

chems = []
n = 0  # number of input lines read so far
# Read tab-separated records from smiles.txt; simpler.txt is opened for output.
with open('smiles.txt','r') as inf, open('simpler.txt','w') as outf:
    for line in inf:
        n+=1
        # Progress report every 10000 lines.
        if n % 10000 == 0:
            print(n)
        x = line.strip().split('\t')
        cid = x[0]  # first column is the compound identifier
        #this set of identifiers causes rdkit to segfault :(
        # given the number of things in the list, a better strategy than run it till it dies, and try
        # again is probably advisable
        #if cid in ['CHEBI:17627', 'CHEBI:50385','CHEBI:18140','CHEBI:38277','CHEBI:50162',
        #           'CHEBI:29297','CHEBI:29293','CHEBI:133488','CHEBI:30158','CHEBI:51220',
        ##           'CHEBI:30470','CHEBI:36301','CHEBI:38284','CHEBI:48998','CHEBI:37189',
        #           'CHEBI:60532','CHEBI:51221','CHEBI:29416', 'CHEBI:36163','CHEBI:29296',
        #           'CHEBI:51508','CHEBI:30665','CHEBI:29886','CHEBI:85715','CHEBI:49851',
        #           'CHEBI:30197','CHEBI:30125','CHEBI:37856','CHEBI:38283','CHEBI:10098',
        #           'CHEBI:132769','CHEBI:133489','CHEBI:134067','CHEBI:141330','CHEBI:15432',
Ejemplo n.º 29
0
def remove_water(m):
    """Return *m* after stripping fragments that match the ``[O]`` pattern.

    A fresh ``SaltRemover`` configured with ``defnData="[O]"`` is built on
    each call and its ``StripMol`` result is returned.
    """
    from rdkit.Chem.SaltRemover import SaltRemover

    return SaltRemover(defnData="[O]").StripMol(m)
Ejemplo n.º 30
0
class DrugNameConverter:
    """
    Convert drug names to InChI keys.

    Drug names are resolved to SMILES through ``PubChemQuery`` and the SMILES
    are turned into InChI keys with RDKit. Options exist for using isomeric
    forms and for additionally computing keys with salts removed.
    """
    # Shared salt remover used by to_inchi_keys when strip_salts is true.
    remover = SaltRemover()

    @classmethod
    def to_inchi_keys(cls, name, isomeric=True, strip_salts=True):
        """
        Query PubChem for a drug name and return its InChI keys.

        Parameters:
            name (str):         name of drug

        Keyword arguments:
            isomeric (bool):    if True, returns InChI Keys computed from isomeric SMILES
                                otherwise, returns InChI Keys computed from canonical SMILES
            strip_salts (bool): if True, computes InChI Keys using both the original drug SMILES
                                and also the SMILES where all salts are removed

        Returns:
            inchi_keys (set): set of InChI Keys corresponding to the drug name queried;
                              SMILES that RDKit cannot parse are skipped with a warning
        """
        # Local import so the class does not depend on a module-level import
        # that may not exist in this file.
        import warnings

        inchi_keys = set()
        for smiles in PubChemQuery.name_to_smiles(name, isomeric=isomeric):
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles signals a parse failure by returning None; passing
            # that on would raise and abort a whole batch for one bad record.
            if mol is None:
                warnings.warn(
                    f'Could not parse SMILES {smiles!r} for drug {name!r}; '
                    'skipping')
                continue
            inchi_keys.add(Chem.MolToInchiKey(mol))
            if strip_salts:
                stripped_mol = cls.remover.StripMol(mol,
                                                    dontRemoveEverything=True)
                inchi_keys.add(Chem.MolToInchiKey(stripped_mol))
        return inchi_keys

    @classmethod
    def batch_to_inchi_keys_single_thread(cls, names, verbose=0, **kwargs):
        """
        Query PubChem for each drug name in turn and map names to InChI keys.

        Parameters:
            names (list or set):    drug names to query (duplicates are collapsed)

        Keyword arguments:
            verbose (bool or int):  print progress if truthy
            **kwargs:               keyword arguments passed to cls.to_inchi_keys

        Returns:
            all_inchi_keys (dict):  dictionary mapping each drug name to a set of corresponding InChI Keys
        """
        all_inchi_keys = {}
        names = set(names)
        for name in names:
            all_inchi_keys[name] = cls.to_inchi_keys(name, **kwargs)

            if verbose:
                # '\r' keeps the progress counter on a single console line.
                print(
                    f'Completed { len(all_inchi_keys) }/{ len(names) } drugs...',
                    end='\r')
        return all_inchi_keys

    @classmethod
    def batch_to_inchi_keys(cls, names, num_cores=3, verbose=1, **kwargs):
        """
        Query PubChem for many drug names in parallel and map names to InChI keys.

        The work is distributed over a ``Pool`` of ``num_cores`` workers.

        Parameters:
            names (list or set):    drug names to query (duplicates are collapsed)

        Keyword arguments:
            num_cores (int/None):   number of workers to use; if None, uses CPU count (at least 1 and at most 12)
            verbose (bool or int):  show status bar if truthy
            **kwargs:               keyword arguments passed to cls.to_inchi_keys

        Returns:
            all_inchi_keys (dict):  dictionary mapping each drug name to a set of corresponding InChI Keys
        """
        requests.get(
            'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
        )  # initial request necessary before pooling (gives status code 400)
        if num_cores is None:
            num_cores = min(max(mp.cpu_count(), 1),
                            12)  # uses at least 1 core and at most 12
        names = list(set(names))
        worker = partial(cls.to_inchi_keys, **kwargs)
        with Pool(num_cores) as p:
            if verbose:
                # imap yields results in input order, so tqdm can track them.
                res = list(tqdm(p.imap(worker, names), total=len(names)))
            else:
                res = p.map(worker, names)
        return dict(zip(names, res))

    @staticmethod
    def invert_dict(key_to_value_set: dict) -> dict:
        """
        Invert a mapping of keys to sets of values.

        Converts a dictionary with keys mapping to sets of values (e.g. drug name to set of InChI keys)
        into a dictionary with the values as keys, mapping to sets of the former keys (e.g. InChI key to drug names).
        """
        assert (isinstance(key_to_value_set, dict))
        value_to_key_set = {}
        for key, values in key_to_value_set.items():
            assert (isinstance(values, set))
            for value in values:
                value_to_key_set.setdefault(value, set()).add(key)
        return value_to_key_set