Exemple #1
0
 def test_withSmiles(self):
     # Two newline-separated definitions, declared as SMILES, must be
     # parsed into two salt patterns.
     salt_definitions = "[Na+]\nCC(=O)O"
     stripper = SaltRemover(defnData=salt_definitions,
                            defnFormat=InputFormat.SMILES)
     self.assertEqual(len(stripper.salts), 2)

     # Both fragments of the input match a definition, so stripping is
     # expected to leave no atoms behind.
     sodium_acetate = Chem.MolFromSmiles('CC(=O)O.[Na+]')
     stripped = stripper.StripMol(sodium_acetate)
     self.assertEqual(stripped.GetNumAtoms(), 0)
Exemple #2
0
def generateMoleculeHierarchyTask(structure, debug=False):
    """Create or update the molecule-hierarchy record for a structure.

    The structure's molfile is parsed and passed through a default
    ``SaltRemover``.  When stripping leaves the atom count unchanged the
    molecule is recorded as its own parent; otherwise the parent is resolved
    from the stripped base via ``getParentMolregnoFromBase``.  The active
    molecule is always set to the parent before the record is saved.

    Parameters
    ----------
    structure
        Object exposing ``molecule`` and ``molfile`` attributes.
    debug : bool, optional
        When true, attach to a pydevd server on localhost:6901 and print an
        ``IntegrityError`` raised by ``save()`` instead of propagating it.

    Raises
    ------
    IntegrityError
        If saving the hierarchy fails and ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    # Reuse the existing hierarchy record when there is one.
    if not molecule.moleculeHierarchy:
        hierarchy = MoleculeHierarchy(molecule=molecule)
    else:
        hierarchy = molecule.moleculeHierarchy

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)

    # Same atom count before and after stripping means nothing was removed.
    if mol.GetNumAtoms() == base.GetNumAtoms():
        hierarchy.parent_molecule = molecule
    else:
        hierarchy.parent_molecule = getParentMolregnoFromBase(
            MolToMolBlock(base))

    hierarchy.active_molecule = hierarchy.parent_molecule

    try:
        hierarchy.save()
    except IntegrityError as e:
        if not debug:
            # Bare ``raise`` re-raises with the original traceback intact.
            raise
        # Single-argument print() behaves the same on Python 2 and 3, and
        # printing the exception itself avoids the Python-2-only ``message``
        # attribute.
        print(e)
Exemple #3
0
def remove_salts(mol, dictionary=True, *args, **kwargs):
    """Removes salts from a molecule.

    This function removes detected salts following a salts dictionary by
    default.

    Parameters
    ----------
    mol: rdkit.Chem.Mol
        The molecule to be modified.
    dictionary: bool, optional
        True (default): Activates the use of the salt dictionary. Each
        dictionary entry is applied in turn, and every entry that changes
        the molecule's SMILES is reported with ``logging.debug``.
        False: Uses the standard StripMol functionality, provided by
        rdkit.Chem.SaltRemover.
    *args, **kwargs
        Only used when ``dictionary`` is False: forwarded unchanged to the
        ``SaltRemover`` constructor (e.g. ``defnData`` to supply custom
        salt definitions). Without them ``SaltRemover`` is built with no
        arguments.

    Returns
    -------
    mol: rdkit.Chem.Mol
        A new molecule with salts removed.

    Notes
    -----
    The Salts Dictionary
        The dictionary used is a derived version from the ChEMBL salt
        dictionary, created for the standardiser application by Francis
        Atkinson. The salts are stored as list of (neutral) SMILES.

    The RDKit logger level is set to ERROR by this function and is not
    restored afterwards.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.ERROR)

    if not dictionary:
        return SaltRemover(*args, **kwargs).StripMol(mol)

    salts = _extract_row_from_csv(0)
    salt_names = _extract_row_from_csv(1)

    # SMILES of the molecule as it currently stands; refreshed only when a
    # salt is actually stripped, instead of being recomputed per entry.
    current_smiles = Chem.MolToSmiles(mol)
    for salt, salt_name in zip(salts, salt_names):
        stripped_mol = SaltRemover(defnData=salt).StripMol(mol)
        stripped_smiles = Chem.MolToSmiles(stripped_mol)
        if stripped_smiles != current_smiles:
            logging.debug("Following salt was stripped: %s", salt_name)
            mol = stripped_mol
            current_smiles = stripped_smiles

    return mol
Exemple #4
0
 def test_SmilesVsSmarts(self):
   # --- definition given without a format: treated as SMARTS ---
   smarts_stripper = SaltRemover(defnData="[Cl,Br]")

   # Only the free Cl fragment goes; the bonded halogens stay.
   mixed = Chem.MolFromSmiles('CN(Br)Cl.Cl')
   kept = smarts_stripper.StripMol(mixed)
   self.assertEqual(kept.GetNumAtoms(), 4)
   self.assertEqual(Chem.MolToSmiles(kept), 'CN(Cl)Br')

   two_halides = Chem.MolFromSmiles('CN(C)C.Cl.Br')
   kept, removed = smarts_stripper.StripMolWithDeleted(two_halides)
   self.assertEqual(Chem.MolToSmiles(kept), 'CN(C)C')
   # The definition was read as SMARTS, so the deleted patterns are written
   # back out as SMARTS too; any other output format would not match.
   removed_smarts = list(map(Chem.MolToSmarts, removed))
   self.assertListEqual(removed_smarts, ['[Cl,Br]'])

   # --- definition explicitly declared as SMILES ---
   smiles_stripper = SaltRemover(defnData="Cl", defnFormat=InputFormat.SMILES)
   mixed = Chem.MolFromSmiles('CN(Br)Cl.Cl')
   kept = smiles_stripper.StripMol(mixed)
   self.assertEqual(kept.GetNumAtoms(), 4)
   self.assertEqual(Chem.MolToSmiles(kept), 'CN(Cl)Br')
Exemple #5
0
def parse_smiles(smiles):
    """Best-effort salt stripping and re-serialisation of a drug SMILES.

    Only the first whitespace-separated token of ``smiles`` is used. It is
    parsed, passed through a default ``SaltRemover`` and written back out
    as SMILES. If any step raises, the value of ``smiles`` at that point is
    returned instead: the first token when the split succeeded, otherwise
    the untouched input.
    """
    try:
        # Keep only the SMILES token; anything after whitespace is dropped.
        smiles = smiles.split()[0]
        parsed = Chem.MolFromSmiles(smiles)
        desalted = SaltRemover().StripMol(parsed)
        return Chem.MolToSmiles(desalted)
    except Exception:
        # Deliberate fallback: hand back whatever we had rather than fail.
        return smiles
Exemple #6
0
def generateCompoundPropertiesTask(structure, debug=False):
    """Compute and store descriptor values for a structure's molecule.

    The molfile is parsed once. Hydrogen-bond donors/acceptors, rotatable
    bonds, logP, TPSA, molecular weight and molecular formula are computed
    on the molecule as parsed (salts included). A default ``SaltRemover``
    is applied only to obtain the free-base weight, which is stored when
    the stripped molecule still has atoms.

    Parameters
    ----------
    structure
        Object exposing ``molecule`` and ``molfile`` attributes.
    debug : bool, optional
        When true, attach to a pydevd server on localhost:6901 and print an
        ``IntegrityError`` raised by ``save()`` instead of propagating it.

    Raises
    ------
    IntegrityError
        If saving the properties fails and ``debug`` is false.
    """
    if debug:
        pydevd.settrace('localhost',
                        port=6901,
                        stdoutToServer=True,
                        stderrToServer=True)

    molecule = structure.molecule
    # Reuse the existing properties record when there is one.
    if not molecule.compoundProperty:
        prop = CompoundProperties(molecule=molecule)
    else:
        prop = molecule.compoundProperty

    saltRemover = SaltRemover()
    mol = Chem.MolFromMolBlock(str(structure.molfile))
    base = saltRemover.StripMol(mol)
    prop.hbd = Descriptors.CalcNumHBD(mol)
    prop.hba = Descriptors.CalcNumHBA(mol)
    prop.rtb = Descriptors.CalcNumRotatableBonds(mol)
    prop.alogp = Crippen.MolLogP(mol)
    prop.psa = Descriptors.CalcTPSA(mol)
    prop.full_mwt = NewDescriptors.MolWt(mol)
    # prop.exact_mass = Descriptors.CalcExactMolWt(mol)

    # Stripping can remove every fragment; skip the free-base weight then.
    if base.GetNumAtoms():
        prop.mw_freebase = NewDescriptors.MolWt(base)

    prop.full_molformula = Descriptors.CalcMolFormula(mol)

    try:
        prop.save()
    except IntegrityError as e:
        if not debug:
            # Bare ``raise`` re-raises with the original traceback intact.
            raise
        # Single-argument print() behaves the same on Python 2 and 3, and
        # printing the exception itself avoids the Python-2-only ``message``
        # attribute.
        print(e)
Exemple #7
0
def remove_water(m):
    """Return ``m`` after stripping with a ``SaltRemover`` defined by ``[O]``."""
    from rdkit.Chem.SaltRemover import SaltRemover

    water_stripper = SaltRemover(defnData="[O]")
    stripped = water_stripper.StripMol(m)
    return stripped
Exemple #8
0
# Load the table at file_path and index it by its 'CID' column.
df = pd.read_csv(file_path).set_index('CID')

# ## Make 3D optimized versions of the molecules

# +
# Make basic mol objects
# (one entry per CID, parsed from the 'IsomericSMILES' column)
mols = {cid: Chem.MolFromSmiles(smi) for cid, smi in df['IsomericSMILES'].items()}

# Then optimize them
s = SaltRemover()
p = ProgressBar(len(df))
# Only values of existing keys are overwritten inside the loop, so the dict
# keeps its size while it is being iterated.
for i, (cid, mol) in enumerate(mols.items()):
    p.animate(i, status=cid)
    try:
        # Name the molecule "<cid>: <smiles>".
        mol.SetProp("_Name","%d: %s" % (cid, df.loc[cid, 'IsomericSMILES']))
        # NOTE(review): dontRemoveEverything=True presumably keeps a fragment
        # when every fragment matches a salt definition -- confirm in RDKit docs.
        mol = s.StripMol(mol, dontRemoveEverything=True)
        mol = Chem.AddHs(mol)
        AllChem.Compute2DCoords(mol)
        AllChem.EmbedMolecule(mol)
        AllChem.UFFOptimizeMolecule(mol) # Is this deterministic?
    except Exception as e:
        # Any failure in the steps above marks this CID as unsuccessful.
        p.log('Exception for %d: %s' % (cid, e))
        mols[cid] = None
    else:
        mols[cid] = mol

# Remove CIDs without a successful optimization
mols = {cid: mol for cid, mol in mols.items() if mol}
# -

print("%d mol files successfully optimized from %d CIDs" % (len(mols), len(df)))
Exemple #9
0
else:
    print('\tCongratulations, your dataset has not incorrect smiles.')
    

##############################################################################
############################ STEP 2: salt elimination ########################
##############################################################################

print('[+] Eliminating salts ')

# The salt definition never changes between molecules, so the remover is
# built once instead of once per row.
remover = SaltRemover(defnData='[Na,Cl,K,O,OH,Fe,F,H,Al,Mg,Co,Ti,NH4,Mn,Si,Ca,Au,I,Hg,Mo,Zn,Br,Ag,Sr,Cu,Bi,S,Li,NH3,He,Y,Ar,Ba,La]')

withoutsalts = []

for smi in df_clean_by_sanit['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    mol = remover.StripMol(mol)
    smiles_new = Chem.MolToSmiles(mol)
    # Drop any '[H+]' fragment still present in the SMILES after stripping.
    smiles_new = smiles_new.replace('.[H+]', '').replace('[H+].', '')
    withoutsalts.append(smiles_new)

# Store the stripped SMILES as the third column of the frame.
df_clean_by_sanit.insert(2, 'W/O SALTS', withoutsalts)

# One "before --> after" line for every entry whose SMILES changed.
prompt = []

for smile_with, smile_without in zip(df_clean_by_sanit['SAN_SMILES'], df_clean_by_sanit['W/O SALTS']):
    if smile_with != smile_without:
        prompt.append('\t{} --> {}'.format(smile_with, smile_without))
Exemple #10
0
class MolReader(MolIO):
    """
    Read molecules from files and file-like objects. Supports SDF, SMILES,
    and RDKit binary format (via pickle).

    Parameters
    ----------
    f : file, optional
        File-like object.
    mol_format : str, optional
        Molecule file format. Currently supports 'sdf', 'smi', and 'pkl'.
    remove_hydrogens : bool, optional (default False)
        Remove hydrogens from molecules.
    remove_salts : bool, optional (default True)
        Remove salts from molecules. Note that this will remove any hydrogens
        present on the molecule.
    compute_2d_coords : bool, optional (default True)
        Compute 2D coordinates when reading SMILES. If molecules are written to
        SDF without 2D coordinates, stereochemistry information will be lost.
    """
    def __init__(self,
                 f=None,
                 mol_format=None,
                 remove_hydrogens=False,
                 remove_salts=True,
                 compute_2d_coords=True):
        # Salt removal strips hydrogens as a side effect (see clean_mol), so
        # warn callers who asked to keep them.
        if not remove_hydrogens and remove_salts:
            warnings.warn('Compounds with salts will have hydrogens removed')
        super(MolReader, self).__init__(f, mol_format)
        self.remove_hydrogens = remove_hydrogens
        self.remove_salts = remove_salts
        if remove_salts:
            self.salt_remover = SaltRemover()
        self.compute_2d_coords = compute_2d_coords

    def __iter__(self):
        """
        Iterate over molecules.
        """
        return self.get_mols()

    def get_mols(self):
        """
        Read molecules from a file-like object.

        Molecule conformers are grouped into a single molecule. Two
        molecules are considered conformers of the same molecule if they:
        * Are contiguous in the file
        * Have identical (canonical isomeric) SMILES strings
        * Have identical compound names (if set)

        Returns
        -------
        A generator yielding (possibly multi-conformer) RDKit Mol objects.
        """
        parent = None
        for mol in self._get_mols():
            if parent is None:
                parent = mol
                continue
            if self.are_same_molecule(parent, mol):
                if mol.GetNumConformers():
                    for conf in mol.GetConformers():
                        parent.AddConformer(conf, assignId=True)
                else:
                    continue  # skip duplicate molecules without conformers
            else:
                # A different molecule starts here: emit the finished parent.
                parent = self.clean_mol(parent)
                if parent is not None:
                    yield parent
                parent = mol
        # Emit the last accumulated molecule, if any.
        if parent is not None:
            parent = self.clean_mol(parent)
            if parent is not None:
                yield parent

    def _get_mols(self):
        """
        Read molecules from a file-like object.

        This method returns individual conformers from a file and does not
        attempt to combine them into multiconformer Mol objects.

        Returns
        -------
        A generator yielding RDKit Mol objects.

        Raises
        ------
        NotImplementedError
            If ``self.mol_format`` is not 'sdf', 'smi' or 'pkl'.
        """
        if self.mol_format == 'sdf':
            mols = self._get_mols_from_sdf()
        elif self.mol_format == 'smi':
            mols = self._get_mols_from_smiles()
        elif self.mol_format == 'pkl':
            mols = self._get_mols_from_pickle()
        else:
            raise NotImplementedError('Unrecognized molecule format ' +
                                      '"{}"'.format(self.mol_format))

        # skip read errors
        # NOTE: a generator that raised cannot be resumed, so after an error
        # escapes one of the readers the following next() call ends the loop.
        while True:
            try:
                # Builtin next() works on Python 2.6+ and Python 3, unlike
                # the Python-2-only ``.next()`` method.
                mol = next(mols)
            except StopIteration:
                break
            except Exception:
                warnings.warn('Skipping molecule.')
                continue
            else:
                if mol is not None:
                    yield mol

    def _get_mols_from_sdf(self):
        """
        Read SDF molecules from a file-like object.
        """
        supplier = Chem.ForwardSDMolSupplier(self.f,
                                             removeHs=self.remove_hydrogens)
        for mol in supplier:
            yield mol

    def _get_mols_from_smiles(self):
        """
        Read SMILES molecules from a file-like object.

        Each non-blank line holds a SMILES string, optionally followed by
        whitespace and a name. Everything after the first run of whitespace
        is taken as the name, so names may themselves contain spaces.
        """
        for line in self.f.readlines():
            line = line.strip()
            if not line:
                continue
            # Split once only: splitting on every space made the two-way
            # unpacking below raise for names containing whitespace, which
            # ended the whole read.
            split_line = line.split(None, 1)
            if len(split_line) > 1:
                smiles, name = split_line
            else:
                smiles, = split_line
                name = None

            # hydrogens are removed by default, which triggers sanitization
            try:
                if self.remove_hydrogens:
                    mol = Chem.MolFromSmiles(smiles)
                else:
                    mol = Chem.MolFromSmiles(smiles, sanitize=False)
                    Chem.SanitizeMol(mol)

                if self.compute_2d_coords:
                    AllChem.Compute2DCoords(mol)
            except Exception:
                warnings.warn('Skipping ' + line)
                continue
            else:
                if name is not None:
                    mol.SetProp('_Name', name)
                yield mol

    def _get_mols_from_pickle(self):
        """
        Read pickled molecules from a file-like object.

        Files that contain multiple pickles are supported by repeated calls
        to load.
        """
        # NOTE: unpickling executes arbitrary code; only read trusted files.
        while True:
            try:
                mols = cPickle.load(self.f)
                # A pickle may hold one molecule or a sequence of them.
                for mol in np.atleast_1d(mols):
                    yield mol
            except EOFError:
                break

    def are_same_molecule(self, a, b):
        """
        Test whether two molecules are conformers of the same molecule.

        Test for:
        * Identical (canonical isomeric) SMILES strings
        * Identical compound names (if set)

        Parameters
        ----------
        a, b : RDKit Mol
            Molecules to compare.
        """

        # get names, if available
        a_name = self._get_name(a)
        b_name = self._get_name(b)

        # get canonical isomeric SMILES
        a_smiles = self._get_isomeric_smiles(a)
        b_smiles = self._get_isomeric_smiles(b)
        assert a_smiles and b_smiles

        # test for same molecule
        return a_smiles == b_smiles and a_name == b_name

    def _get_name(self, mol):
        """
        Get molecule name, if available.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('_Name'):
            return mol.GetProp('_Name')
        else:
            return None

    def _get_isomeric_smiles(self, mol):
        """
        Get canonical isomeric SMILES for a molecule. Also sets the
        isomericSmiles property to avoid recomputing.

        Note that stereochemistry is not assigned from 3D coordinates; it
        must be explicitly present in the file or it will not show up in
        the SMILES conversion.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        if mol.HasProp('isomericSmiles'):
            return mol.GetProp('isomericSmiles')
        else:
            smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            mol.SetProp('isomericSmiles', smiles, computed=True)
            return smiles

    def clean_mol(self, mol):
        """
        Clean a molecule.

        When salt removal is enabled, hydrogens are removed and the salt
        remover is applied. The stripped molecule replaces the input only if
        it still has atoms and differs from the hydrogen-free input.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.

        Returns
        -------
        The cleaned molecule, the unchanged input, or None when hydrogen
        removal fails.
        """
        if self.remove_salts:
            # hydrogens must be removed for pattern matching to work properly
            try:
                mol_no_h = Chem.RemoveHs(mol)
            except ValueError:
                if mol.HasProp('_Name'):
                    name = mol.GetProp('_Name')
                else:
                    name = Chem.MolToSmiles(mol,
                                            isomericSmiles=True,
                                            canonical=True)
                warnings.warn('Skipping ' + name)
                return None
            new = self.salt_remover.StripMol(mol_no_h)
            # only keep if it is valid (# the molecule may _be_ a salt) and has
            # actually been changed
            if new.GetNumAtoms() and mol_no_h.ToBinary() != new.ToBinary():
                mol = new
        return mol
Exemple #11
0
    mol = input_cell
    if mol is None:
        stand_mol_list.append(
            ("Got empty molecule", index, mol, "No", None, None))
        continue
    try:
        mol = rdMolStandardize.MetalDisconnector().Disconnect(
            mol)  # Disconnect metals
    except ValueError as e:
        if len(Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)) > 1:
            mixture = "Yes"
        stand_mol_list.append(
            ("Failed at disconnect", index, None, mixture, None, str(e)))
        continue

    mol = r.StripMol(mol)

    # Check if we have multiple fragments present

    if len(Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)) > 1:
        mixture = "Yes"
    else:
        mixture = "No"

    # Standardize fragments separately

    for i, frag in enumerate(
            Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)):

        frag = r.StripMol(frag)
        if frag.GetNumAtoms() == 0:
Exemple #12
0
    #os.mkdir(current_dir)

    try:
        mol = MolFromSmiles(smi)
        #MolToImage(mol).save(os.path.join(current_dir,"smi.jpeg"))
    except Exception as e:
        print(i, e)

    try:
        stnd_mol = MolFromSmiles(stnd_smi)
        #MolToImage(stnd_mol).save(os.path.join(current_dir,"stnd_smi.jpeg"))
    except Exception as e:
        print(i, e)

    try:
        res = remover.StripMol(mol)
        #MolToImage(res).save(os.path.join(current_dir,"smi_res.jpeg"))
        res_smi = MolToSmiles(res)
        res_smi_list.append(res_smi)
    except Exception as e:
        res_smi = "-"
        res_smi_list.append(res_smi)
        print(i, e)

    try:
        stnd_res = remover.StripMol(stnd_mol)
        #MolToImage(stnd_res).save(os.path.join(current_dir,"stnd_smi_res.jpeg"))
        res_stnd_smi = MolToSmiles(stnd_res)
        res_stnd_smi_list.append(res_stnd_smi)
    except Exception as e:
        res_stnd_smi = "-"
def main(argv=sys.argv):
    """Command-line entry point: strip counterions from an SDF file.

    Reads molecules from the input SDF, optionally filters, strips
    salts/solvents, adds hydrogens and generates 3D coordinates, then writes
    the results to the output SDF.

    Parameters
    ----------
    argv : list of str
        Full argument vector including the program name. With fewer than
        three entries the usage text is printed and the process exits with
        status 1.
    """
    valid_elements = ['H', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'Br', 'I']
    valid_atomic_num = [1, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53]

    # All prints use the single-argument call form, which produces the same
    # output on Python 2 and Python 3.
    if len(argv) < 3:
        print("""
OBJ
  to strip self-defined counterions

Usage:
  %s [options] input output

[options]
  --strip            : if given, to run stripping salts/solvents
  --strip-sdf    file: specify the mols/fragments to be removed
  --strip-smarts file: one SMARTS string per line
  --filter-invalid   : if given, to remove molecules containing
                       R group or elements other than
                       %s
  --addh             : if given, to add hydrogens
  --make3d           : if given, 3D coordinates will be generated

Attention
  1. rdkit.Chem.SaltRemover.SaltRemover is called
  2. if neither `--strip-sdf` nor `--strip-smarts` is provided,
     stripping salts will be done according to default salts defined
     in `RDConfig.RDDataDir/Salts.txt`
  3. both `input` and `output` are .sdf
  4. whenever `--make3d` is given, please make sure that
     there is no complex or salts/solvents can be stripped.
     Otherwise, maybe there is something wrong with optimized structure.
""" % (argv[0], str(valid_elements)))
        sys.exit(1)

    options, args = getopt(argv[1:], '', [
        'strip-sdf=', 'strip-smarts=', 'strip', 'filter-invalid', 'addh',
        'make3d'
    ])
    filter_invalid = False
    strip = False
    addh = False
    make3d = False
    strip_sdf = None
    strip_smarts = None
    for opt, val in options:
        if opt == '--strip':
            strip = True
        elif opt == "--strip-sdf":
            strip_sdf = val
        elif opt == '--strip-smarts':
            strip_smarts = val
        elif opt == '--filter-invalid':
            filter_invalid = True
        elif opt == '--addh':
            addh = True
        elif opt == '--make3d':
            make3d = True
        else:
            print("Error: invalid option %s" % opt)
            sys.exit(1)
    assert len(args) == 2
    infile = args[0]
    outfile = args[1]

    # Collect user-supplied salt definitions, one SMARTS per line.
    smarts = ""
    if strip_sdf is not None:
        print("To load fragments from %s" % strip_sdf)
        count = 0
        for m in Chem.SDMolSupplier(strip_sdf):
            count += 1
            if m is None:
                print("Warning: failed to read %dth molecule in %s" % (
                    count, strip_sdf))
                continue
            smarts += (Chem.MolToSmarts(m) + "\n")
    if strip_smarts is not None:
        print("to load fragments from %s" % strip_smarts)
        # ``with`` closes the definitions file even if reading fails.
        with open(strip_smarts, 'r') as smarts_file:
            smarts += smarts_file.read()

    if strip:
        # Use the custom definitions when any were loaded, otherwise fall
        # back to the default salt list. The original test was inverted, so
        # --strip-sdf / --strip-smarts input was silently ignored.
        if smarts:
            remover = SaltRemover(defnData=smarts)
        else:
            remover = SaltRemover()
    else:
        remover = None

    inf = Chem.SDMolSupplier(infile)
    outf = Chem.SDWriter(outfile)
    try:
        count = 0
        for m in inf:
            count += 1
            if m is None:
                print("Warning: failed to load %dth molecule from %s" % (
                    count, infile))
                continue
            # Drop molecules containing any atom outside the allowed set.
            if filter_invalid and any(
                    a.GetAtomicNum() not in valid_atomic_num
                    for a in m.GetAtoms()):
                continue
            if strip:
                m = remover.StripMol(m)
                if num_components(m) > 1:
                    if m.HasProp("_Name"):
                        name = m.GetProp("_Name")
                    else:
                        name = "%s_%d" % (infile, count)
                    print("Warning: %s still has more than one components!"
                          % name)
            if addh:
                m = AllChem.AddHs(m)
            if make3d:
                AllChem.EmbedMolecule(m)
                AllChem.UFFOptimizeMolecule(m)
            outf.write(m)
    finally:
        # Close the writer even when processing a molecule raises.
        outf.close()
Exemple #14
0
 def test_github_4550(self):
     # Stripping must work on a molecule that was parsed without
     # sanitization, as long as StripMol is also told not to sanitize.
     unsanitized = Chem.MolFromSmiles('Cl.C[N]1=CC=CC=C1', sanitize=False)
     self.assertEqual(unsanitized.GetNumAtoms(), 8)

     stripped = SaltRemover().StripMol(unsanitized, sanitize=False)
     self.assertEqual(Chem.MolToSmiles(stripped), 'CN1=CC=CC=C1')
Exemple #15
0
def ARGE_function(root_filename_open):
    """Iteratively describe the molecules of an SDF file by core and substituents.

    Parameters
    ----------
    root_filename_open
        Path of the SDF file to read.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-iteration results, with a ``mol_mol``
        column built from ``mol_smiles`` and with missing values, ``0`` and
        ``"0"`` replaced by empty strings.
    """
    # STEP 1: from a dataset of molecules, generate all possible fragmentations depending brics bounds
    # STEP 2: determine best R0 depending on a score
    # STEP 3: when there is a match, characterize molecules from the dataset with best R0 and substituants associated
    # STEP 4: Iterate the process with only molecules undescribed with best R0

    # STEP 0: Dataset of molecules, sdf required. List of smiles created.
    suppl = Chem.SDMolSupplier(root_filename_open)

    list_smiles_n = []
    remover = SaltRemover()
    for mol in suppl:
        try:
            res = remover.StripMol(mol)
            list_smiles_n.append(Chem.MolToSmiles(res))
        except Exception:
            # ``mol`` is not a string here, so it must be converted before
            # concatenation; otherwise the handler itself raises TypeError.
            print("a line of the SDF has been ignored: " + str(mol))
    print(
        "STEP 0 succeed: file recognized as sdf, all rows recognized as molecules"
    )

    # STEP 1: df_brics_frag_gen(), return df_brics_frag
    # Generation of all fragments combinations of all molecules depending on brics bonds cuts
    # A number of brics bonds cuts > 1 is used in a combination
    df_brics_frag = df_brics_frag_gen(list_smiles_n)

    # Iterative process, results are compilated in df_final
    dict_0 = {}
    df_final = pd.DataFrame(dict_0)

    num_ite = 1
    # NOTE(review): the loop ends only when list_smiles_n is empty; confirm
    # that list_smiles_n_ite drains it once df_brics_frag is exhausted.
    while len(list_smiles_n) > 0:

        # STEP 2: R0 ranking depending on a score
        df_unique_frag = df_unique_frag_gen(df_brics_frag)

        # STEP 3: when it is possible, characterize molecules from the dataset with best R0 and substituants associated
        df_subs_r0 = df_subs_r0_gen(df_unique_frag, list_smiles_n)

        # STEP 3-bis: clean the results, attribute R1, R2, Rn labels to substituants
        df_subs_r0 = r0_clean(df_subs_r0, df_unique_frag, num_ite)

        # Concat results in df_final
        df_final = pd.concat([df_final, df_subs_r0], axis=0, sort=False)

        # STEP 4: Prepare df_brics_frag and list_smiles_n of the next iteration
        df_brics_frag = df_brics_frag_ite(df_subs_r0, df_brics_frag)
        list_smiles_n = list_smiles_n_ite(df_subs_r0, list_smiles_n)

        print("num ite: = " + str(num_ite))

        num_ite = num_ite + 1

        if len(df_brics_frag) == 0:
            # Handle molecules in list_smiles_n undescribed in df_brics_frag due to brics bonds number of 0 or 1
            df_final = pd.concat(
                [df_final, final_undescribed_mol(list_smiles_n)],
                axis=0,
                sort=False)

    print("Iterative process succeed and residual molecules added")

    df_final["mol_mol"] = df_final["mol_smiles"].apply(
        lambda x: Chem.MolFromSmiles(x))

    # Blank out missing values and both numeric and textual zeros.
    df_final = df_final.fillna(0)
    for x in df_final:
        df_final[x] = df_final[x].apply(lambda n: "" if n == 0 else n)
        df_final[x] = df_final[x].apply(lambda n: "" if n == "0" else n)

    return df_final