Example #1
0
    def clean_mol(self, mol: Mol) -> Mol:
        """Return a standardised version of the given molecule.

        Two transformations are applied in order:

        1. ``rdMolStandardize.ChargeParent``, which this class relies on for
           salt removal, structure normalisation and neutralising charges
           where possible.
        2. The custom tetrazole tautomer reaction held in
           ``self._tautomerTetrazole``, applied via ``self._apply_reaction``.

        Args:
            mol: The molecule to clean.

        Returns:
            The cleaned molecule.
        """
        # Parent (non-salt) fragment with charges cleaned up by RDKit.
        charge_parent = rdMolStandardize.ChargeParent(mol)

        # Project-specific tautomer handling on top of the RDKit result.
        return self._apply_reaction(charge_parent, self._tautomerTetrazole)
    def test3Parents(self):
        """Check every ``rdMolStandardize`` *Parent function on a sample SMILES."""
        # FragmentParent
        fragment_parent = rdMolStandardize.FragmentParent(
            Chem.MolFromSmiles("[Na]OC(=O)c1ccccc1"))
        self.assertEqual(Chem.MolToSmiles(fragment_parent),
                         "O=C([O-])c1ccccc1")

        # ChargeParent
        charge_parent = rdMolStandardize.ChargeParent(
            Chem.MolFromSmiles("C[NH+](C)(C).[Cl-]"))
        self.assertEqual(Chem.MolToSmiles(charge_parent), "CN(C)C")

        # TautomerParent, with and without skipStandardize
        tautomer_input = Chem.MolFromSmiles("[O-]CCCC=CO.[Na+]")
        tautomer_expected = "O=CCCCC[O-].[Na+]"
        tautomer_parent = rdMolStandardize.TautomerParent(tautomer_input)
        self.assertEqual(Chem.MolToSmiles(tautomer_parent), tautomer_expected)
        tautomer_parent = rdMolStandardize.TautomerParent(
            tautomer_input, skipStandardize=True)
        # same answer because of the standardization at the end
        self.assertEqual(Chem.MolToSmiles(tautomer_parent), tautomer_expected)

        # StereoParent
        stereo_parent = rdMolStandardize.StereoParent(
            Chem.MolFromSmiles("C[C@](F)(Cl)C/C=C/[C@H](F)Cl"))
        self.assertEqual(Chem.MolToSmiles(stereo_parent),
                         "CC(F)(Cl)CC=CC(F)Cl")

        # IsotopeParent
        isotope_parent = rdMolStandardize.IsotopeParent(
            Chem.MolFromSmiles("[12CH3][13CH3]"))
        self.assertEqual(Chem.MolToSmiles(isotope_parent), "CC")

        # SuperParent, with and without skipStandardize; the input is parsed
        # afresh for each call, as in the two separate cases it replaces.
        super_input = "[Na]Oc1c([12C@H](F)Cl)c(O[2H])c(C(=O)O)cc1CC=CO"
        super_parent = rdMolStandardize.SuperParent(
            Chem.MolFromSmiles(super_input))
        self.assertEqual(Chem.MolToSmiles(super_parent),
                         "O=CCCc1cc(C(=O)O)c(O)c(C(F)Cl)c1O")
        super_parent = rdMolStandardize.SuperParent(
            Chem.MolFromSmiles(super_input), skipStandardize=True)
        self.assertEqual(Chem.MolToSmiles(super_parent),
                         "O=CCCc1cc(C(=O)[O-])c(O)c(C(F)Cl)c1O.[Na+]")
 def test20NoneHandling(self):
     """Each listed ``rdMolStandardize`` function must raise ValueError on None."""
     checked_functions = (
         "ChargeParent",
         "Cleanup",
         "FragmentParent",
         "Normalize",
         "Reionize",
     )
     for func_name in checked_functions:
         with self.assertRaises(ValueError):
             getattr(rdMolStandardize, func_name)(None)
Example #4
0
def clean_mol(smile, is_deep=True):
    """Normalise a SMILES string and return its canonical form.

    Bracketed ``[O]``, ``[C]``, ``[N]`` and ``[B]`` atoms are rewritten as
    their plain symbols and ``[2H]``/``[3H]`` as ``[H]`` before parsing. The
    parsed molecule is optionally reduced with
    ``rdMolStandardize.ChargeParent`` and written back as canonical SMILES
    without isomeric information.

    Args:
        smile: Input SMILES string.
        is_deep: When true, also apply ``rdMolStandardize.ChargeParent``.

    Returns:
        The canonical SMILES string, or ``None`` if the input could not be
        parsed or standardised (a ``Parsing Error:`` line is printed).
    """
    substitutions = (
        ('[O]', 'O'), ('[C]', 'C'),
        ('[N]', 'N'), ('[B]', 'B'),
        ('[2H]', '[H]'), ('[3H]', '[H]'),
    )
    for old, new in substitutions:
        smile = smile.replace(old, new)

    canonical = None
    try:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles signals an unparsable string by returning None
        # rather than raising, so handle that case explicitly.
        if mol is not None:
            if is_deep:
                mol = rdMolStandardize.ChargeParent(mol)
            plain_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
            canonical = Chem.CanonSmiles(plain_smiles)
    except Exception:
        # Best-effort cleaning: any RDKit failure maps to None. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        canonical = None

    if canonical is None:
        print('Parsing Error:', smile)
    return canonical
Example #5
0
    def calculate_single(self, smiles) -> Tuple:
        """Standardize a single SMILES string and report the outcome.

        The SMILES is parsed, size-checked against ``self.max_num_atoms``,
        then passed through ``ChargeParent``, ``self.isotope_parent``,
        optional stereo removal, ``self.tautomerizer.Canonicalize`` and
        ``self.my_standardizer``. The result is round-tripped through SMILES
        once more to confirm it is still parsable.

        Args:
            smiles: Input SMILES string. Any float NaN is treated as a
                missing entry.

        Returns:
            A 3-tuple ``(standardized_smiles, success, error_message)``.
            On success: ``(smiles, True, None)``. On failure:
            ``(None, False, message)``.
        """
        # ``smiles is nan`` only matches the one imported NaN object. NaN
        # values coming from elsewhere (``float("nan")``, numpy float64
        # scalars) are different objects, so also test by value: NaN is the
        # only float that is not equal to itself.
        if smiles is nan or (isinstance(smiles, float) and smiles != smiles):
            return None, False, "No smiles entry."
        try:
            mol = MolFromSmiles(
                smiles)  # Read SMILES and convert it to RDKit mol object.
        except (TypeError, ValueError, AttributeError) as e:
            return None, False, str(e)
        # Check, if the input SMILES has been converted into a mol object.
        if mol is None:
            return None, False, "failed to parse smiles {}".format(smiles)
        # check size of the molecule based on the non-hydrogen atom count.
        # NOTE(review): ``>=`` also rejects molecules with exactly
        # max_num_atoms atoms although the message says "exceeds" - confirm
        # which is intended; behaviour is left unchanged here.
        num_atoms = mol.GetNumAtoms()
        if num_atoms >= self.max_num_atoms:
            return (
                None,
                False,
                "number of non-H atoms {0} exceeds limit of {1} for smiles {2}"
                .format(num_atoms, self.max_num_atoms, smiles),
            )
        try:
            mol = rdMolStandardize.ChargeParent(
                mol)  # standardize molecules using MolVS and RDKit
            mol = self.isotope_parent(mol)
            if self.include_stereoinfo is False:
                Chem.RemoveStereochemistry(mol)
            mol = self.tautomerizer.Canonicalize(mol)
            mol_clean_tmp = self.my_standardizer(mol)
            smi_clean_tmp = MolToSmiles(
                mol_clean_tmp)  # convert mol object back to SMILES
            # Double check if standardized SMILES is a valid mol object
            mol_clean = MolFromSmiles(smi_clean_tmp)
            smi_clean = MolToSmiles(mol_clean)
        except (TypeError, ValueError, AttributeError) as e:
            return None, False, str(e)
        return smi_clean, True, None
Example #6
0
 def test4ChargeParent(self):
     """ChargeParent of "C[NH+](C)(C).[Cl-]" must serialise as "CN(C)C"."""
     charge_parent = rdMolStandardize.ChargeParent(
         Chem.MolFromSmiles("C[NH+](C)(C).[Cl-]"))
     parent_smiles = Chem.MolToSmiles(charge_parent)
     self.assertEqual(parent_smiles, "CN(C)C")
Example #7
0
    def get_simplified_smiles_for_chemicals(self) -> pd.DataFrame:
        """Build a table of simplified SMILES for the chemical substances in the graph.

        Records are fetched with ``run_neo4j_query`` or, when both
        ``_do_KGX`` and ``_do_redis`` are 0 (test mode), loaded from
        ``./tests/datafile.json``. Each record's SMILES is parsed, reduced
        with ``rdMolStandardize.ChargeParent``, stripped of stereochemistry
        and written back out as SMILES. Records that cannot be processed are
        reported through ``print_debug_msg`` and skipped.

        Returns:
            A DataFrame with the columns ``chem_id``, ``original_SMILES``,
            ``simplified_SMILES`` and ``name``, holding one row per
            successfully processed record (empty if there were none).
        """
        columns = ['chem_id', 'original_SMILES', 'simplified_SMILES', 'name']

        # Create the query. This is of course robokop specific
        # Query modified to exclude all chemical substances that have wildcard definitions
        c_query: str = f'match (c:chemical_substance) where c.smiles is not NULL and c.smiles <> "" and NOT c.smiles CONTAINS "*" RETURN c.id, c.smiles, c.name order by c.smiles {self._debug_record_limit}'

        self.print_debug_msg(
            "Querying target database for chemical substances.", True)

        # check to see if we are in test mode
        if self._do_KGX != 0 or self._do_redis != 0:
            # execute the query
            records = self.run_neo4j_query(c_query)
        else:
            # open the test data file and use that instead of the database
            with open('./tests/datafile.json') as json_file:
                records = json.load(json_file)

        # nothing to do: hand back an empty frame with the expected columns
        if len(records) == 0:
            self.print_debug_msg("No records to process.", True)
            return pd.DataFrame(columns=columns)

        self.print_debug_msg(
            f"{len(records)} chemical substance records will be processed.",
            True)

        # Rows are collected in a plain list and turned into a DataFrame once
        # at the end. Appending to the frame row by row copies it every time
        # (quadratic), and DataFrame.append no longer exists in pandas 2.x.
        rows: list = []

        for count, r in enumerate(records, start=1):
            # inform user of progress
            if count % 25000 == 0:
                self.print_debug_msg(
                    f'get_simplified_smiles_for_chemicals() - At data record index {count}.',
                    True)

            try:
                # Construct a molecule from a SMILES string
                molecule = Chem.MolFromSmiles(r['c.smiles'])
            except Exception as e:
                # alert the user there was an issue and continue
                self.print_debug_msg(
                    f"Error - Exception trying to get a molecule for record {count}, chem id: {r['c.id']} with original SMILES: {r['c.smiles']}, Exception {e}. Proceeding.",
                    True)
                continue

            # did we get the molecule
            if molecule is None:
                # Couldn't parse the molecule
                self.print_debug_msg(
                    f"Error - Got an empty molecule for record {count}, chem id: {r['c.id']} with smiles: {r['c.smiles']}. Proceeding.",
                    True)
                continue

            try:
                # get the uncharged version of the largest fragment
                molecule_uncharged = rdMolStandardize.ChargeParent(molecule)

                # Remove all stereo-chemistry info from the molecule
                RemoveStereochemistry(molecule_uncharged)

                # get the simplified SMILES value
                simplified_smiles: str = Chem.MolToSmiles(molecule_uncharged)

                # convert the curie prefix to the new standard
                if self._do_curie_update == 1:
                    chem_id = r['c.id'].replace(
                        "KEGG:", "KEGG.COMPOUND:").replace(
                            "CHEMBL:", "CHEMBL.COMPOUND:")
                else:
                    chem_id = r['c.id']

                # fall back to the id when there is no usable name
                if r['c.name'] is None or r['c.name'] in ('', 'NULL'):
                    name_fixed = chem_id
                else:
                    # insure there are no dbl quotes in the name, it throws off the CSV file
                    name_fixed = r['c.name'].replace('"', "'")

                # save the new record
                rows.append({
                    'chem_id': chem_id,
                    'original_SMILES': r['c.smiles'],
                    'simplified_SMILES': simplified_smiles,
                    'name': name_fixed,
                })
            except Exception as e:
                # alert the user that something was discovered in the original graph record
                self.print_debug_msg(
                    f"Error - Could not get a simplified SMILES for record {count}, chem id: {r['c.id']}, Original SMILES: {r['c.smiles']}, Exception: {e}"
                )

        # return to the caller
        return pd.DataFrame(rows, columns=columns)
Example #8
0
        ##           'CHEBI:30470','CHEBI:36301','CHEBI:38284','CHEBI:48998','CHEBI:37189',
        #           'CHEBI:60532','CHEBI:51221','CHEBI:29416', 'CHEBI:36163','CHEBI:29296',
        #           'CHEBI:51508','CHEBI:30665','CHEBI:29886','CHEBI:85715','CHEBI:49851',
        #           'CHEBI:30197','CHEBI:30125','CHEBI:37856','CHEBI:38283','CHEBI:10098',
        #           'CHEBI:132769','CHEBI:133489','CHEBI:134067','CHEBI:141330','CHEBI:15432',
        #           'CHEBI:26355','CHEBI:28163','CHEBI:29295','CHEBI:29417','CHEBI:29418',
        #           'CHEBI:29422','CHEBI:29440','CHEBI:29796','CHEBI:29880','CHEBI:30126',
        #           'CHEBI:30238']:
        #    continue
        smiles = x[2]
        if smiles == '[empty]':
            continue
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            print(f"error with {smiles}. Proceeding")
            continue
        if mol is None:
            #Couldn't parse
            continue
        try:
            print(f'{cid}\t{smiles}')
            molp = rdMolStandardize.ChargeParent(mol)
            RemoveStereochemistry(molp)
            newsmi = Chem.MolToSmiles(molp)
            #chems.append(chem)
            outf.write(f"{cid}\t{smiles}\t{newsmi}\n")
        except Exception as e:
            print(f"error with {x}")
            #exit()
Example #9
0
def structure_standardization(smi: str) -> str:
    """
    Clean up a SMILES string with RDKit and return its standardized form.

    The input is parsed into a mol object; unreadable SMILES are logged as an
    error. Molecules whose atom count (``GetNumAtoms``) exceeds the configured
    ``max_num_atoms`` are logged and discarded. Everything else is run through
    ``ChargeParent``, isotope removal, optional stereo removal, tautomer
    canonicalization and a MolVS-style standardization
    (https://molvs.readthedocs.io/en/latest/index.html), then written back as
    SMILES. Failures during that pipeline are logged as well.

    The settings ``max_num_atoms``, ``max_num_tautomers`` and
    ``include_stereoinfo`` are read from the ``"standardization"`` section of
    ``ConfigDict.get_parameters()``.

    Args:
        smi (str): Non-standardized smiles string

    Returns:
        str: standardized smiles string, or ``np.nan`` when the input could
        not be read, was too large, or failed standardization
    """
    settings = ConfigDict.get_parameters()["standardization"]
    max_num_atoms = settings["max_num_atoms"]
    max_num_tautomers = settings["max_num_tautomers"]
    include_stereoinfo = settings["include_stereoinfo"]

    # Tautomer enumerator/canonicalizer, configured from the settings.
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(max_num_tautomers)
    # Keep stereo information of keto/enol tautomerization
    tautomerizer.SetRemoveSp3Stereo(False)

    def _strip_isotopes(molecule: Chem.Mol) -> Chem.Mol:
        """
        Return a copy of the molecule with every atom's isotope reset to 0.

        Args:
            molecule (Chem.Mol): input rdkit mol object

        Returns:
            Chem.Mol: isotope parent rdkit mol object
        """
        stripped = copy.deepcopy(molecule)
        for atom in stripped.GetAtoms():
            atom.SetIsotope(0)
        return stripped

    def _molvs_standardize(molecule: Chem.Mol) -> Chem.Mol:
        """
        Sanitize, remove Hs, disconnect metals, normalize and reionize a copy.

        Args:
            molecule (Chem.Mol): non-standardized rdkit mol object

        Returns:
            Chem.Mol: standardized rdkit mol object
        """
        work = copy.deepcopy(molecule)
        Chem.SanitizeMol(work)
        work = Chem.RemoveHs(work)
        work = rdMolStandardize.MetalDisconnector().Disconnect(work)
        work = rdMolStandardize.Normalizer().normalize(work)
        work = rdMolStandardize.Reionizer().reionize(work)
        Chem.AssignStereochemistry(work, force=True, cleanIt=True)
        # TODO: Check this removes symmetric stereocenters
        return work

    # Read SMILES and convert it to RDKit mol object.
    mol = MolFromSmiles(smi)
    if mol is None:
        logging.error("Reading Error, " + smi)
        return np.nan

    # Size check based on the atom count of the parsed molecule.
    if mol.GetNumAtoms() > max_num_atoms:
        logging.error("Molecule too large, " + smi)
        return np.nan

    try:
        # standardize molecules using MolVS and RDKit
        mol = rdMolStandardize.ChargeParent(mol)
        mol = _strip_isotopes(mol)
        if include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        # Both stereo settings share the same remaining steps.
        mol = tautomerizer.Canonicalize(mol)
        # convert mol object back to SMILES
        return MolToSmiles(_molvs_standardize(mol))
    except (ValueError, AttributeError) as e:
        # write failed molecules during standardization to log file
        logging.error(
            "Standardization error, " + smi + ", Error Type: " + str(e))
        return np.nan
Example #10
0
if args.gpu:
    model = model.cuda()

# Map smiles to embedding
embeddings = {}

# Read the SMILES list. The first line is a title row and is dropped; the
# remaining lines are stripped of surrounding whitespace/newlines. The file is
# closed as soon as it has been read.
with open(args.smiles_list, "r") as smiles_file:
    input_smiles = [line.strip() for line in smiles_file.readlines()[1:]]

# Create charge parents
input_mols = [Chem.MolFromSmiles(i) for i in input_smiles]
standardized_mols = [rdMolStandardize.ChargeParent(i) for i in input_mols]
input_smiles = [Chem.MolToSmiles(i) for i in standardized_mols]

out_tensors = model.encode_from_smiles(input_smiles)
output = out_tensors.cpu().detach().numpy()
feature_mapping = {}

# Store the mapping standardized SMILES -> encoded row. The write handle is
# closed (and therefore flushed) before the file is read back.
pickle_obj = dict(zip(input_smiles, output))
with open(f"{args.out}.p", "wb") as pickle_file:
    pickle.dump(pickle_obj, pickle_file)

# Read the file back; it was produced by this script just above.
with open(f"{args.out}.p", "rb") as pickle_file:
    loaded_pickle = pickle.load(pickle_file)

# NOTE(review): leftover interactive breakpoint - this blocks any
# non-interactive run of the script; confirm whether it is still wanted.
import pdb
pdb.set_trace()

print(loaded_pickle)