def get_atom_count(mol: rdkit.Chem.rdchem.Mol, radical_check: bool = False) -> collections.Counter:
    """Count the atoms of each element in a molecule.

    The counts are parsed from the molecular formula string produced by
    ``AllChem.CalcMolFormula``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Mol object to count atoms for.
    radical_check : bool, optional
        When True, add one ``"*"`` entry if any atom of ``mol`` reports
        radical electrons. Defaults to False.

    Returns
    -------
    atoms : collections.Counter
        Element symbol mapped to the number of atoms of that element.
    """
    formula = AllChem.CalcMolFormula(mol)
    atoms = collections.Counter()

    # Every match is a (symbol, digits) pair such as ("C", "6"). An empty
    # digit string means a single atom, because ones are implicit in
    # chemical formulas.
    for symbol, digits in re.findall(r"([A-Z][a-z]*)(\d*)", formula):
        atoms[symbol] += int(digits) if digits else 1

    if radical_check:
        has_radical = any(
            atom.GetNumRadicalElectrons() for atom in mol.GetAtoms())
        if has_radical:
            atoms["*"] += 1

    return atoms
def set_computable(self):
    """Fill in the attributes that are computed from the stored structure.

    ``self._mol`` is parsed with ``tool_chemical.read_string`` using the
    ``"mol"`` format, and the result is used to set ``_smiles``
    (non-isomeric), ``_inchi``, ``_inchikey``, ``_molecular_formula`` and
    ``_molecular_weight`` (from ``CalcExactMolWt``).
    """
    parsed = tool_chemical.read_string("mol", self._mol)

    # Structure identifiers
    self._smiles = Chem.MolToSmiles(parsed, isomericSmiles=False)
    self._inchi = inchi.MolToInchi(parsed)
    self._inchikey = inchi.MolToInchiKey(parsed)

    # Composition descriptors
    self._molecular_formula = Chem.CalcMolFormula(parsed)
    self._molecular_weight = Chem.CalcExactMolWt(parsed)
def insert_core_compound(self, compound_dict, requests=None):
    """Generate a mongo request that saves a compound into the core database.

    The fields needed by the API are calculated from the compound's SMILES.
    If a list of requests is given, the request is appended to it for later
    bulk writing; otherwise a single ``update_one`` is issued immediately.
    The write uses ``$setOnInsert`` with ``upsert=True``, so nothing is
    written when a compound with the same ``_id`` already exists.

    :param compound_dict: Compound dictionary; must contain ``_id`` and
        ``SMILES``. It is copied (shallowly) and not modified.
    :type compound_dict: dict
    :param requests: List of requests for bulk insert
    :type requests: list or None
    :raises KeyError: if ``compound_dict`` has no ``_id`` or ``SMILES``
    :return: None
    """
    core_dict = copy(compound_dict)
    cpd_id = core_dict['_id']
    mol_object = AllChem.MolFromSmiles(core_dict['SMILES'])

    # Expansion-specific bookkeeping is not stored in the core record
    for key in ('Generation', 'Expand', 'Type', 'Product_of',
                'Reactant_in'):
        core_dict.pop(key, None)

    # Store the other representations of the molecule (InChI, InChI key,
    # formula, ...) as well as its properties. SMILES is always present at
    # this point because it was read above.
    if 'Inchi' not in core_dict:
        core_dict['Inchi'] = AllChem.MolToInchi(mol_object)
    if 'Inchikey' not in core_dict:
        core_dict['Inchikey'] = AllChem.InchiToInchiKey(core_dict['Inchi'])

    core_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    core_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    core_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict['NP_likeness'] = nps.scoreMol(mol_object, self.nps_model)
    core_dict['Spectra'] = {}
    # Record which expansion it's coming from
    core_dict['MINES'] = []

    query = {'_id': cpd_id}
    update = {'$setOnInsert': core_dict}
    # "is not None" so that an empty request list is still appended to
    if requests is not None:
        requests.append(pymongo.UpdateOne(query, update, upsert=True))
    else:
        self.core_compounds.update_one(query, update, upsert=True)

    return None
def test_api_addMolecule(self):
    """POST a molfile to /api/addMolecule and check the stored record.

    The last ``Molecule`` row is compared against values computed directly
    from ``self.propane``.
    """
    payload = {"molfile": self.propane}
    response = self.client.post(path="/api/addMolecule", data=payload)
    self.assertEqual(response.status_code, 200)

    reference = AllChem.MolFromMolBlock(self.propane)
    stored = Molecule.objects.last()

    # The weight is compared after rounding to two decimals
    expected_mw = float("{0:.2f}".format(AllChem.CalcExactMolWt(reference)))
    self.assertEqual(expected_mw, stored.mw)

    self.assertEqual(AllChem.MolToSmiles(reference), stored.smiles)
    self.assertEqual(AllChem.CalcMolFormula(reference), stored.sum_formula)

    expected_inchi = AllChem.MolToInchi(reference)
    self.assertEqual(expected_inchi, stored.inchi)
    self.assertEqual(AllChem.InchiToInchiKey(expected_inchi),
                     stored.inchi_key)
def _make_compound_info(mol_object):
    """Build the compound-info dictionary for an RDKit molecule.

    The result holds the SMILES, InChI key, average and exact mass,
    formula, formal charge, the on-bits of the MACCS and RDKit
    fingerprints (each as a ``{str(bit): 1}`` mapping) and an empty
    ``dblinks`` dictionary.
    """
    smiles = AllChem.MolToSmiles(mol_object, True)
    inchikey = AllChem.InchiToInchiKey(AllChem.MolToInchi(mol_object))
    average_mass = Descriptors.MolWt(mol_object)
    exact_mass = AllChem.CalcExactMolWt(mol_object)
    formula = AllChem.CalcMolFormula(mol_object)
    charge = AllChem.GetFormalCharge(mol_object)

    maccs_bits = AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits()
    maccs = {str(bit): 1 for bit in maccs_bits}
    rdkit_bits = AllChem.RDKFingerprint(mol_object).GetOnBits()
    rdkit_fp = {str(bit): 1 for bit in rdkit_bits}

    return {
        'smiles': smiles,
        'inchikey': inchikey,
        'mass': average_mass,
        'exactmass': exact_mass,
        'formula': formula,
        'charge': charge,
        'fingerprints': {
            'maccs': maccs,
            'rdkit': rdkit_fp,
        },
        'dblinks': {},
    }
def testMolFormula(self):
    """Check ``CalcMolFormula`` against expected formulas for several SMILES.

    Each case is a ``(smiles, expected_formula)`` pair. The failure message
    names the offending SMILES, and a SMILES that cannot be parsed is
    reported explicitly instead of failing inside ``CalcMolFormula``.
    """
    cases = (
        ("[NH4+]", "H4N+"),
        ("c1ccccc1", "C6H6"),
        ("C1CCCCC1", "C6H12"),
        ("c1ccccc1O", "C6H6O"),
        ("C1CCCCC1O", "C6H12O"),
        ("C1CCCCC1=O", "C6H10O"),
        ("N[Na]", "H2NNa"),
        ("[C-][C-]", "C2-2"),
        ("[H]", "H"),
        ("[H-1]", "H-"),
        ("[CH2]", "CH2"),
        ("[He-2]", "He-2"),
        ("[U+3]", "U+3"),
    )
    for smiles, expected in cases:
        mol = Chem.MolFromSmiles(smiles)
        self.assertIsNotNone(mol, "could not parse SMILES %r" % smiles)
        actual = AllChem.CalcMolFormula(mol)
        self.assertEqual(actual, expected,
                         "unexpected formula for SMILES %r" % smiles)
def sdf_parser(soubor):
    """Import the molecules of an SDF file into the ``Molecule`` table.

    Each readable record is identified by its InChIKey. Records whose key
    is already stored are reported and skipped. A new record is saved with
    its SMILES, formula, exact weight, InChI and InChIKey, named after the
    first line of its ``PUBCHEM_SUBSTANCE_SYNONYM`` property.

    Records without that property are skipped: the name is resolved per
    molecule, so it can never be undefined or left over from the previous
    record.

    :param soubor: path of the SDF file, passed to ``Chem.SDMolSupplier``
    :return: number of readable molecules in the file (including ones that
        were skipped)
    """
    mol_counter = 0
    for mol in Chem.SDMolSupplier(soubor):
        # The supplier yields None for records it cannot parse
        if mol is None:
            continue
        print(mol.GetNumAtoms())
        mol_counter += 1

        new_inchi = Chem.MolToInchi(mol)
        new_inchikey = Chem.InchiToInchiKey(new_inchi)

        # Check whether the molecule is already in the database by its
        # InChIKey, which should be unique
        if Molecule.objects.filter(inchikey=new_inchikey).exists():
            print(mol, "already exists")
            continue

        if not mol.HasProp('PUBCHEM_SUBSTANCE_SYNONYM'):
            continue
        new_name = mol.GetProp('PUBCHEM_SUBSTANCE_SYNONYM').split("\n")[0]

        Molecule(name=new_name,
                 smiles=Chem.MolToSmiles(mol),
                 mol_weight=AllChem.CalcExactMolWt(mol),
                 inchi=new_inchi,
                 inchikey=new_inchikey,
                 summary_formula=AllChem.CalcMolFormula(mol)).save()

    return mol_counter
def convert(structure: str, fmt: Format, get3d: bool) -> str:
    """Convert a structure string into the requested output format.

    The input is turned into a molecule with ``smi_to_mol``, reordered by
    ``rdkit_atom_order`` and named after its molecular formula. For SDF
    output the molecule is optionally embedded first (``get3d``) and the
    original input is appended as a ``SMILES`` data field. Returns
    ``"Broken"`` when ``fmt`` matches none of the handled formats.
    """
    canonical = rdkit_atom_order(smi_to_mol(structure))
    formula = AllChem.CalcMolFormula(canonical)
    canonical.SetProp("_Name", formula)

    if fmt == Format.sdf:
        print("SDF")
        if get3d:
            # Fixed seed is passed to the embedding call
            AllChem.EmbedMolecule(canonical, randomSeed=0xF00D)
        sdf_text = mol_to_sdf(canonical)
        return sdf_text + f"\n> <SMILES>\n{structure}\n\n$$$$"
    elif fmt == Format.smiles:
        print("SMILES")
        return mol_to_smi(canonical)
    elif fmt == Format.inchi:
        print("InChI")
        return mol_to_inchi(canonical)
    elif fmt == Format.svg:
        print("SVG")
        return mol_to_svg(canonical)
    else:
        return "Broken"
def parse_comps(field):
    """Parse one side of a reaction into stoichiometry tuples and atom counts.

    ``field`` is split on ``' // '``; a compound that appears several
    times gets a stoichiometry equal to its number of occurrences. Each
    compound is looked up in ``metacyc2hash``, its formula is added to the
    atom tally (weighted by stoichiometry), and it is inserted into
    ``mine_db`` the first time it is seen (tracked in ``inserted``).
    ``metacyc2hash``, ``inserted`` and ``mine_db`` come from the
    surrounding scope.

    :param field: compound identifiers separated by ``' // '``
    :type field: str
    :return: ``(half_rxn, atoms)`` - a list of ``utils.stoich_tuple`` values
        and a ``collections.Counter`` of element counts
    :raises ValueError: if a compound is not present in ``metacyc2hash``
    """
    atoms = collections.Counter()
    half_rxn = []
    compounds = collections.Counter(field.split(' // '))

    for comp, stoich in compounds.items():
        if comp not in metacyc2hash:
            raise ValueError('Undefined Compound: %s' % comp)
        mol = metacyc2hash[comp]

        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        # An empty count means one atom of that element.
        for element, count in re.findall(r'([A-Z][a-z]*)(\d*)',
                                         AllChem.CalcMolFormula(mol)):
            atoms[element] += (int(count) if count else 1) * stoich

        if comp not in inserted:
            mine_db.insert_compound(mol, {'Generation': 0})
            inserted.add(comp)

        half_rxn.append(
            utils.stoich_tuple(stoich, utils.get_compound_hash(mol)))

    return half_rxn, atoms
def _add_compound(self, id, smi, mol=None, type='Predicted'):
    """Register a compound in the internal compound dictionary.

    The compound is keyed by the hash of its SMILES. An entry that is
    already present is left untouched. For a new entry, an SVG image is
    also written when both ``self.image_dir`` and ``self.mine`` are set.

    Returns the hashed ``_id`` of the compound.
    """
    _id = utils.compound_hash(smi, type == 'Coreactant')
    self._raw_compounds[smi] = _id

    # Never overwrite the same compound from a prior generation
    if _id in self.compounds:
        return _id

    if not mol:
        mol = AllChem.MolFromSmiles(smi)
    inchikey = AllChem.InchiToInchiKey(AllChem.MolToInchi(mol))

    record = {
        'ID': id,
        '_id': _id,
        "SMILES": smi,
        'Inchikey': inchikey,
        'Type': type,
        'Generation': self.generation,
        'Formula': AllChem.CalcMolFormula(mol),
        '_atom_count': self._get_atom_count(mol),
        'Charge': AllChem.GetFormalCharge(mol),
        'Reactant_in': [],
        'Product_of': [],
        "Sources": []
    }
    self.compounds[_id] = record

    # Sources are not tracked for coreactants (ids starting with 'X')
    if _id[0] == 'X':
        del record['Sources']

    # If we are building a mine and generating images, do so here
    if self.image_dir and self.mine:
        svg_path = os.path.join(self.image_dir, _id + '.svg')
        try:
            with open(svg_path, 'w') as outfile:
                drawable = rdMolDraw2D.PrepareMolForDrawing(mol)
                canvas = rdMolDraw2D.MolDraw2DSVG(1000, 1000)
                canvas.DrawMolecule(drawable)
                canvas.FinishDrawing()
                outfile.write(canvas.GetDrawingText())
        except OSError:
            print("Unable to generate image for %s" % smi)

    return _id
def _get_atom_count(self, mol):
    """Take a mol object and return a counter of its atoms per element.

    The counts are parsed from the molecular formula string. When
    ``self.radical_check`` is set and any atom reports radical electrons,
    one ``"*"`` entry is added as well.

    :param mol: molecule to count atoms for
    :type mol: RDKit Mol object
    :return: element symbol mapped to its atom count
    :rtype: collections.Counter
    """
    atoms = collections.Counter()
    # Find all strings of the form A# in the molecular formula, where A is
    # the element (e.g. C) and # is the number of atoms of that element.
    # The pattern is a raw string because "\d" is an invalid escape
    # sequence in a plain literal.
    for element, count in re.findall(r'([A-Z][a-z]*)(\d*)',
                                     AllChem.CalcMolFormula(mol)):
        # No number means one atom, as ones are implicit in formulas
        atoms[element] += int(count) if count else 1

    if self.radical_check:
        radical = any(
            atom.GetNumRadicalElectrons() for atom in mol.GetAtoms())
        if radical:
            atoms["*"] += 1

    return atoms
def _get_core_cpd_insert(cpd_dict: dict) -> pymongo.UpdateOne:
    """Generate the upsert request for a core compound.

    Only the core keys of ``cpd_dict`` that are not None are carried over.
    The remaining fields (mass, charge, formula, logP, RDKit fingerprint)
    are calculated from the SMILES. The request uses ``$setOnInsert`` with
    ``upsert=True``.

    The InChI key is stored under ``"Inchikey"``. A value supplied under
    the ``"InchiKey"`` spelling is accepted and stored under ``"Inchikey"``
    too, so the record never ends up with both spellings.

    :param cpd_dict: compound dictionary; must contain ``_id`` and
        ``SMILES``
    :raises KeyError: if ``_id`` or ``SMILES`` is missing or None
    :return: the request to pass to a bulk write
    """
    core_keys = ["_id", "SMILES", "Inchi", "Inchikey", "Mass", "Formula"]
    core_dict = {
        key: cpd_dict[key]
        for key in core_keys if cpd_dict.get(key) is not None
    }
    if "Inchikey" not in core_dict \
            and cpd_dict.get("InchiKey") is not None:
        core_dict["Inchikey"] = cpd_dict["InchiKey"]

    mol_object = AllChem.MolFromSmiles(core_dict["SMILES"])
    # Indices of the bits that are set in the 512-bit fingerprint
    rdk_fp = list(
        AllChem.RDKFingerprint(mol_object, fpSize=512).GetOnBits())

    # Store the other representations of the molecule (InChI, InChI key,
    # formula, ...) as well as its properties. SMILES is always present at
    # this point because it was read above.
    if "Inchi" not in core_dict:
        core_dict["Inchi"] = AllChem.MolToInchi(mol_object)
    if "Inchikey" not in core_dict:
        core_dict["Inchikey"] = AllChem.InchiToInchiKey(core_dict["Inchi"])

    core_dict["Mass"] = AllChem.CalcExactMolWt(mol_object)
    core_dict["Charge"] = AllChem.GetFormalCharge(mol_object)
    core_dict["Formula"] = AllChem.CalcMolFormula(mol_object)
    core_dict["logP"] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict["RDKit_fp"] = rdk_fp
    core_dict["len_RDKit_fp"] = len(rdk_fp)
    core_dict["Spectra"] = {}
    # Record which expansion it's coming from
    core_dict["MINES"] = []

    return pymongo.UpdateOne({"_id": core_dict["_id"]},
                             {"$setOnInsert": core_dict},
                             upsert=True)
def insert_compound(self, mol_object, compound_dict=None, bulk=None,
                    kegg_db="KEGG", pubchem_db='PubChem-8-28-2015',
                    modelseed_db='ModelSEED'):
    """Save an RDKit molecule as a compound entry in the MINE.

    Calculates the fields needed by the API and includes the additional
    information passed in ``compound_dict``. The calculated fields are
    written into the passed dictionary and overwrite values already there.
    Overwrites preexisting compounds in MINE on _id collision.

    :param mol_object: The compound to be stored
    :type mol_object: RDKit Mol object
    :param compound_dict: Additional information about the compound to be
        stored. Overwritten by calculated values.
    :type compound_dict: dict
    :param bulk: A pymongo bulk operation object. If None, the compound is
        immediately saved in the database
    :param kegg_db: The ID of the KEGG Mongo database
    :type kegg_db: str
    :param pubchem_db: The ID of the PubChem Mongo database
    :type pubchem_db: str
    :param modelseed_db: The ID of the ModelSEED Mongo database
    :type modelseed_db: str
    :return: The hashed _id of the compound
    :rtype: str
    """
    if compound_dict is None:
        compound_dict = {}

    # Store all different representations of the molecule (SMILES, Formula,
    # InChI key, etc.) as well as its properties in a dictionary
    compound_dict['SMILES'] = AllChem.MolToSmiles(mol_object, True)
    compound_dict['Inchi'] = AllChem.MolToInchi(mol_object)
    compound_dict['Inchikey'] = AllChem.InchiToInchiKey(
        compound_dict['Inchi'])
    compound_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)
    # Get indices where bits are 1
    compound_dict['MACCS'] = list(
        AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
    compound_dict['len_MACCS'] = len(compound_dict['MACCS'])
    # Get indices where bits are 1
    compound_dict['RDKit'] = list(
        AllChem.RDKFingerprint(mol_object).GetOnBits())
    compound_dict['len_RDKit'] = len(compound_dict['RDKit'])
    compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    # The second argument tells the hash function whether this compound
    # is a coreactant
    compound_dict['_id'] = utils.compound_hash(
        compound_dict['SMILES'],
        ('Type' in compound_dict and compound_dict['Type'] == 'Coreactant'))
    # The atom count is working data and is not stored in the database
    if '_atom_count' in compound_dict:
        del compound_dict['_atom_count']
    # Caching this for rapid reaction mass change calculation
    self._mass_cache[compound_dict['_id']] = compound_dict['Mass']

    # Reactant_in may arrive as the string form of a Python literal; a
    # non-empty string is parsed back into the object it describes.
    if "Reactant_in" in compound_dict and isinstance(
            compound_dict['Reactant_in'], str) \
            and compound_dict['Reactant_in']:
        compound_dict['Reactant_in'] = ast.literal_eval(
            compound_dict['Reactant_in'])

    # Same treatment for Product_of.
    if "Product_of" in compound_dict \
            and isinstance(compound_dict['Product_of'], str) \
            and compound_dict['Product_of']:
        compound_dict['Product_of'] = ast.literal_eval(
            compound_dict['Product_of'])

    # Store links to external databases where compound is present. This is
    # only attempted when an InChI key could be generated, and each lookup
    # is skipped when its database ID is falsy.
    if compound_dict['Inchikey']:
        if kegg_db:
            compound_dict = self.link_to_external_database(
                kegg_db, compound=compound_dict,
                fields_to_copy=[('Pathways', 'Pathways'),
                                ('Names', 'Names'),
                                ('DB_links', 'DB_links'),
                                ('Enzymes', 'Enzymes')])
        if pubchem_db:
            compound_dict = self.link_to_external_database(
                pubchem_db, compound=compound_dict,
                fields_to_copy=[('COMPOUND_CID', 'DB_links.PubChem')])
        if modelseed_db:
            compound_dict = self.link_to_external_database(
                modelseed_db, compound=compound_dict,
                fields_to_copy=[('DB_links', 'DB_links')])

    # Calculate natural product likeness score and store in dict; the
    # model is loaded on first use and kept on the instance.
    # NOTE(review): `np` here must be the NP-likeness scorer module, not
    # numpy - confirm against the file's imports.
    if not self.np_model:
        self.np_model = np.readNPModel()
    compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

    compound_dict = utils.convert_sets_to_lists(compound_dict)

    # Assign an id to the compound
    if self.id_db:
        mine_comp = self.id_db.compounds.find_one(
            {"Inchikey": compound_dict['Inchikey']}, {
                'MINE_id': 1,
                "Pos_CFM_spectra": 1,
                "Neg_CFM_spectra": 1
            })
        # If compound already exists in MINE, store its MINE id in the dict
        # together with any predicted spectra it already has
        if mine_comp:
            compound_dict['MINE_id'] = mine_comp['MINE_id']
            if 'Pos_CFM_spectra' in mine_comp:
                compound_dict['Pos_CFM_spectra'] = mine_comp[
                    'Pos_CFM_spectra']
            if 'Neg_CFM_spectra' in mine_comp:
                compound_dict['Neg_CFM_spectra'] = mine_comp[
                    'Neg_CFM_spectra']
        # If compound does not exist, create new id based on number of
        # current ids in the MINE
        # NOTE(review): if these are PyMongo collections, count() and
        # save() are gone in PyMongo 4 - confirm the pinned driver version.
        else:
            compound_dict['MINE_id'] = self.id_db.compounds.count()
            self.id_db.compounds.save(compound_dict)

    # If bulk insertion, upsert (insert and update) the database
    if bulk:
        bulk.find({'_id': compound_dict['_id']}).upsert().\
            replace_one(compound_dict)
    else:
        self.compounds.save(compound_dict)
    return compound_dict['_id']
def get_molecular_formula(mol):
    """Return the molecular formula of an rdkit.Mol as a string."""
    formula = AllChem.CalcMolFormula(mol)
    return formula
def save(self, smiles=None, molfile=None, rdmol=None, inchi=None, name=None, update=False, *args, **kwargs):
    """Create a new molecule record from a structure, or re-save this one.

    With ``update=True`` the record is saved as it is. Otherwise a molecule
    is built from the first truthy argument among ``molfile``, ``smiles``,
    ``rdmol`` and ``inchi``. All derived fields (InChI, weight, formula,
    fingerprint, InChI key, molfile, SMILES, SVG image, name, internal id)
    are then filled in before saving.

    :param smiles: SMILES string of the structure
    :param molfile: mol block of the structure
    :param rdmol: ready-made RDKit molecule
    :param inchi: InChI string of the structure
    :param name: display name; falls back to the molecule's ``LONGNAME``
        property, or None when that property is missing
    :param update: save without recomputing anything
    :raises MoleculeCreationError: if no structure was given or it could
        not be turned into a molecule
    :raises MoleculeExistsInDatabase: if the generated InChI is empty or
        already stored
    """
    if update:
        super(Molecule, self).save(*args, **kwargs)
        return

    # Start from None so that a call without any structure argument raises
    # MoleculeCreationError rather than UnboundLocalError.
    mol = None
    if molfile:
        mol = AllChem.MolFromMolBlock(molfile)
    elif smiles:
        mol = AllChem.MolFromSmiles(smiles)
    elif rdmol:
        mol = rdmol
    elif inchi:
        mol = AllChem.MolFromInchi(inchi)
    if not mol:
        raise self.MoleculeCreationError

    inchi = AllChem.MolToInchi(mol)
    smiles = AllChem.MolToSmiles(mol)
    is_new = (bool(inchi) and len(inchi) > 1
              and not Molecule.objects.filter(inchi=inchi).exists())
    if not is_new:
        raise self.MoleculeExistsInDatabase(smiles)

    self.inchi = inchi
    self.mw = float("{0:.2f}".format(AllChem.CalcExactMolWt(mol)))
    self.sum_formula = AllChem.CalcMolFormula(mol)
    self.fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=1024).ToBitString()
    self.inchi_key = AllChem.InchiToInchiKey(self.inchi)
    self.molfile = AllChem.MolToMolBlock(mol)
    self.smiles = smiles
    self.rdmol = mol

    # Generate the SVG image, except for explicitly excluded molecules
    if self.smiles not in self.EXCLUDED_MOLECULES:
        binMol = AllChem.Mol(self.rdmol.ToBinary())
        # 2D coordinates are only computed when the molecule has none
        if not binMol.GetNumConformers():
            rdDepictor.Compute2DCoords(self.rdmol)
        drawer = rdMolDraw2D.MolDraw2DSVG(100, 100)
        drawer.DrawMolecule(self.rdmol)
        drawer.FinishDrawing()
        svg = drawer.GetDrawingText().replace('svg:', '')
        # Remove the first line, which contains XML meta information
        self.image_svg = "\n".join(svg.split("\n")[1:]).strip()
    else:
        self.image_svg = None

    if name:
        self.name = name
    else:
        try:
            self.name = mol.GetProp("LONGNAME")
        except KeyError:
            self.name = None

    # NOTE(review): the internal id is derived from the latest row id, so
    # concurrent inserts could produce the same value - confirm this is
    # acceptable.
    if not Molecule.objects.exists():
        self.internal_id = "MI-J-1"
    else:
        self.internal_id = "MI-J-{}".format(
            Molecule.objects.latest("id").id + 1)

    super(Molecule, self).save(*args, **kwargs)
if (mol): mol_source = "OpenBabel" else: mol_source = "RDKit" except Exception as e: pass if (mol is None): unresolved_structures.write(external_id + "\t" + struct_stage + "\t" + structure + "\n") continue new_formula = "" if (mol_source == "RDKit"): new_formula = AllChem.CalcMolFormula(mol) elif (mol_source == "OpenBabel"): new_formula = mol.formula new_charge = 0 if (mol_source == "RDKit"): new_charge = AllChem.GetFormalCharge(mol) match = re.search('([-+]\d?)$', new_formula) if (match): new_formula = new_formula.replace(match.group(), '') elif (mol_source == "OpenBabel"): new_charge = mol.charge match = re.search('([-+]+)$', new_formula) if (match): new_formula = new_formula.replace(match.group(), '')
def pipe_calc_props(stream, props, force2d=False, summary=None, comp_id="pipe_calc_props"):
    """Calculate properties for the records of a stream that carry a mol.

    ``props`` is a single property name or a list of names; matching is
    case-insensitive. Handled names:

    - ``2d``: calls ``check_2d_coords(mol, force2d)``
    - ``date``, ``formula``, ``hba``, ``hbd``, ``logp``, ``mw``, ``sa``,
      ``tpsa``: stored on the record under ``Date``, ``Formula``, ``HBA``,
      ``HBD``, ``LogP``, ``MW``, ``SA`` and ``TPSA``
    - ``rotb``, ``smiles``: stored on the mol as the properties ``RotB``
      and ``Smiles``

    ``sa`` is only calculated when ``SASCORER`` is truthy.

    Synthetic Accessibility (normalized): 0: hard to synthesize; 1: easy
    access, as described in:

    | Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
    | *Peter Ertl and Ansgar Schuffenhauer*
    | Journal of Cheminformatics 1:8 (2009) (`link <http://www.jcheminf.com/content/1/1/8>`_)

    Only records with a ``"mol"`` key are processed and yielded. When
    ``summary`` is given, ``summary[comp_id]`` holds the running number of
    yielded records.
    """
    if not isinstance(props, list):
        props = [props]
    # Property names are compared in lower case
    wanted = [name.lower() for name in props]

    processed = 0
    for rec in stream:
        if "mol" not in rec:
            continue
        mol = rec["mol"]

        if "2d" in wanted:
            check_2d_coords(mol, force2d)
        if "date" in wanted:
            rec["Date"] = time.strftime("%Y%m%d")
        if "formula" in wanted:
            rec["Formula"] = Chem.CalcMolFormula(mol)
        if "hba" in wanted:
            rec["HBA"] = str(Desc.NOCount(mol))
        if "hbd" in wanted:
            rec["HBD"] = str(Desc.NHOHCount(mol))
        if "logp" in wanted:
            rec["LogP"] = "{:.2f}".format(Desc.MolLogP(mol))
        if "mw" in wanted:
            rec["MW"] = "{:.2f}".format(Desc.MolWt(mol))
        if "rotb" in wanted:
            mol.SetProp("RotB", str(Desc.NumRotatableBonds(mol)))
        if "smiles" in wanted:
            mol.SetProp("Smiles", Chem.MolToSmiles(mol))
        if SASCORER and "sa" in wanted:
            raw_score = sascorer.calculateScore(mol)
            # Normalise the score as 1 - score/10
            rec["SA"] = "{:.2f}".format(1 - (raw_score / 10))
        if "tpsa" in wanted:
            rec["TPSA"] = str(int(Desc.TPSA(mol)))

        processed += 1
        if summary is not None:
            summary[comp_id] = processed

        yield rec
# Check for missing or unknown formulas. if cpd['formula'] == '' or cpd['formula'] == 'noformula' or cpd[ 'formula'] == 'unknown': noFormula.append(index) # Check for duplicate and missing compound structures. mol = AllChem.MolFromInchi(cpd['smiles']) if mol: inchikey = AllChem.InchiToInchiKey(cpd['smiles']) if inchikey in structureDict: if inchikey not in duplicateStructure: duplicateStructure[inchikey] = [structureDict[inchikey]] duplicateStructure[inchikey].append(index) else: structureDict[inchikey] = index if cpd['formula'] != AllChem.CalcMolFormula(mol): inconsistentFormula[index] = (cpd['formula'], AllChem.CalcMolFormula(mol)) else: noStructure.append(index) # Check for charges that are too big. if 'charge' in cpd: if abs(cpd['charge']) > args.charge: largeCharge.append(index) else: noCharge.append(index) # Check for invalid is_core flags. if cpd['is_core'] != 0 and cpd['is_core'] != 1: badCore.append(index)