def compute_one_inchi(mol):
    """Compute the InChI-derived identifiers for one SMILES string.

    :param mol: SMILES string (it is handed to ``Chem.MolFromSmiles``).
    :return: ``(md5_hex, inchikey, inchi)`` where ``md5_hex`` is the MD5 hex
        digest of the UTF-8 encoded InChI, or ``(None, None, None)`` when the
        SMILES could not be turned into a molecule.
    """
    parsed = Chem.MolFromSmiles(mol)
    # Identity test instead of ``== None``: it cannot be affected by an
    # overloaded ``__eq__`` on the molecule object.
    if parsed is None:
        return None, None, None

    ini = inchi.MolToInchi(parsed)
    inikey = inchi.InchiToInchiKey(ini)
    # NOTE(review): MD5 looks like it is used as a fixed-length lookup
    # fingerprint of the InChI, not for anything security related - confirm.
    hsh = hashlib.md5(ini.encode('utf-8')).hexdigest()
    return hsh, inikey, ini
def set_computable(self):
    """Recompute the derived identifiers from the stored mol representation.

    Parses ``self._mol`` with ``tool_chemical.read_string("mol", ...)`` and
    stores the non-isomeric SMILES, the InChI, the InChIKey, the molecular
    formula and the exact molecular weight on the instance.
    """
    parsed = tool_chemical.read_string("mol", self._mol)

    # NOTE(review): CalcMolFormula / CalcExactMolWt are looked up on ``Chem``;
    # this presumes ``Chem`` is bound to a module that re-exports the
    # rdMolDescriptors functions - confirm against the file's imports.
    self._smiles = Chem.MolToSmiles(parsed, isomericSmiles=False)
    self._inchi = inchi.MolToInchi(parsed)
    self._inchikey = inchi.MolToInchiKey(parsed)
    self._molecular_formula = Chem.CalcMolFormula(parsed)
    self._molecular_weight = Chem.CalcExactMolWt(parsed)
def convert(input, input_mod='smi'):
    """
    Derive the molecular formula, InChI and InChIKey of a molecule string.

    :param input: molecule string; a SMILES when ``input_mod`` is 'smi'
    :param input_mod: format tag forwarded to ``read_string`` (default 'smi')
    :return: tuple of (molecular formula, inchi, inchikey)
    """
    parsed = read_string(input_mod, input)
    # The tuple elements are evaluated left to right: formula, InChI, InChIKey.
    return (
        Descriptors.rdMolDescriptors.CalcMolFormula(parsed),
        inchi.MolToInchi(parsed),
        inchi.MolToInchiKey(parsed),
    )
def set_computables_from_mol(self, mol):
    """Compute and store the properties derived from ``mol`` on this object.

    Sets ``molecular_formula``, ``molecular_weight`` (exact weight), ``inchi``,
    ``inchikey`` and ``smiles`` (non-isomeric).

    :param mol: molecule object accepted by the RDKit descriptor functions.
    :raises SpectrumError: if any of the property computations raises; the
        original exception is chained as ``__cause__``.
    :raises AssertionError: if a computed property is ``None``.
    """
    try:
        # warning comes up in pycharm (bug of pycharm)
        self.molecular_formula = Descriptors.rdMolDescriptors.CalcMolFormula(
            mol)
        self.molecular_weight = Descriptors.ExactMolWt(mol)
        self.inchi = inchi.MolToInchi(mol)
        self.inchikey = inchi.MolToInchiKey(mol)
        self.smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
    except Exception as e:
        # ``e.args`` is a tuple, so concatenating it to a str would raise a
        # TypeError here and hide the real failure; format the exception
        # into the message instead.
        raise SpectrumError(
            "Error occurred while computing properties: {}".format(e)) from e

    # NOTE(review): these checks disappear under ``python -O``; they are kept
    # as asserts so the exception type seen by callers does not change.
    assert self.molecular_formula is not None, "molecular-formula can't be None"
    assert self.molecular_weight is not None, "molecular-weight can't be None"
    assert self.inchi is not None, "inchi can't be None"
    assert self.inchikey is not None, "inchikey can't be None"
    assert self.smiles is not None, "smiles can't be None"
def fill_base_test(cursor):
    """Fill the test database with molecules from the CSV plus random filler.

    Reads ``toxicity_85832.csv`` from the working directory, inserts one
    ``molecules`` row per SMILES, then inserts randomly generated rows into
    ``tasks``, ``tasks_running`` and ``descriptors``, one ``descriptors_values``
    row per molecule, one ``endpoints`` row per remaining CSV column, and one
    ``experimental_data`` row per non-NaN cell.

    :param cursor: DB-API cursor using ``?`` placeholders; all inserts go
        through ``cursor.executemany`` / ``cursor.execute``.
    :return: the same ``cursor``.
    """
    df = pd.read_csv("toxicity_85832.csv")
    names_of_columns = list(df.columns)
    smiles = list(df["SMILES"])
    df = df.drop("SMILES", axis=1)
    toxic_vals = np.array(df.values)
    n_molecules = len(smiles)

    # molecules
    canonize_smiles = [_canonize_mixture(smile) for smile in smiles]
    # Parse every canonical SMILES once and derive both identifiers from the
    # same molecule instead of parsing the string twice.
    inchi_smiles = []
    inchikey = []
    for smile in canonize_smiles:
        parsed = Chem.MolFromSmiles(smile)
        inchi_smiles.append(inchi.MolToInchi(parsed))
        inchikey.append(inchi.MolToInchiKey(parsed))
    cursor.executemany(
        """insert into 'molecules' (inchi_key,inchi,canonical_smiles) values (?,?,?)""",
        zip(inchikey, inchi_smiles, canonize_smiles))

    # tasks
    descr_tasks = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for _ in range(20)
    ]
    # zip() over a single list yields the 1-tuples executemany expects.
    cursor.executemany("""insert into 'tasks' (descr) values (?)""",
                       zip(descr_tasks))

    # tasks_running
    completed = [random.randint(0, 1) for _ in range(1000)]
    id_tasks = [random.randint(1, len(descr_tasks)) for _ in range(1000)]
    id_molecules = [random.randint(1, n_molecules) for _ in range(1000)]
    cursor.executemany(
        """insert into 'tasks_running' (id_task, id_molecule, completed) values (?,?,?)""",
        zip(id_tasks, id_molecules, completed))

    # descriptors
    name_of_descr = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for _ in range(10)
    ]
    name_of_version = [
        randomStringwithDigitsAndSymbols(random.randint(1, 30))
        for _ in range(10)
    ]
    cursor.executemany(
        """insert into 'descriptors' (descriptor, version) values (?,?)""",
        zip(name_of_descr, name_of_version))
    cursor.execute(
        """insert into 'descriptors' (descriptor, version) values (?,?)""",
        ("mordred", "0.315"))

    # descriptor_values
    # The "mordred" row is inserted right after the random descriptors.
    # NOTE(review): using its position (11) as its id presumes the table was
    # empty and ids are assigned sequentially from 1 - confirm.
    mordred_id = len(name_of_descr) + 1
    id_descriptor = [mordred_id] * n_molecules
    id_molecule = list(range(1, n_molecules + 1))
    id_tasks = [
        random.randint(1, len(descr_tasks)) for _ in range(n_molecules)
    ]
    valid = [random.randint(0, 1) for _ in range(n_molecules)]
    value = func(canonize_smiles)
    cursor.executemany(
        """insert into 'descriptors_values' (id_molecule, id_descriptor, id_task, valid, value) values (?,?,?,?,?)""",
        zip(id_molecule, id_descriptor, id_tasks, valid, value))

    # endpoints
    # Column names are split on '_': part 1 becomes the description, parts 2+
    # the type. The first CSV column is skipped.
    features = names_of_columns[1:]
    descriptions = [feature.split('_')[1] for feature in features]
    types = ['_'.join(feature.split('_')[2:]) for feature in features]
    cursor.executemany("""insert into 'endpoints' (desc, type) values (?,?)""",
                       zip(descriptions, types))

    # experimental data: one row per non-NaN cell, with 1-based ids
    ids_molecules = []
    ids_endpoints = []
    values_endpoints = []
    n_rows, n_cols = toxic_vals.shape
    for i in range(n_rows):
        for j in range(n_cols):
            cell = toxic_vals[i, j]
            if not np.isnan(cell):
                ids_molecules.append(i + 1)
                ids_endpoints.append(j + 1)
                values_endpoints.append(cell)
    cursor.executemany(
        """insert into 'experimental_data' (id_molecule, id_endpoint, value) values (?,?,?)""",
        zip(ids_molecules, ids_endpoints, values_endpoints))
    return cursor
def calculate_inchi(smile):
    """Return the InChI string for the molecule described by SMILES ``smile``."""
    parsed = Chem.MolFromSmiles(smile)
    return inchi.MolToInchi(parsed)
def _sdf_prop_or(mol, prop_name, fallback):
    """Return SD property ``prop_name`` of ``mol``, or ``fallback(mol)`` if unset."""
    try:
        return mol.GetProp(prop_name)
    except KeyError:
        # GetProp raises KeyError for a property the record does not carry.
        return fallback(mol)


def loadSDF(sdfPath):
    """Load every molecule of an SD file, render its images and store it.

    For each readable molecule a ``data`` dict is built from the SD-file
    properties, falling back to a computed value or placeholder when a
    property is absent. Records without a ``DATABASE_NAME`` property are
    skipped. A 300x300 image and a 150x150 thumbnail named after the InChIKey
    are written below ``settings.FILES_DIR`` and the record is passed to
    ``feedDatabase``.

    :param sdfPath: path of the SD file handed to ``Chem.SDMolSupplier``.
    """
    # (key in ``data``, SD property name, fallback when the property is absent)
    fields = (
        ('fCharge', 'Charge', Chem.GetFormalCharge),
        ('name', 'DATABASE_ID', lambda _m: 'unkown'),
        ('molMass', 'Total Molweight', Descriptors.ExactMolWt),
        # original author's note: not sure this fallback is right
        ('cLogP', 'cLogP', Crippen.MolLogP),
        ('cLogS', 'cLogS', lambda _m: 0.0),
        ('tpsa', 'Polar Surface Area', rdMolDescriptors.CalcTPSA),
        # NOTE(review): falls back to the *polar* surface area - confirm.
        ('totalSurfaceArea', 'Total Surface Area', rdMolDescriptors.CalcTPSA),
        ('hbondAcceptors', 'H-Acceptors', rdMolDescriptors.CalcNumHBA),
        ('hbondDonnors', 'H-Donors', rdMolDescriptors.CalcNumHBD),
        ('rotable', 'Rotatable Bonds', rdMolDescriptors.CalcNumRotatableBonds),
        ('mutagenic', 'Mutagenic', lambda _m: 'Unknown'),
        ('tumorigenic', 'Tumorigenic', lambda _m: 'Unknown'),
        ('irritant', 'Irritant', lambda _m: 'Unkown'),
        ('smiles', 'SMILES', Chem.MolToSmiles),
        ('InChI', 'INCHI_IDENTIFIER', inchi.MolToInchi),
        ('inchiKey', 'INCHI_KEY', inchi.MolToInchiKey),
        # original author's note: don't know how to compute this one
        ('nonHAtoms', 'Non-H Atoms', lambda _m: -1),
        ('numAtoms', 'numAtoms', lambda m: m.GetNumAtoms()),
        # NOTE(review): the fallback is the atom count, not a stereo-centre
        # count - looks like a copy of the line above; confirm.
        ('stereoCenters', 'Stereo Centers', lambda m: m.GetNumAtoms()),
    )

    # Create a molecule supplier and drop the entries it could not read.
    suppl = Chem.SDMolSupplier(sdfPath)
    sdf = [x for x in suppl if x is not None]

    for mol in sdf:
        data = {}
        for key, prop_name, fallback in fields:
            data[key] = _sdf_prop_or(mol, prop_name, fallback)

        # The provider is mandatory: skip the record when it is missing.
        try:
            data['provider'] = mol.GetProp('DATABASE_NAME')
        except KeyError:
            print("Nenhum fornecedor encontrado, o campo é obrigatório!")
            continue

        # Compute 2D coordinates, then save the full-size image and thumbnail.
        AllChem.Compute2DCoords(mol)
        Draw.MolToFile(mol,
                       os.path.join(settings.FILES_DIR,
                                    'molImages/' + data["inchiKey"] + '.png'),
                       size=(300, 300), kekulize=True, wedgeBonds=True,
                       fitImage=True)
        Draw.MolToFile(mol,
                       os.path.join(settings.FILES_DIR,
                                    'molThumbs/' + data["inchiKey"] + '.png'),
                       size=(150, 150), kekulize=True, wedgeBonds=True,
                       fitImage=True)

        # NOTE(review): this unconditional call runs before the existence
        # checks below, so a record may be fed twice - confirm whether it is
        # a debugging leftover.
        feedDatabase(data)
        if Compounds.objects.filter(inChIKey=data['inchiKey']).exists():
            # Compare against this record's provider; the literal list
            # ['provider'] could never identify a stored provider.
            if not Compounds.objects.filter(provider=data['provider']).exists():
                feedDatabase(data)
                print("feed1")
            else:
                print("continue123")
        else:
            feedDatabase(data)
            print("feed2")
        # NOTE(review): the source continued here with the opening of a
        # disabled (triple-quoted) ``except:`` section that is cut off in the
        # reviewed text; it is not reproduced.
def parse_f(f):
    """Build and print one tab-separated identifier record for a mol file.

    The CAS number is taken from the file name (``dir/CAS.ext`` or
    ``CAS.ext``). Structure data comes from the mol file when it can be read
    and round-tripped through InChI; otherwise from the user-supplied
    ``syn_data`` entry (optionally resolved through PubChem). Names are merged
    from PubChem, ``pdf_data`` and ``syn_data``.

    Reads the module-level ``ignored_CASs``, ``syn_data``, ``pdf_data``,
    ``dup_names``, ``all_names`` and ``all_user_names``, and reads and
    updates ``mycache``.

    :param f: path of the mol file.
    :return: always ``None``; the record is written to stdout. Nothing is
        printed for ignored CAS numbers, and only a failure message when
        neither the mol file nor custom data is usable.
    """
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None

    failed_mol = False
    try:
        if CAS in syn_data:
            d = syn_data[CAS]
            # User data overrides the mol file: jump to the fallback path.
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
            # Explicit check (not an assert) so it survives ``python -O``.
            if mol is None:
                raise ValueError('no molecule could be read from %s' % f)
        except Exception:
            print('Cannot read %s' % f)
            raise
        try:
            inchi_val = inchi.MolToInchi(mol)
        except Exception:
            print('BAILING ON %s' % f)
            raise
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            raise ValueError('InChI of %s could not be converted back' % f)
    except Exception:
        # The mol file is unusable (or deliberately bypassed): fall back to
        # the user-supplied data for this CAS number.
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                pubchem_key = str(d['pubchem'])
                if pubchem_key in mycache:
                    (cid, iupac_name, names, mw, smi, inchi_val, inchikey,
                     formula) = mycache[pubchem_key]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula
                    mycache[pubchem_key] = (cid, iupac_name, names, mw, smi,
                                            inchi_val, inchikey, formula)
            else:
                cid = -1
                names = d.get('synonyms', [''])
                mw = float(d['MW'])
                smi = d.get('smiles', '')
                formula = d.get('formula', '')
                inchi_val = d.get('inchi', '')
                inchikey = d.get('inchikey', '')
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None

    if not failed_mol:
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''

        # Look the structure up by InChIKey: cached result first, then
        # PubChem. Any failure leaves the "unknown" defaults in place.
        try:
            cache_key = str(inchikey)
            if cache_key in mycache:
                cid, iupac_name, names = mycache[cache_key]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[cache_key] = (cid, iupac_name, names)
                except Exception:
                    # Remember the miss so it is not looked up again.
                    mycache[cache_key] = (-1, '', [''])
        except Exception:
            cid = -1
            iupac_name = ''
            names = ['']

    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        # Copy the list: the insert below must not modify the entry stored
        # in ``pdf_data``.
        syns = list(d.get('Other Names', []))
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])
        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS

    user_syns = []
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        user_syns = syn_data[CAS]['synonyms']

    actual_names = []
    for name in names:
        # A name present in the user db is only kept when the user assigned
        # it to this CAS number; names unknown to the user db are always kept.
        if name not in all_user_names or name in user_syns:
            actual_names.append(name)
    # Add the user's synonyms for this CAS number that are not present yet.
    for n in user_syns:
        if n not in actual_names:
            actual_names.append(n)
    actual_names = [i for i in actual_names if i]

    if inchi_val is not None:
        inchi_val = inchi_val.replace('InChI=1S/', '')

    formula = serialize_formula(formula)
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)
    s += '\t'.join(actual_names)
    print(s)
    return None
except Exception: print("Not able to remove stereochemistry. Chembl.") try: mol = standardise.run(mol) except standardise.StandardiseException as e: logging.warn(e.message) try: mol = s.standardize(mol) except Exception: print("Not able to standardize. Chembl.") try: mol = s.tautomer_parent(mol, skip_standardize=True) except Exception: print("Not able to make tautomer parent. Chembl.") mol = s.stereo_parent(mol, skip_standardize=True) chembl_help[lig][0] = inchi.MolToInchi(mol) #BDB preparing bdb_help = [] list_help = [] conn = psycopg2.connect('dbname=bdb user=data host=/tmp/') curs = conn.cursor() curs.execute( "select bdb_2_ligand_inchi, bdb_2_ligand_inchi from bdb_base B FULL OUTER JOIN bdb_extend E ON B.bdb_0_bindingdb_reactant_set_id = E.bdb_0_bindingdb_reactant_set_id where bdb_40_uniprot__swissprot__entry_name_of_target_chain IN ('THA_HUMAN', 'THB_HUMAN', 'RARA_HUMAN', 'RARB_HUMAN', 'RARG_HUMAN', 'PPARA_HUMAN', 'PPARD_HUMAN', 'PPARG_HUMAN', 'NR1D1_HUMAN', 'NR1D2_HUMAN', 'RORA_HUMAN', 'RORB_HUMAN', 'RORG_HUMAN', 'NR1H2_HUMAN', 'NR1H3_HUMAN', 'NR1H4_HUMAN', 'VDR_HUMAN', 'NR1I2_HUMAN', 'NR1I3_HUMAN', 'HNF4A_HUMAN', 'HNF4G_HUMAN', 'RXRA_HUMAN', 'RXRB_HUMAN', 'RXRG_HUMAN', 'NR2C1_HUMAN', 'NR2C2_HUMAN', 'NR2E1_HUMAN', 'NR2E3_HUMAN', 'COT1_HUMAN', 'COT2_HUMAN', 'NR2F6_HUMAN', 'ESR1_HUMAN', 'ESR2_HUMAN', 'ERR1_HUMAN', 'ERR2_HUMAN', 'ERR3_HUMAN', 'GCR_HUMAN', 'MCR_HUMAN', 'PRGR_HUMAN', 'ANDR_HUMAN', 'NR4A1_HUMAN', 'NR4A2_HUMAN', 'NR4A3_HUMAN', 'STF1_HUMAN', 'NR5A2_HUMAN', 'NR6A1_HUMAN', 'NR0B1_HUMAN', 'NR0B2_HUMAN') OR bdb_41_uniprot__swissprot__primary_id_of_target_chain IN ('P10827', 'P10828', 'P10276', 'P10826', 'P13631', 'Q07869', 'Q03181', 'P37231', 'P20393', 'Q14995', 'P35398', 'Q92753', 'P51449', 'P55055', 'Q13133', 'Q96RI1', 'P11473', 'O75469', 'Q14994', 'P41235', 'Q14541', 'P19793', 'P28702', 'P48443', 'P13056', 'P49116', 'Q9Y466', 'Q9Y5X4', 'P10589', 'P24468', 'P10588', 'P03372', 'Q92731', 'P11474', 'O95718', 'P62508', 'P04150', 'P08235', 'P06401', 
'P10275', 'P22736', 'P43354', 'Q92570', 'Q13285', 'O00482', 'Q15406', 'P51843', 'Q15466');" ) bdb = curs.fetchall() #BDB duplicates i = 0 while i < len(bdb): delete = False