def smiles_extraction(cas_keys, main):
    """Store a SMILES string for ``main`` in the module-level ``new_smiles``.

    The most frequent entry of ``cas_keys`` is looked up on PubChem by name
    and the isomeric SMILES of the first hit is stored under ``main``.  When
    ``mode`` raises ``statistics.StatisticsError``, every key that occurs more
    than once is looked up instead and the collected SMILES are stored as a
    list wrapped in a one-element list.

    Nothing is stored (and ``None`` is returned) for empty ``cas_keys``.
    """
    if len(cas_keys) == 0:
        return

    try:
        # Most frequent CAS key; its first PubChem hit is taken as best match.
        best_key = mode(cas_keys)
        hits = pcp.get_compounds(best_key, 'name')
        if len(hits) > 0:
            new_smiles[main] = hits[0].isomeric_smiles
    except statistics.StatisticsError:
        # No unique mode: fall back to every key seen more than once.
        repeated_keys = [key for key in set(cas_keys) if cas_keys.count(key) > 1]
        tied_smiles = []
        for key in repeated_keys:
            hits = pcp.get_compounds(key, 'name')
            if len(hits) > 0:
                tied_smiles.append(hits[0].isomeric_smiles)
        new_smiles[main] = [tied_smiles]
def save(self, *args, additional_data=None, cid2=False, **kwargs):
    """
    Fill in derived chemical fields from PubChem, then save the instance.

    Assumes that if the object does not have inchikey data it has a SMILES
    string.

    Args:
        additional_data: accepted for interface compatibility; not used here.
        cid2: when true and the SMILES has several '.'-separated components,
            the CID of the first component is stored in ``cid_number_2``.

    Raises:
        ValidationError: if PubChem returns no compound or rejects the query.
    """
    if not all([self.smiles, self.cid_number, self.chemical_properties]):
        try:
            if hasattr(self, 'inchikey'):
                pcp_data = pcp.get_compounds(self.inchikey, 'inchikey')[0]
            else:
                pcp_data = pcp.get_compounds(self.smiles, 'smiles')[0]
        except (IndexError, pcp.BadRequestError) as exc:
            raise ValidationError('Something went wrong') from exc
        if not self.iupac_name:
            self.iupac_name = pcp_data.iupac_name or 'n/a'
        self.smiles = pcp_data.isomeric_smiles or pcp_data.canonical_smiles or ''
        self.set_chemical_data(pcp_query=pcp_data)

    if not self.chemical_name:
        # The original `A or B if C else ''` parsed as `(A or B) if C else ''`
        # and threw the scraped name away whenever synonyms == 'n/a'.
        first_synonym = self.synonyms.split(',')[0] if self.synonyms != 'n/a' else ''
        self.chemical_name = self.scrape_compound_name(self.cid_number) or first_synonym

    if cid2 and len(self.smiles.split('.')) > 1:
        try:
            self.cid_number_2 = pcp.get_compounds(
                self.smiles.split('.')[0], 'smiles')[0].cid
        except (IndexError, pcp.BadRequestError):
            # Best effort only: the secondary CID is optional.
            pass

    # Values longer than these limits are blanked rather than saved.
    if len(self.smiles) > 200:
        self.smiles = ''
    if self.iupac_name and len(self.iupac_name) > 250:
        self.iupac_name = ''

    if hasattr(self, 'activity') and not self.activity and hasattr(
            self, 'category') and self.category == 1:
        act_find = FindActivity(self.chemical_name)
        self.activity = act_find.activity

    super(CompoundMixin, self).save(*args, **kwargs)
def FingerprintCOMP(a1, a2, key):
    """Print a similarity report for one PubChem property of two compounds.

    Both names are looked up on PubChem and the chosen property of the last
    hit of each lookup is compared with ``difflib.SequenceMatcher``.

    Args:
        a1: name of the first compound.
        a2: name of the second compound.
        key: "isomeric_smiles", "molecular_formula" or "fingerprint"; any
            other value is treated as "fingerprint".

    Raises:
        ValueError: if either name has no PubChem match (previously this
            surfaced as an UnboundLocalError).
    """
    if key in ("isomeric_smiles", "molecular_formula"):
        attr = key
    else:
        attr = "fingerprint"

    p = pcp.get_compounds(a1, "name")
    d = pcp.get_compounds(a2, "name")
    if not p:
        raise ValueError("No PubChem compound found for %r" % (a1,))
    if not d:
        raise ValueError("No PubChem compound found for %r" % (a2,))

    # The original loops kept overwriting the value, so the last hit wins.
    a = getattr(p[-1], attr)
    b = getattr(d[-1], attr)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Input 1 = " + a1)
    print(a)
    print(len(a))
    print("Input 2 = " + a2)
    print(b)
    print(len(b))

    s = difflib.SequenceMatcher(None, a, b)
    print(a1 + " and " + a2 + " Have a similarity score of " + str(
        s.ratio() * 100))
    for block in s.get_matching_blocks():
        print("Input 1 1[%d] and Input 2[%d] match for %d elements" % block)
def get_molecule_cid(file_name, test=False):
    """Return the PubChem compound for a file name, or None.

    The second "_"-separated token of ``file_name`` is tried first as a CAS
    number.  If that lookup fails, or the name has no such token, the last
    token with its extension removed is tried as a compound name.  None is
    returned when neither lookup works, so the molecule can be disregarded
    from training.

    Args:
        file_name: file name of the form ``<prefix>_<cas>_..._<name>.<ext>``.
        test: when true, print the CAS token and the resulting compound.
    """
    my_file = file_name.split("_")
    cid = None

    # Names without "_" used to raise IndexError; fall through to the name.
    if len(my_file) > 1:
        cas = my_file[1]
        if test:
            print("Get CID, CAS: ", cas)
        try:
            cid = pcp.get_compounds(cas, 'name')[0]
        except Exception:
            # No usable CAS number in the file name.
            cid = None

    if cid is None:
        my_name = my_file[-1].split(".")
        try:
            cid = pcp.get_compounds(my_name[0], 'name')[0]
        except Exception:
            cid = None

    if test:
        print("Get CID, CID: ", cid)
    return cid
def mol_prop_gen(dataframe, outputpickle):
    """Build a per-compound property/descriptor table and pickle it.

    For every row the InChI is looked up on PubChem, a fixed set of
    properties is taken from the first hit, an RDKit molecule is built from
    the isomeric SMILES and ``fps_plus_mw`` descriptors are computed.
    Descriptors and properties are joined column-wise into one row, and all
    rows are stacked and written with ``DataFrame.to_pickle``.

    Rows whose PubChem lookup fails are skipped with a 'line bypassed'
    message.  The previous implementation carried on with the preceding
    row's data, which silently duplicated it.

    Args:
        dataframe: frame with 'Name', 'InChI', 'RT' and 'System' columns.
        outputpickle: destination path for the pickle.
    """
    wanted = ['cactvs_fingerprint', 'isomeric_smiles', 'xlogp',
              'rotatable_bond_count', 'charge', 'complexity',
              'exact_mass', 'fingerprint']
    total = len(dataframe)
    frames = []

    for index, row in dataframe.iterrows():
        try:
            cmpd = pcp.get_compounds(row['InChI'], 'inchi')
            props = cmpd[0].to_dict(properties=wanted)
        except Exception:
            print('line bypassed')
            continue

        props['mol'] = Chem.MolFromSmiles(props['isomeric_smiles'])
        props['RT'] = row['RT']
        props['Name'] = row['Name']
        props['System'] = row['System']
        newdf = pd.DataFrame(props, index=[index])

        # One descriptor row, labelled like the property row so the
        # column-wise concat lines the two up.
        descdf = pd.DataFrame(np.array(fps_plus_mw(props['mol']))).T
        descdf.index = [index]

        frames.append(pd.concat([descdf, newdf], axis=1))
        print('on index ' + str(index + 1) + ' of ' + str(total))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    finaldf = pd.concat(frames) if frames else pd.DataFrame()
    finaldf.to_pickle(outputpickle)
def set_chem_data(self):
    """Enrich every entry of ``self.drugs_data`` with PubChem data.

    Each entry is looked up by its 'cid_number' and updated with 'smiles',
    'inchikey', 'iupac_name' and 'chemical_properties'.  Multi-component
    SMILES also get 'cid_number_2', the CID of the first component.  Entries
    whose lookup raises IndexError, TypeError or pcp.BadRequestError are
    dropped.

    Returns:
        ``self.drugs_data`` (the same list object, filtered in place).
    """
    kept = []
    for d in self.drugs_data:
        try:
            pcp_query = pcp.get_compounds(d['cid_number'], 'cid')[0]
            smiles = pcp_query.canonical_smiles
            d.update({
                'smiles': smiles,
                'inchikey': pcp_query.inchikey,
                'iupac_name': pcp_query.iupac_name
                or cirpy.resolve(smiles, 'iupac_name', ['smiles']),
                'chemical_properties': dict_from_query_object(
                    smiles, pcp_query, additional=True),
            })
            if len(smiles.split('.')) > 1:
                d.update({
                    'cid_number_2':
                    pcp.get_compounds(smiles.split('.')[0], 'smiles')[0].cid
                })
        except (IndexError, TypeError, pcp.BadRequestError):
            continue
        kept.append(d)

    # Removing from the list while iterating it skipped the entry after each
    # removal; rebuild the contents in place so existing references stay valid.
    self.drugs_data[:] = kept
    return self.drugs_data
def get_iupac_name_from_smiles(smiles):
    """Return the IUPAC name for ``smiles`` when PubChem has exactly one match.

    The lookup is retried once after a ``pcp.PubChemHTTPError``.  ``None`` is
    returned when the query yields zero or several compounds.
    """
    try:
        matches = pcp.get_compounds(smiles, 'smiles')
    except pcp.PubChemHTTPError:
        # One retry; a second failure propagates to the caller.
        matches = pcp.get_compounds(smiles, 'smiles')

    if len(matches) == 1:
        return matches[0].iupac_name
    return None
def convert_feature_to_standard_pubchem(df):
    """Resolve each row of ``df`` to PubChem compounds, trying three columns.

    Columns are tried in this order, each one only for rows that earlier
    columns did not resolve:

    1. column index 1 (values are looked up by name after stripping leading
       apostrophes), flagged 'from column B';
    2. column index 4, flagged 'from column E';
    3. column index 0, translated from 'nl' to 'en' with ``Translator`` before
       the lookup, flagged 'from column A'.

    Lookup results are written into the output frame by ``_fill_df_pubchem``
    (defined elsewhere).

    Returns:
        A frame with columns 'iupac', 'synonyms', 'compounds', 'formulae',
        'cid' and 'flag', initialised to NaN, one row per row of ``df``.
    """
    # create an empty df for later use
    dummyarray = np.empty((len(df), 6))
    dummyarray[:] = np.nan
    df0 = pd.DataFrame(
        dummyarray,
        columns=['iupac', 'synonyms', 'compounds', 'formulae', 'cid', 'flag'])
    # deal with second column with CAS number
    print('2nd')
    df2 = df.iloc[:, 1].copy()
    df2 = pd.Series(df2).replace('nan', np.nan)
    # df2[mask1] = None
    # NOTE(review): missing values are detected by identity with np.nan; this
    # presumably relies on the column holding the np.nan object itself - confirm.
    df2 = [
        str(string).lstrip("'") if string is not np.nan else np.nan
        for string in df2
    ]  # remove ' from the beginning of the cas number
    idx2 = [
        pcp.get_compounds(component, 'name') if component is not np.nan else []
        for component in df2
    ]
    df0 = _fill_df_pubchem(df0, idx2)
    # True where the lookup returned at least one compound
    mask2 = [not not elem for elem in idx2]
    df0.loc[mask2, 'flag'] = 'from column B'
    # deal with the column at index 4 (flagged as column E) with mix number
    print('3rd')
    df3 = df.iloc[:, 4].copy()
    # rows already resolved from column B are blanked so they are not queried again
    df3[mask2] = np.nan
    idx3 = [
        pcp.get_compounds(component, 'name') if component is not np.nan else []
        for component in df3
    ]
    df0 = _fill_df_pubchem(df0, idx3)
    mask3 = [not not elem for elem in idx3]
    df0.loc[mask3, 'flag'] = 'from column E'
    # deal with first column with given names
    print('1st')
    df1 = df.iloc[:, 0].copy()
    # skip rows resolved by either of the previous two passes
    df1[list(pd.Series(mask3) | pd.Series(mask2))] = np.nan
    name2trans = list(df1)
    # translate the remaining names from Dutch ('nl') to English ('en')
    name_transed_cls = [
        Translator().translate(name, src='nl', dest='en')
        if name is not np.nan else [] for name in name2trans
    ]
    # name_transed_cls = name2trans
    idx1 = [
        pcp.get_compounds(component.text, 'name') if not not component else []
        for component in name_transed_cls
    ]
    df0 = _fill_df_pubchem(df0, idx1)
    mask1 = [not not elem for elem in idx1]
    df0.loc[mask1, 'flag'] = 'from column A'
    return df0
def geometry_from_pubchem(name, structure=None):
    """Look up a molecule by name on PubChem and return its geometry.

    With ``structure=None`` the 3D record is preferred and the 2D record is
    used only when no 3D record is found.  This keeps the behaviour from
    before the ``structure`` argument existed.

    Args:
        name: a string giving the molecule's name as required by the PubChem
            database.
        structure: '2d' or '3d' to request one specific record type, or None
            (default) for "3D if available, otherwise 2D".  Recommended
            value is '3d'.

    Returns:
        geometry: a list of ``(element, (x, y, z))`` tuples with distances in
            Angstrom, or None when PubChem has no structure for ``name``.
            A missing z coordinate is reported as 0.

    Raises:
        ValueError: if ``structure`` is not '2d', '3d' or None.
    """
    import pubchempy

    def _fetch(record_type):
        return pubchempy.get_compounds(name, 'name', record_type=record_type)

    if structure is None:
        # Ideally the 3-D geometry; otherwise fall back to the 2-D one.
        pubchempy_molecule = _fetch('3d') or _fetch('2d')
    elif structure in ['2d', '3d']:
        pubchempy_molecule = _fetch(structure)
    else:
        raise ValueError('Incorrect value for the argument structure=%s' %
                         structure)

    # An empty list or None both mean that nothing was found.
    if not pubchempy_molecule:
        print(
            'Unable to find structure info in the PubChem database for the specified molecule "%s".'
            % name)
        return None

    atoms = pubchempy_molecule[0].to_dict(properties=['atoms'])['atoms']
    geometry = []
    for atom in atoms:
        position = (atom['x'], atom['y'], atom.get('z', 0))
        geometry.append((atom['element'], position))
    return geometry
def get_iupac_name_from_smiles(smiles):
    """Return the IUPAC name for ``smiles`` when PubChem has exactly one match.

    The lookup is retried once after a ``pcp.PubChemHTTPError``.  ``None`` is
    returned when the query yields zero or several compounds.

    Raises:
        RuntimeError: if the module-level ``pcp`` is falsy.
    """
    if not pcp:
        raise RuntimeError('No Pubchempy')

    try:
        matches = pcp.get_compounds(smiles, 'smiles')
    except pcp.PubChemHTTPError:
        # One retry; a second failure propagates to the caller.
        matches = pcp.get_compounds(smiles, 'smiles')

    if len(matches) == 1:
        return matches[0].iupac_name
    return None
def process_bioactive_identifier(request):
    """Resolve a CAS number or InChIKey from the query string to compound data.

    If a matching ``Bioactive`` already exists, its URL and name are returned.
    Otherwise the identifier is resolved through CIRpy/PubChem and the
    compound details are returned.  An 'error' payload is returned when the
    lookup raises IndexError or pcp.BadRequestError, or yields no CID.

    Returns:
        JsonResponse in all cases.
    """
    params = request.GET
    cas_no = params.get('cas_number')
    inchikey = params.get('inchikey', '').strip()

    # Existing record?  The CAS number takes precedence over the InChIKey.
    if cas_no:
        existing = Bioactive.objects.filter(
            chemical_properties__synonyms__icontains=cas_no).first()
    elif inchikey:
        existing = Bioactive.objects.filter(inchikey__exact=inchikey).first()
    else:
        existing = None

    if existing:
        return JsonResponse({
            'object_exists': existing.get_absolute_url(),
            'object_exists_name': str(existing),
        })

    try:
        iupac_name = None
        if cas_no:
            smiles = cirpy.query(cas_no, 'smiles')[0].value
            if '.' in smiles:
                # Multi-component SMILES: first component longer than 5 chars.
                smiles = [part for part in smiles.split('.') if len(part) > 5][0]
            pcp_query = pcp.get_compounds(smiles, 'smiles')[0]
            if not pcp_query.iupac_name:
                iupac_name = cirpy.resolve(smiles, 'iupac_name', ['smiles'])
        else:
            pcp_query = pcp.get_compounds(inchikey, 'inchikey')[0]
            if not pcp_query.iupac_name:
                iupac_name = cirpy.resolve(inchikey, 'iupac_name',
                                           ['stdinchikey'])
        if not pcp_query.cid:
            # A hit without a CID is handled like "nothing found".
            raise IndexError
    except (IndexError, pcp.BadRequestError):
        return JsonResponse({'error': 'No compound found for this CAS number'})

    return JsonResponse({
        'chemical_name': Bioactive.scrape_compound_name(pcp_query.cid),
        'iupac_name': pcp_query.iupac_name or iupac_name or 'n/a',
        'inchikey': pcp_query.inchikey,
        'structure_url':
        'https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={}&t=l'.
        format(pcp_query.cid),
        'hidden_cid': pcp_query.cid,
        'smiles': pcp_query.isomeric_smiles or pcp_query.canonical_smiles or '',
    })
def _load_from_pubchem(self, name):
    '''
    Method used to load data from pubchem

    name - name of compound to search; a value that converts to int is
           treated as a PubChem CID

    The 3d record is tried first, then the 2d record. On success self.name
    and self.atoms are set, 3d structures are also written to an xyz file
    under <cwd>\\Molecules, and self._mol_load_finish() is called.

    returns nothing
    '''
    import pubchempy as pcp

    try:
        name = int(name)
    except (TypeError, ValueError):
        # Not numeric: keep it as a compound name.
        pass
    namespace = 'cid' if type(name) is int else 'name'

    record_type = '3d'
    mol = pcp.get_compounds(name, namespace, record_type=record_type)

    if len(mol) == 0:
        # These messages lacked the f-prefix and printed a literal "{name}".
        utils.message(
            f'Error: Could not find 3d structure of {name}... Attempting to find 2d structure...',
            'red')
        record_type = '2d'
        mol = pcp.get_compounds(name, namespace, record_type=record_type)

    if len(mol) == 0:
        utils.message(f'Error: No structural data found for {name}.', 'red')
        return

    mol = mol[0]
    coords = np.asarray([[a.x, a.y, a.z] for a in mol.atoms])
    # `== None` is deliberate: it is an element-wise numpy comparison that
    # replaces missing coordinates (e.g. z of a 2d record) by 0.
    coords = np.where(coords == None, 0, coords).astype(float)
    elements = np.asarray([a.element for a in mol.atoms])

    # A CID is an int at this point; str() keeps capitalize()/lower() valid.
    label = str(name)
    self.name = label.capitalize()
    self.atoms = [Atom(element, xyz) for element, xyz in zip(elements, coords)]

    if record_type == '3d':
        self.save_to_xyz(os.getcwd() + rf'\Molecules\{label.lower()}.xyz')

    self._mol_load_finish()
def getChemicalName(smiles):
    """Return the IUPAC name PubChem reports for ``smiles``.

    If the lookup fails for any reason a message is printed and the input
    SMILES string is returned unchanged.
    """
    try:
        name = pcp.get_compounds(smiles, "smiles")[0].iupac_name
        return name
    except Exception:
        # Best-effort conversion; `Exception` (not a bare except) so that
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Pubchempy could not convert SMILES to a IUPAC name")
        return smiles
def sid_to_smiles(sid):
    """Return the isomeric SMILES of the compound standardized from an SID.

    The substance is fetched by ``sid``, its ``standardized_cid`` is looked
    up, and the isomeric SMILES of the first compound returned is used.
    """
    standardized_cid = pc.Substance.from_sid(sid).standardized_cid
    matches = pc.get_compounds(standardized_cid)
    return matches[0].isomeric_smiles
def import_from_pubchem():
    """Write PubChem data for the compounds from ``read_csv`` to a workbook.

    For every compound (SMILES in position 0, a second value in position 1)
    PubChem is queried by SMILES and a PNG of the structure is downloaded to
    ``<CURR_PATH>/images/<smiles>.png``.  Each PubChem hit becomes one row in
    the 'results' sheet of 'to_database.xlsx': SMILES, the second input
    value, IUPAC name and molecular formula.  Rows start at index 1.
    """
    compounds = read_csv()

    # The workbook is created relative to the current working directory.
    workbook = xlsxwriter.Workbook(filename='to_database.xlsx')
    # One worksheet holds all results.
    worksheet = workbook.add_worksheet(name='results')

    next_row = 1
    print('\nEstabelecendo conexão com o PubChem...')

    for comp in compounds:
        # Fetch the PubChem data for this compound.
        hits = pcp.get_compounds(comp[0], 'smiles')

        # Download the compound image.
        image_path = os.path.join(CURR_PATH, 'images', comp[0] + '.png')
        pcp.download('PNG', image_path, comp[0], 'smiles', overwrite=True)

        # One row per hit: SMILES, second input value, IUPAC name, formula.
        for hit in hits:
            print('\nComposto ' + hit.iupac_name)
            cells = (comp[0], comp[1], hit.iupac_name, hit.molecular_formula)
            for column, value in enumerate(cells):
                worksheet.write(next_row, column, value)
            next_row += 1

    workbook.close()
    print('Pronto! Compostos Atualizados')
def get_compound_from_smiles(smiles, attempts=5, time_delay=1):
    """Fetch the PubChem compound for a SMILES string, retrying on failure.

    Args:
        smiles: SMILES string to look up.
        attempts: maximum number of tries (default 5, the previous
            hard-coded value).
        time_delay: seconds to wait between tries (default 1).

    Returns:
        The ``pcp.Compound`` for the first hit, or None when the first hit
        has no CID or every attempt failed.
    """
    while attempts >= 1:
        try:
            compounds = pcp.get_compounds(smiles, namespace='smiles')
            cid = compounds[0].cid
            if cid is None:
                # https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html
                print('No PubChem record')
                return None
            compound = pcp.Compound.from_cid(cid)
        except Exception:
            attempts -= 1
            print('Could not get compound. ' + str(attempts) +
                  ' attempts remaining.')
            # Waiting after the final failure would only delay the return.
            if attempts >= 1:
                time.sleep(time_delay)
        else:
            return compound

    print('Failed to get compound from smiles after exhausting all attempts')
    return None
def get_smiles(idx, names=None, cids=None, sids=None, binding_db=None):
    """Return a SMILES string for entry ``idx``, trying several sources in turn.

    Sources, in order; the first one that does not raise wins:

    1. ``binding_db`` row whose 'PubChem CID' equals ``cids[idx]``;
    2. ``binding_db`` row whose 'PubChem SID' equals ``sids[idx]``;
    3. PubChem compound for ``cids[idx]``;
    4. the source id of PubChem substance ``sids[idx]``, resolved through
       ``CompoundResource`` and, failing that, through the
       'ChEMBL ID of Ligand' column of ``binding_db``;
    5. PubChem name search for ``names[idx]``.

    Returns:
        The SMILES string, or ``np.nan`` when every source fails.
    """

    def _binding_db_smiles(column, value):
        matches = binding_db.loc[binding_db[column] == value]
        return matches["Ligand SMILES"].values[0]

    def _local_by_cid():
        return _binding_db_smiles('PubChem CID', int(cids[idx]))

    def _local_by_sid():
        return _binding_db_smiles('PubChem SID', int(sids[idx]))

    def _pubchem_by_cid():
        return str(Compound.from_cid(int(cids[idx])).isomeric_smiles)

    def _via_source_id():
        chembl_id = Substance.from_sid(sids[idx]).source_id
        print(chembl_id)
        try:
            return CompoundResource().get(chembl_id)["smiles"]
        except Exception:
            return _binding_db_smiles("ChEMBL ID of Ligand", chembl_id)

    def _pubchem_by_name():
        return get_compounds(names[idx], 'name')[0].isomeric_smiles

    strategies = (_local_by_cid, _local_by_sid, _pubchem_by_cid,
                  _via_source_id, _pubchem_by_name)
    for strategy in strategies:
        try:
            return strategy()
        except Exception:
            # Any failure (missing input, no match, network error) just means
            # "try the next source"; KeyboardInterrupt is no longer swallowed.
            continue
    return np.nan
def get_name_from_pubchem(self, smiles: str) -> Optional[str]:
    """Look up a name for a molecule on PubChem.

    The first PubChem hit supplies two candidates: its first synonym
    ("traditional") and its IUPAC name ("iupac").  The first non-empty
    candidate in the order given by ``self.name_preference`` is lower-cased
    and passed through ``self._process_match``.

    Args:
        smiles: A SMILES string.

    Returns:
        The result of ``self._process_match``, or ``None`` if PubChem rejects
        the query or has no hit.
    """
    try:
        comp = get_compounds(smiles, namespace="smiles")[0]
    except (BadRequestError, IndexError):
        return None

    synonyms = comp.synonyms
    candidates = {
        "traditional": synonyms[0] if synonyms else None,
        "iupac": comp.iupac_name,
    }

    # First preferred source that actually has a (non-empty) name.
    match = next(
        (candidates[source] for source in self.name_preference
         if candidates.get(source)),
        None,
    )
    if isinstance(match, str):
        match = match.lower()

    return self._process_match(smiles, match)
def process_cas(request):
    """Resolve the 'cas_number' query parameter to compound data.

    If an ``Odorant`` with that CAS number exists, its URL and name are
    returned.  Otherwise the CAS number is resolved to SMILES with CIRpy and
    then to a PubChem compound.

    Returns:
        JsonResponse with either the existing object, the compound details,
        or an 'error' entry when nothing usable was found.
    """
    cas_no = request.GET.get('cas_number')

    try:
        obj = Odorant.objects.get(cas_number__exact=cas_no)
    except ObjectDoesNotExist:
        pass
    else:
        return JsonResponse({
            'object_exists': obj.get_absolute_url(),
            'object_exists_name': str(obj),
        })

    not_found = {'error': 'No compound found for this CAS number'}

    try:
        smiles = cirpy.query(cas_no, 'smiles')[0].value
        pcp_query = pcp.get_compounds(smiles, 'smiles')[0]
        cid_no = pcp_query.cid
    except (IndexError, pcp.BadRequestError):
        # BadRequestError is what PubChem raises for a SMILES it rejects.
        return JsonResponse(not_found)

    # Without this the view fell off the end and returned None, which Django
    # rejects as an invalid response.
    if not (smiles and cid_no):
        return JsonResponse(not_found)

    return JsonResponse({
        'chemical_name': Odorant.scrape_compound_name(cid_no),
        'iupac_name': pcp_query.iupac_name,
        'structure_url':
        'https://pubchem.ncbi.nlm.nih.gov/image/imgsrv.fcgi?cid={}&t=l'.format(cid_no),
        'hidden_cid': cid_no,
        'smiles': smiles,
    })
def cid_df_to_smiles(df, cid_colname):
    """Add a 'SMILES' column looked up from a column of PubChem identifiers.

    Args:
        df : pandas dataframe with the identifiers
        cid_colname (str) : name of the column holding the identifiers that
            are passed to ``pc.get_compounds``

    Returns:
        df : the same dataframe, modified in place, with a 'SMILES' column
            holding the canonical SMILES, or 'none' where the lookup failed
        unsuccessful_list : identifiers for which no SMILES was found
    """
    res = []
    unsuccessful_list = []

    # Iterate the column directly: DataFrame.iterrows() upcasts each row to a
    # single dtype, which can turn integer identifiers into floats.
    for cid in df[cid_colname]:
        try:
            compound = pc.get_compounds(cid)[0]
            res.append(compound.canonical_smiles)
        except Exception:
            # Exception rather than BaseException, so Ctrl-C still works.
            res.append('none')
            unsuccessful_list.append(cid)

    df['SMILES'] = res
    return df, unsuccessful_list
def _set_inchi_pcc(self, in_str, pcp_type, elem):
    """Copy PubChem details for ``in_str`` into ``self.compound_info``.

    PubChem is queried with ``in_str`` in namespace ``pcp_type`` and hit
    number ``elem`` is used to fill 'inchikey_id', 'pubchem_id',
    'molecular_formula', 'molecular_weight', 'exact_mass' and 'smiles'.
    A warning is printed when the query has more than one hit.

    Returns:
        0 when ``in_str`` is empty or the query raised one of the handled
        errors (which is printed); otherwise None.
    """
    if not in_str:
        return 0

    try:
        pccs = pcp.get_compounds(in_str, pcp_type)
    except (pcp.BadRequestError, pcp.TimeoutError, pcp.ServerError,
            URLError, BadStatusLine) as err:
        print(err)
        return 0

    if not pccs:
        return None

    pcc = pccs[elem]
    field_sources = (
        ('inchikey_id', 'inchikey'),
        ('pubchem_id', 'cid'),
        ('molecular_formula', 'molecular_formula'),
        ('molecular_weight', 'molecular_weight'),
        ('exact_mass', 'exact_mass'),
        ('smiles', 'canonical_smiles'),
    )
    for info_key, attribute in field_sources:
        self.compound_info[info_key] = getattr(pcc, attribute)

    if len(pccs) > 1:
        print('WARNING, multiple compounds for ', self.compound_info)
    return None
def download_from_list(self, name_list, type="name"):
    """Given a list of compounds, download them

    Parameters
    ----------
    name_list
        Iterable of identifiers to look up.
    type
        PubChem namespace of the identifiers (default "name").

    Returns
    -------
    list
        One entry per identifier, in input order: the first compound found,
        or None when the lookup failed or returned nothing.
    """
    results = []
    for n in name_list:
        try:
            r = get_compounds(n, namespace=type)
        except Exception:
            # Best effort: report and continue.  `Exception` instead of a
            # bare except so Ctrl-C is not swallowed.
            print("Ligand %s not found" % n)
            r = []
        results.append(r[0] if len(r) else None)
    return results
def getCompoundsInfo(ChemIDList, MoleculeNameList):
    """Look up molecules by name on PubChem and export the hits to Excel.

    ``ChemIDList`` and ``MoleculeNameList`` are walked in parallel.  Every
    PubChem hit for a molecule adds one record (ChemID, name, molecular
    formula, isomeric SMILES, CID).  After each molecule the full table
    collected so far is rewritten to ``outputPath + "SmilesResult.xlsx"``, a
    progress line is printed and the function sleeps for 3 seconds.
    """
    columns = {
        'ChemID': [],
        'CompoundName': [],
        'molecular_formula': [],
        'smiles': [],
        'cid': [],
    }

    for position, (chemID, molecule) in enumerate(zip(ChemIDList, MoleculeNameList)):
        hits = pubchempy.get_compounds(molecule, namespace='name')
        for hit in hits:
            formula = hit.molecular_formula
            smiles = hit.isomeric_smiles
            cid = hit.cid
            columns['ChemID'].append(chemID)
            columns['CompoundName'].append(molecule)
            columns['molecular_formula'].append(formula)
            columns['smiles'].append(smiles)
            columns['cid'].append(cid)

        # The whole file is rewritten each time with everything so far.
        table = pd.DataFrame(columns)
        table.to_excel(outputPath + "SmilesResult.xlsx", index=False)
        print(f'{position}-{chemID}:{molecule} write to excel file successfully!')
        time.sleep(3)
def apply_get_compounds(mol_name):
    """Return the isomeric SMILES of the first PubChem hit for ``mol_name``.

    An empty string is returned when the lookup fails or has no hit.
    """
    try:
        iso_smiles = pcp.get_compounds(mol_name, 'name')[0].isomeric_smiles
    except Exception:
        # Best effort; `Exception` (not a bare except) keeps Ctrl-C working.
        iso_smiles = ''
    return iso_smiles
def get_from_pubchem(name, path=structures_folder):
    '''
    Function that downloads a molecule from pubchem
    Save the molecule to path

    name - name of molecule
    path - path to save molecule to

    returns the path of the written xyz file
    raises Exception if PubChem has no 3d record for name
    '''
    import pubchempy as pcp

    mols = pcp.get_compounds(name, 'name', record_type='3d')

    # The message used to reference an undefined `mol`, so a NameError was
    # raised instead of this exception.
    if len(mols) == 0:
        raise Exception(f'Molecule {name} not found on PubChem or on disk.')

    coords = np.asarray([[a.x, a.y, a.z] for a in mols[0].atoms])
    # `== None` is deliberate: element-wise comparison that zeroes missing
    # coordinates.
    coords = np.where(coords == None, 0, coords).astype(float)
    elements = np.asarray([a.element for a in mols[0].atoms])

    mol_path = path + '\\' + name + '.xyz'
    with open(mol_path, 'w+') as f:
        f.write(f'{len(elements)}\n')
        f.write('Downloaded from PubChem\n')
        for e, (x, y, z) in zip(elements, coords):
            f.write(
                f'{e: <2} \t {x: >8.5f} \t {y: >8.5f} \t {z: >8.5f}\n'
            )

    return mol_path
def get_molecular_weight(name):
    """Return the molecular weight PubChem reports for ``name``.

    If the lookup fails for any reason, 300 is returned as a placeholder.
    """
    try:
        cpd = pcp.get_compounds(name, "name")
        return cpd[0].to_dict(
            properties=["molecular_weight"])["molecular_weight"]
    except Exception:
        # Not a real molecular weight, but it is in the middle of the range
        # and therefore won't be misused in min/max calculations.
        return 300
def name2inchikey(odor_names):
    """Translate odor names to InChIKeys through a PubChem name search.

    Names with no match or with more than one match are skipped with a
    warning.

    Args:
        odor_names (iterable of strs): the odors to be translated to inchikeys

    Returns:
        odor_inchikeys (list of strs): the inchikeys for the input odors
        inchikey2name (dict: str -> str): for quickly getting the name for
            an inchikey
    """
    # TODO support single str inputs too
    odor_inchikeys = []
    inchikey2name = dict()

    for o in odor_names:
        matches = pcp.get_compounds(o, 'name')

        if len(matches) > 1:
            # `.format` used to bind only to the second literal of a `+`
            # concatenation, leaving the "{}" placeholder unfilled.
            print('WARNING: more than one pubchem match for {}. '
                  'ambiguous!'.format(o))
            continue
        elif len(matches) == 0:
            print('WARNING: no pubchem matches found for {}!'.format(o))
            continue

        match = matches[0]
        odor_inchikeys.append(match.inchikey)
        inchikey2name[match.inchikey] = o

    return odor_inchikeys, inchikey2name
def _default(self):
    """Print a table with the structure(s) found for the given name or id.

    With ``by_name`` set, PubChem is searched by name and each hit is listed
    with its first synonym, CID and InChI.  Otherwise the id is resolved
    through UniChem in the given namespace.  A message is written to stderr
    when nothing is found.
    """
    pargs = self.app.pargs

    if pargs.by_name:
        results = []
        for compound in pubchempy.get_compounds(pargs.name_or_id, 'name'):
            results.append([compound.synonyms[0], 'pubchem.compound',
                            compound.cid, compound.inchi])
    else:
        unichem = bioservices.UniChem()
        structure = unichem.get_structure(int(float(pargs.name_or_id)),
                                          pargs.namespace)
        results = []
        if structure:
            results.append(['', pargs.namespace, pargs.name_or_id,
                            structure['standardinchi']])

    if not results:
        print('Unable to find structure', file=sys.stderr)
        return

    # Column widths: widest cell, but never narrower than the header text.
    minimum_widths = (4, 9, 2, 9)
    lens = [
        max(floor, max(len(str(row[column])) for row in results))
        for column, floor in enumerate(minimum_widths)
    ]

    row_template = '{{:<{}}} {{:<{}}} {{:<{}}} {{:<{}}}'.format(*lens)
    print(row_template.format('Name', 'Namespace', 'ID', 'Structure'))
    print(row_template.format(*['=' * width for width in lens]))
    for result in results:
        print(row_template.format(*result))
def get_pubchem_by_name(dir):
    """Write a ``*_smiles`` companion file for every file in ``dir``.

    Files whose base name already contains "smiles" are skipped.  For each
    remaining file every line is split on ',' or ':' and the first of the
    last two fields is searched on PubChem by name.  The canonical SMILES of
    the first hit, if any, is written on its own line to
    ``<name>_smiles<ext>`` in the same directory.

    Args:
        dir: directory to process (the parameter name is kept for
            compatibility although it shadows the builtin).
    """
    files = [
        f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))
    ]
    for in_file in files:
        file_name, extension = os.path.splitext(str(in_file))
        if "smiles" in file_name:
            continue

        out_file = file_name + "_smiles" + extension
        # %-formatting reproduces the Python 2 print output and is valid on
        # Python 3 as well.
        print("processing: %s writing to: %s" % (in_file, out_file))

        # os.path.join instead of `dir + name`, which only worked when `dir`
        # ended with a path separator.
        with open(os.path.join(dir, in_file), 'r') as source, \
                open(os.path.join(dir, out_file), 'w') as target:
            mol_info = [re.split(',|:', line)[-2:] for line in source]

            smiles_lines = []
            for molecule in mol_info:
                query_results = pubchempy.get_compounds(molecule[0], "name")
                if len(query_results) != 1:
                    print("Query for: %s yielded %s results" %
                          (molecule[0], len(query_results)))
                if len(query_results) > 1:
                    print("Using 1st result:  %s" %
                          query_results[0].canonical_smiles)
                if len(query_results) > 0:
                    smiles_lines.append(
                        query_results[0].canonical_smiles + '\n')

            target.write("".join(smiles_lines))
def convertsmiles():
    """Look up the entered name on PubChem and show its SMILES and drawing.

    Clears the SMILES/sol/lip/sasc fields, searches PubChem for the name in
    ``t_name``, puts the isomeric SMILES of the first hit in ``t_smiles``,
    renders it to 'tmp.png' with RDKit and draws that image on ``canvas``.
    The image is kept in the module-level ``image_``.
    """
    global image_

    for field in (t_smiles, t_sol, t_lip, t_sasc):
        field.set('')

    molecule = pcp.get_compounds(t_name.get(), 'name')
    print('molecule')
    first_hit = molecule[0]
    print(first_hit)
    #print('canocical_smile', molecule[0].canonical_smiles)
    print('isomeric_smile', first_hit.isomeric_smiles)

    # The canonical SMILES is read as before but not used further.
    mol_canonical_smiles = first_hit.canonical_smiles
    mol_isomeric_smiles = first_hit.isomeric_smiles
    t_smiles.set(mol_isomeric_smiles)

    rdkit_mol = Chem.MolFromSmiles(mol_isomeric_smiles)
    Draw.MolToFile(rdkit_mol, 'tmp.png')

    picture = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(picture, master=frame1)
    canvas.create_image(150, 75, image=image_)
def product_prediction_algorithm(target_smiles_list, df, row):
    """Predict products for the targets and flag those unknown to PubChem.

    ``explore_substrate`` is applied to ``target_smiles_list[x]`` for each
    ``x`` in ``range(len(df))``, always with reaction ``row`` of ``df``.
    Every product is then searched on PubChem by SMILES.  A product whose
    search result prints as '[Compound()]' (a hit without CID) is reported
    as novel.

    Returns:
        Results: DataFrame with native substrate/product, product SMILES,
            novelty flag and enzyme, de-duplicated on 'Products'.
        product_pictures: RDKit grid image of all products.
    """
    start = timeit.default_timer()
    substrates = []
    products = []
    results = []
    enzymes = []
    for x in range(len(df)):
        try:
            results.append(explore_substrate(target_smiles_list[x], row, df))
            # NOTE(review): `irow` is a legacy pandas accessor; kept because
            # the pandas version this file targets is not visible here.
            substrates.append(df['Substrates'].irow(row))
            products.append(df['Products'].irow(row))
            enzymes.append(df['Enzymes'].irow(row))
        except Exception:
            # Best effort: targets that cannot be processed are skipped.
            continue
    stop = timeit.default_timer()

    #unique_prods, unique_substrates, unique_products, unique_enzymes = filter_results(results, substrates, products, TestUni, enzymes)
    unique_prods, unique_substrates, unique_products, unique_enzymes = results, substrates, products, enzymes

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Finished reactions... searching hits on PubChem... ")

    unique_prods_smiles = [Chem.MolToSmiles(i) for i in unique_prods]

    #search pubchem for unique hits
    novel_compounds = []
    novel_smiles = set()
    for mol, smi in zip(unique_prods, unique_prods_smiles):
        try:
            # The SMILES is the identifier and 'smiles' the namespace.  The
            # old call passed 'CanonicalSMILES' as identifier, so every
            # search failed and nothing was ever reported as novel.
            searches = pcp.get_compounds(str(smi), 'smiles')
            if str(searches) == '[Compound()]':
                novel_compounds.append(mol)
                novel_smiles.add(smi)
        except Exception:
            # A failed search leaves the product marked as not novel.
            pass

    #put things in a df
    novels = [
        'Novel' if smi in novel_smiles else 'Found in Pubchem'
        for smi in unique_prods_smiles
    ]

    Results = pd.DataFrame({'Native Substrate': unique_substrates,
                            'Native Product': unique_products,
                            'Products': unique_prods_smiles,
                            'Novel Compound?': novels,
                            'Enzymes': unique_enzymes})
    # NOTE(review): `cols=` is the legacy spelling of `subset=`; kept for the
    # same reason as `irow` above.
    Results = Results.drop_duplicates(cols=['Products'])

    print('Novel Compounds found...')
    print(len(novel_compounds))
    print("Runtime...")
    print(stop - start)

    product_pictures = Draw.MolsToGridImage(unique_prods, molsPerRow=8,
                                            includeAtomNumbers=False)
    return Results, product_pictures
def chunks_get_compounds(l, n):
    """Yield ``pcb.get_compounds`` results for successive n-sized chunks of l.

    Chunking adapted from Stack Overflow.  A chunk whose lookup raises is
    reported with ``print`` and skipped.

    Args:
        l: sequence of identifiers.
        n: chunk size.
    """
    # `range`, `except ... as` and print(...) are valid on Python 2.6+ and
    # Python 3; the previous spelling was a SyntaxError on Python 3.
    for i in range(0, len(l), n):
        try:
            yield pcb.get_compounds(l[i:i + n])
        except Exception as e:
            print(e)
def pubchem_search(comp_name, search_type='name'):
    """Search PubChem for a compound and return its main identifiers.

    Only the first record returned by PubChem is used.  Any error during the
    search is logged and whatever was collected up to that point is returned.

    Args:
        comp_name: identifier to search for.
        search_type: PubChem namespace of ``comp_name`` (default 'name').

    Returns:
        Tuple ``(iupac, inchi, inchi_key, smiles, cid, formula, synonyms,
        structure)``.  Fields default to '' when nothing was found;
        ``synonyms`` is a ';'-joined string of the synonyms accepted by
        ``get_relevant_synonym``; ``structure`` is always ''.
    """
    iupac = ''
    inchi = ''
    inchi_key = ''
    smiles = ''
    cid = ''
    formula = ''
    synonyms = ''
    structure = ''

    try:
        compound = None
        # For this to work on Mac, run: cd "/Applications/Python 3.6/"; sudo "./Install Certificates.command
        # or comment out the line below:
        # ssl._create_default_https_context = ssl._create_unverified_context  # If no root certificates installed
        pubchem_compound = get_compounds(comp_name, namespace=search_type)
        try:
            # Only read the first record from PubChem = preferred entry
            compound = pubchem_compound[0]
        except IndexError:
            # Nothing was found
            logger.info('Could not find PubChem compound for ' + comp_name)

        if compound:
            inchi = compound.inchi
            inchi_key = compound.inchikey
            smiles = compound.canonical_smiles
            # A record without IUPAC name used to raise AttributeError here
            # and abort the lookup with the result only half filled in.
            iupac = compound.iupac_name or ''
            iupac = iupac.replace('~', '').replace('{', '').replace('}', '')
            cid = compound.cid
            formula = compound.molecular_formula

            synonyms = ';'.join(
                synonym for synonym in compound.synonyms
                if get_relevant_synonym(synonym))

            logger.debug('Searching PubChem for "' + comp_name + '", got cid "'
                         + str(cid) + '" and iupac name "' + iupac + '"')
    except Exception as error:
        logger.error("Unable to search PubChem for compound " + comp_name)
        logger.error(error)

    return iupac, inchi, inchi_key, smiles, cid, formula, synonyms, structure
import pandas as pd
import pubchempy as pcp

# Look up a canonical SMILES on PubChem for every name in SAMPL4.csv, print
# each name with its SMILES, then print the table with the new NSMILES column
# as CSV.
df = pd.read_csv('SAMPL4.csv')

# Each name is queried once; the result is reused for both the progress
# output and the new column (previously every name was queried twice).
nsmiles = []
for oname in df.NAME:
    cs = pcp.get_compounds(oname, 'name')
    first_smiles = cs[0].canonical_smiles
    # %-formatting gives the same "name smiles" line on Python 2 and 3.
    print("%s %s" % (oname, first_smiles))
    nsmiles.append(first_smiles)

df['NSMILES'] = nsmiles
print(df.to_csv(index=False))
# Notebook fragment: assembles the training table from positive and negative
# SMILES.  NOTE(review): `positive_smiles`, `negative_smiles`, `activity` and
# `compound_names` are not defined in this fragment - presumably created in
# earlier cells; confirm.

# All SMILES, positives first, then negatives.
smiles = []
for i in positive_smiles:
    smiles.append( i )
for i in negative_smiles:
    smiles.append( i )
# Activity labels in the same order: 1 per positive, 0 per negative.
for i in range(len(positive_smiles)):
    activity.append( 1 )
for i in range(len(negative_smiles)):
    activity.append( 0 )
# IUPAC names for the negatives; any failure is recorded as 'error parsing'.
for i in negative_smiles:
    try:
        results = pcp.get_compounds(str(i), 'smiles')
        # The CID is cut out of the text form of the first hit (characters 9
        # up to the last one) - presumably 'Compound(<cid>)'; confirm.
        cpn = pcp.Compound.from_cid( int(str(results[0])[9:-1]) )
        compound_names.append( cpn.iupac_name )
    except:
        compound_names.append( 'error parsing' )
# <codecell>
training_set = pd.DataFrame({'Smiles': smiles, 'Activity': activity, 'Compound Name': compound_names})
# <codecell>
import numpy as np
import pandas as pd
from random import sample
def get_name(name):
    """Return the isomeric SMILES of the first PubChem hit for ``name``.

    A newline character is returned when the lookup raises ``IndexError``,
    i.e. when there is no hit.
    """
    matches = pcp.get_compounds(name, 'name')
    try:
        first_hit = matches[0]
        smiles = first_hit.isomeric_smiles
    except IndexError:
        smiles = "\n"
    return smiles
def rf_find_alternative_substrates(enzyme, dataset, threshold):
    """Find molecules in ``dataset`` classified as substrates of ``enzyme``.

    The known substrates of the enzyme are the positive examples.  The 12
    entries of the module-level ``endogenous_steroids`` with the lowest
    Tanimoto similarity to the first positive are the negative examples.
    Each entry of ``dataset`` is classified with ``rf_classifier``; entries
    classified as '[1]' that are not already in the training set are
    collected and canonicalized.

    Returns:
        values: canonicalized SMILES of the predicted substrates.
        training_set: DataFrame with 'Smiles', 'Activity' and
            'Compound Name' of the training examples.
    """
    # Known substrates of the enzyme and their SMILES.
    compound_names, positive_smiles = retrieve_enzyme_substrates(enzyme)

    # Molecules and 2D coordinates for positives and endogenous steroids.
    positive_structures = [Chem.MolFromSmiles(smi) for smi in positive_smiles]
    endogenous_structures = [Chem.MolFromSmiles(smi) for smi in endogenous_steroids]
    for structure in positive_structures:
        AllChem.Compute2DCoords(structure)
    for structure in endogenous_structures:
        AllChem.Compute2DCoords(structure)

    positive_fps = [AllChem.GetMorganFingerprintAsBitVect(s, 2) for s in positive_structures]
    endogenous_fps = [AllChem.GetMorganFingerprintAsBitVect(s, 2) for s in endogenous_structures]

    # Pair every endogenous structure with its similarity to the first
    # positive, ascending, and keep the 12 least similar as negatives.
    sims = DataStructs.BulkTanimotoSimilarity(positive_fps[0], endogenous_fps)
    ranked = sorted(zip(sims, endogenous_structures), reverse=False)
    negative_smiles = [Chem.MolToSmiles(structure) for _, structure in ranked[:12]]

    # Training data: positives (label 1) followed by negatives (label 0).
    smiles = list(positive_smiles) + negative_smiles
    activity = [1] * len(positive_smiles) + [0] * len(negative_smiles)

    # Names of the negatives are appended after the names of the positives.
    for smi in negative_smiles:
        try:
            hits = pcp.get_compounds(str(smi), 'smiles')
            cid_text = str(hits[0])[9:-1]
            compound_names.append(pcp.Compound.from_cid(int(cid_text)).iupac_name)
        except:
            compound_names.append('error parsing')

    training_set = pd.DataFrame({'Smiles': smiles, 'Activity': activity,
                                 'Compound Name': compound_names})

    mols = training_set['Smiles'].tolist()
    activities = training_set['Activity'].tolist()

    # Index-based access is kept so `dataset` is read exactly as before.
    values = []
    for position in range(len(dataset)):
        candidate_label = str(rf_classifier(mols, activities, dataset[position], threshold))
        if candidate_label != '[1]':
            continue
        if dataset[position] not in mols:
            values.append(dataset[position])

    values = canonicalize_smiles(values)
    return values, training_set
def cross_validate_sliding_threshold(ec_number):
    '''Cross validates the input enzyme across a sliding identity threshold.

    For each threshold in 0.0, 0.1, ..., 1.0 the training set is split 100
    times into a random 25% hold-out and a 75% remainder; the hold-out is
    scored with ``rf_validate`` against the remainder.  Each split is scored
    with ``difflib.SequenceMatcher.ratio`` between the predicted and actual
    activity lists.

    Returns:
        A list with the mean score of the splits for each threshold, in
        threshold order.
    '''
    # NOTE(review): ``ec_number`` is never read, and ``positive_smiles`` /
    # ``endogenous_steroids`` are not defined in this function -- presumably
    # module-level globals; confirm against the rest of the file.

    # Build molecules and give each one 2D coordinates.
    positive_structures = [Chem.MolFromSmiles(smi) for smi in positive_smiles]
    endogenous_structures = [Chem.MolFromSmiles(smi) for smi in endogenous_steroids]
    for mol in positive_structures:
        AllChem.Compute2DCoords(mol)
    for mol in endogenous_structures:
        AllChem.Compute2DCoords(mol)

    positive_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in positive_structures]
    endogenous_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in endogenous_structures]

    # Rank the endogenous steroids by similarity to the FIRST positive.
    # Sorting on the score alone keeps tied scores from falling back to
    # comparing molecule objects (a TypeError on Python 3).
    sims = DataStructs.BulkTanimotoSimilarity(positive_fps[0], endogenous_fps)
    nbrs = sorted(zip(sims, endogenous_structures), key=lambda pair: pair[0])

    # The 12 most dissimilar steroids (an arbitrary count) are the negatives.
    negative_structures = [pair[1] for pair in nbrs[:12]]
    negative_smiles = [Chem.MolToSmiles(mol) for mol in negative_structures]

    # Positives first, negatives second.
    smiles = list(positive_smiles) + negative_smiles
    activity = [1] * len(positive_smiles) + [0] * len(negative_smiles)

    # Names are looked up in the SAME order as ``smiles`` so every row of the
    # frame carries the name of its own molecule.  Any lookup failure is
    # recorded as 'error parsing'.
    compound_names = []
    for smi in smiles:
        try:
            cid = pcp.get_compounds(str(smi), 'smiles')[0].cid
            compound_names.append(pcp.Compound.from_cid(int(cid)).iupac_name)
        except Exception:
            compound_names.append('error parsing')

    training_set = pd.DataFrame({'Smiles': smiles,
                                 'Activity': activity,
                                 'Compound Name': compound_names})

    percentages = list(np.arange(0, 1.05, 0.10))
    holdout_size = int(len(training_set) * .25)
    validation_per_percent = []
    for iteration in percentages:
        cross_validation_scores = []
        for _ in range(100):
            # Random row labels to hold out; the rest is used for training.
            rindex = np.array(sample(range(len(training_set)), holdout_size))
            pulled_data = training_set.loc[rindex]
            pulled_smiles = pulled_data['Smiles'].tolist()
            training_set_culled = training_set.drop(rindex)

            actual_values = pulled_data['Activity'].tolist()
            mols = training_set_culled['Smiles'].tolist()
            activities = training_set_culled['Activity'].tolist()

            predicted_values = []
            try:
                for candidate in pulled_smiles:
                    if rf_validate(mols, activities, candidate) > iteration:
                        # NOTE(review): a held-out molecule that scores above
                        # the threshold but also occurs in the training rows
                        # records no prediction, so the two lists may differ
                        # in length -- hence SequenceMatcher rather than an
                        # element-wise comparison.
                        if candidate not in mols:
                            predicted_values.append(1)
                    else:
                        predicted_values.append(0)
                # Ratcliff/Obershelp similarity between prediction and truth.
                sm = difflib.SequenceMatcher(None, predicted_values, actual_values)
                cross_validation_scores.append(sm.ratio())
            except Exception:
                # Best effort: a split that cannot be scored is skipped.
                continue
        validation_per_percent.append(mean(cross_validation_scores))
    return validation_per_percent
# Read the record file named on the command line and pull out the values of
# its CH$NAME, CH$FORMULA, CH$IUPAC and CH$SMILES lines (MassBank-style tags,
# presumably), then cross-check the formula against PubChem.
names = []
formula = ""
inchi = ""
smiles = ""  # default so the name exists even without a CH$SMILES line
with open(sys.argv[1]) as record_file:
    for line in record_file:
        # Everything after the first space-separated token is the value.
        # NOTE(review): the pieces are joined with '' and not ' ', so spaces
        # inside a value are removed -- confirm this is intended.
        value = ''.join(line.split(" ")[1:]).strip()
        if "CH$NAME" in line:
            names.append(value.lower())
        if "CH$FORMULA" in line:
            formula = value
        if "CH$IUPAC" in line:
            inchi = value
        if "CH$SMILES" in line:
            smiles = value

# Resolve the InChI on PubChem; anything but exactly one hit aborts the run.
results = pcp.get_compounds(inchi, namespace=u'inchi')
if len(results) != 1:
    print("#results != 1; exiting")
    sys.exit(1)

# Report whether PubChem's molecular formula agrees with the record's.
if results[0].molecular_formula == formula:
    print('Formula matches ' + '\033[92m[OK]\033[0m')
else:
    print(results[0].molecular_formula)
    print(formula)
    print('Formulas different ' + '\033[93m[OK]\033[0m')

synonyms = [x.encode('utf-8').lower() for x in results[0].synonyms]
common_names = []
def get_pubchem_cids(com, user=None):
    """Return the PubChem CIDs known for, or derivable from, compound ``com``.

    CIDs already stored among ``com.identifiers`` are returned directly.
    Otherwise CIDs are derived from the compound's ChemSpider, KEGG and
    Wikidata identifiers; every CID found this way is also registered on
    ``com`` through ``Compound.add_identifier``.

    Args:
        com: Compound object exposing ``identifiers``.
        user: Passed on to ``Compound.inchi`` for the ChemSpider conversion.

    Returns:
        A list of CIDs (possibly empty).
    """
    filter_identifiers = gnomics.objects.auxiliary_files.identifier.filter_identifiers

    # 1) CIDs that are already attached to the compound.
    cid_array = []
    for iden in filter_identifiers(com.identifiers, ["cid", "pubchem cid", "pubchem compound", "pubchem compound id", "pubchem compound identifier"]):
        stored_cid = iden["identifier"]
        if stored_cid not in cid_array:
            cid_array.append(stored_cid)
    if cid_array:
        return cid_array

    # Identifiers already processed, shared by all conversions below.
    ids_completed = []

    # 2) ChemSpider: resolve the compound's InChI on PubChem.
    for iden in filter_identifiers(com.identifiers, ["chemspider", "chemspider id", "chemspider identifier", "cs id", "csid"]):
        if iden["identifier"] in ids_completed:
            if user is None:
                print("Cannot use ChemSpider conversion when user is None. Please create and pass a valid user with a ChemSpider security token to this method.")
            continue
        ids_completed.append(iden["identifier"])
        inchi = gnomics.objects.compound.Compound.inchi(com, user = user)
        for hit in pubchem.get_compounds(inchi, 'inchi'):
            gnomics.objects.compound.Compound.add_identifier(com, identifier=hit.cid, identifier_type="PubChem CID", language=None, source="PubChem")
            cid_array.append(hit.cid)

    # 3) KEGG: the KEGG entry links to a PubChem SID, which PUG REST maps to CIDs.
    for iden in filter_identifiers(com.identifiers, ["kegg compound", "kegg compound id", "kegg compound identifier", "kegg", "kegg compound accession", "kegg id", "kegg identifier", "kegg accession"]):
        if iden["identifier"] in ids_completed:
            continue
        ids_completed.append(iden["identifier"])
        for kegg_com in gnomics.objects.compound.Compound.kegg_compound_db_entry(com):
            pubchem_sid = str(kegg_com["DBLINKS"]["PubChem"])
            server = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
            ext = "/substance/sid/" + str(pubchem_sid) + "/JSONP"
            r = requests.get(server+ext, headers={"Content-Type": "application/json"})
            if not r.ok:
                print("Something went wrong when trying to access the PubChem PUG REST.")
                continue
            body = r.text
            try:
                # Strip the JSONP wrapper: keep the text between the first
                # "(" and the last ")".
                payload_start = body.index("(") + 1
                payload_end = body.rindex(")")
                decoded = json.loads(body[payload_start:payload_end])
                for temp_com in decoded["PC_Substances"][0]["compound"]:
                    if "id" not in temp_com:
                        continue
                    id_entry = temp_com["id"]
                    if not (id_entry["type"] == 1 and "id" in id_entry):
                        continue
                    if "cid" not in id_entry["id"]:
                        continue
                    found_cid = id_entry["id"]["cid"]
                    if found_cid in cid_array:
                        continue
                    cid_array.append(found_cid)
                    gnomics.objects.compound.Compound.add_identifier(com, identifier=found_cid, identifier_type="PubChem CID", source="KEGG", language=None)
            except ValueError:
                print("Input is not in a JSONP format.")

    # 4) Wikidata: read the "pubchem cid" property of each Wikidata object.
    for iden in filter_identifiers(com.identifiers, ["wikidata", "wikidata accession", "wikidata id", "wikidata identifier"]):
        if iden["identifier"] in ids_completed:
            continue
        ids_completed.append(iden["identifier"])
        for wikidata_object in gnomics.objects.compound.Compound.wikidata(com):
            found_array = gnomics.objects.auxiliary_files.wiki.wikidata_property_check(wikidata_object, "pubchem cid", wikidata_property_language = "en")
            for wiki_cid in found_array:
                if wiki_cid in cid_array:
                    continue
                cid_array.append(wiki_cid)
                gnomics.objects.compound.Compound.add_identifier(com, identifier = wiki_cid, identifier_type = "PubChem CID", language = None, source = "Wikidata")

    return cid_array
import pandas as pd

# Resolve every row of compound_names.csv by name via get_compounds and write
# one (compound, CID) row per hit to mydb.csv.
data = pd.read_csv("compound_names.csv")
outdf = pd.DataFrame(columns=('Compound Name', 'CID'))

# Running row label: one output row per hit, numbered across all compounds.
i = 0
for compound in data.values:
    hits = get_compounds(compound, 'name')
    for item in hits:
        outdf.loc[i] = [compound, item.cid]
        i += 1

outdf.to_csv('mydb.csv')
def prediction_algorithm2(target_smiles, df):
    '''Iterates through an entire df instead of a target smiles list.

    Every row of ``df`` (columns 'Substrates', 'Products', 'Enzymes') is
    checked with ``scan_atoms`` against the target; rows whose verdict is
    'proceed' are run through ``explore_substrate``.  The resulting products
    are looked up on PubChem and labelled novel or known.

    Side effect: a column 'Anabolic Compatible' is written onto ``df``.

    Args:
        target_smiles: Target molecule; converted with ``str()``.
        df: DataFrame of reactions to scan.

    Returns:
        ``(Results, product_pictures)``: a DataFrame of products
        (de-duplicated on 'Products') and the grid image of all products.
    '''
    start = timeit.default_timer()
    TestUni = df
    chol = str(target_smiles)

    # Pass 1: mark each reaction as compatible with the target or not.
    tally = []
    for z in range(len(TestUni)):
        try:
            t = Chem.MolFromSmiles(chol)
            s = Chem.MolFromSmiles(TestUni['Substrates'].iloc[z])
            p = Chem.MolFromSmiles(TestUni['Products'].iloc[z])
            tally.append(scan_atoms(t, s, p))
        except Exception:
            # Best effort: a row that cannot be scanned is simply excluded.
            tally.append('do not proceed')
        else:
            # Progress indicator every tenth successfully scanned row.
            if z % 10 == 0 and z != 0:
                print(z)
    TestUni['Anabolic Compatible'] = tally
    TestUni2 = TestUni[TestUni['Anabolic Compatible'] == 'proceed']
    print("Potential hits...")
    print(len(TestUni2))

    # Pass 2: apply each compatible reaction to the target.  All values for a
    # row are gathered before anything is appended so the parallel lists
    # cannot drift out of step when one lookup fails.
    anabolic = chol
    substrates = []
    products = []
    results = []
    enzymes = []
    for x in range(len(TestUni2)):
        try:
            outcome = explore_substrate(anabolic, x, TestUni2)
            substrate = TestUni2['Substrates'].iloc[x]
            product = TestUni2['Products'].iloc[x]
            enzyme = TestUni2['Enzymes'].iloc[x]
        except Exception:
            # Best effort: reactions that fail are skipped.
            continue
        results.append(outcome)
        substrates.append(substrate)
        products.append(product)
        enzymes.append(enzyme)
    stop = timeit.default_timer()

    unique_prods, unique_substrates, unique_products, unique_enzymes = results, substrates, products, enzymes
    print("Finished reactions... searching hits on PubChem... ")

    # Look every product up on PubChem by SMILES.  get_compounds takes
    # (identifier, namespace); a product counts as novel when no returned
    # record carries a CID.
    unique_prods_smiles = [Chem.MolToSmiles(mol) for mol in unique_prods]
    novel_compounds = []
    novel_smiles = set()
    for mol, smi in zip(unique_prods, unique_prods_smiles):
        searches = pcp.get_compounds(str(smi), 'smiles')
        if all(hit.cid is None for hit in searches):
            novel_compounds.append(mol)
            novel_smiles.add(smi)

    novels = ['Novel' if smi in novel_smiles else 'Found in Pubchem'
              for smi in unique_prods_smiles]

    Results = pd.DataFrame({'Native Substrate': unique_substrates,
                            'Native Product': unique_products,
                            'Products': unique_prods_smiles,
                            'Novel Compound?': novels,
                            'Enzymes': unique_enzymes})
    Results = Results.drop_duplicates(subset=['Products'])

    print('Novel Compounds found...')
    print(len(novel_compounds))
    print("Runtime...")
    # Runtime covers the two scanning passes only, not the PubChem lookups.
    print(stop - start)

    product_pictures = Draw.MolsToGridImage(unique_prods, molsPerRow=8, includeAtomNumbers=False)
    return Results, product_pictures
def search(query, user=None, search_type=None, source="chemspider", mass_plus_minus=0.001):
    """Search ChemSpider and/or PubChem for compounds matching ``query``.

    Args:
        query: Search term, interpreted according to ``search_type``.
        user: Object exposing ``chemspider_security_token``; needed for any
            ChemSpider search.
        search_type: ``None`` for a plain search, "formula" /
            "molecular formula" / "mass" for ChemSpider, or "substructure",
            "superstructure", "similarity", "identity", "smiles", "inchi",
            "sdf", "cid", "cas" for PubChem.
        source: "chemspider", "pubchem" or "all" (case-insensitive).
        mass_plus_minus: Tolerance handed to the ChemSpider mass search.

    Returns:
        A list of ``gnomics.objects.compound.Compound`` objects.  The PubChem
        search types "substructure", "superstructure", "similarity",
        "identity", "sdf" and "cid" instead return the result of
        ``pubchem.get_compounds`` unchanged.
    """
    def _chemspider_compound(csid):
        # Wrap a ChemSpider ID in a Compound object.
        return gnomics.objects.compound.Compound(identifier=csid, identifier_type="ChemSpider ID", source="ChemSpider", name=None)

    def _pubchem_compound(cid, name):
        # Wrap a PubChem CID in a Compound object.
        return gnomics.objects.compound.Compound(identifier=cid, identifier_type="PubChem CID", source="PubChem", name=name)

    def _request_name_synonyms():
        # Ask PUG REST for the synonyms of the compound(s) named ``query``.
        server = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        ext = "/compound/name/" + str(query) + "/synonyms/JSONP"
        return requests.get(server+ext, headers={"Content-Type": "application/json"})

    def _jsonp_payload(text):
        # Decode the JSON between the first "(" and the last ")" of a JSONP
        # response.  NOTE(review): a malformed response terminates the
        # interpreter via exit(), as the original code did.
        try:
            l_index = text.index("(") + 1
            r_index = text.rindex(")")
        except ValueError:
            print("Input is not in a JSONP format.")
            exit()
        return json.loads(text[l_index:r_index])

    result_set = []
    source_key = source.lower()

    if source_key in ("chemspider", "all"):
        if user is not None and user.chemspider_security_token is not None:
            cs = chemspider(user.chemspider_security_token)
            if search_type is None:
                hits = cs.search(query)
            elif search_type == "formula" or search_type == "molecular formula":
                hits = cs.simple_search_by_formula(query)
            elif search_type == "mass" and mass_plus_minus is not None:
                hits = cs.simple_search_by_mass(query, mass_plus_minus)
            else:
                print("No valid search type for ChemSpider was provided.")
                print("Continuing with search type 'None'...")
                return search(query, user=user, search_type=None, source="chemspider")
            for result in hits:
                result_set.append(_chemspider_compound(result.csid))
        elif source_key == "chemspider":
            # No usable token and ChemSpider was the only source requested.
            print("Searching with ChemSpider requires the creation of a User object with a valid ChemSpider security token. Information on obtaining such a token can be found here: 'http://www.chemspider.com/AboutServices.aspx?'.\n")
            print("Continuing with PubChem search...\n")
            return search(query, source="pubchem")

    if source_key in ("pubchem", "all"):
        if search_type is None:
            try:
                r = _request_name_synonyms()
                # Raises HTTPError (a RequestException) for every non-OK
                # response, which the handler below reports.
                r.raise_for_status()
                decoded = _jsonp_payload(r.text)
                for result in decoded["InformationList"]["Information"]:
                    result_set.append(_pubchem_compound(result["CID"], result["Synonym"][0]))
            except requests.exceptions.RequestException as e:
                print(e)
                print("No results found.")
        elif search_type in ("substructure", "superstructure", "similarity", "identity", "sdf", "cid"):
            # These search types hand back PubChemPy's result unchanged.
            return pubchem.get_compounds(query, search_type)
        elif search_type.lower() in ("smiles", "inchi"):
            for temp_com in pubchem.get_compounds(query, search_type.lower()):
                # Prefer the first synonym; fall back to the IUPAC name.
                if temp_com.synonyms:
                    name = temp_com.synonyms[0]
                else:
                    name = temp_com.iupac_name
                result_set.append(_pubchem_compound(temp_com.cid, name))
        elif search_type == "cas":
            r = _request_name_synonyms()
            if not r.ok:
                print("There was a problem attempting to access the PubChem PUG REST service.")
            else:
                decoded = _jsonp_payload(r.text)
                for result in decoded["InformationList"]["Information"]:
                    # Keep only records that list the query among their
                    # synonyms (one result per matching synonym).
                    for syn in result["Synonym"]:
                        if syn == query:
                            result_set.append(_pubchem_compound(result["CID"], result["Synonym"][0]))
        else:
            print("No valid search type for PubChem was provided.")
            print("Continuing with search type 'None'...")
            return search(query, user=None, search_type=None, source="pubchem")

    if source_key not in ("chemspider", "pubchem", "all"):
        print("No valid search source was provided.")
        if user is not None and user.chemspider_security_token is not None:
            print("Because user and ChemSpider security token are provided, continuing with ChemSpider search...")
            return search(query, user=user, search_type=None, source="chemspider")
        # Reached when user is None or carries no token.  The previous
        # condition dereferenced ``user`` here (AttributeError for None) and
        # could never be true otherwise, so this fallback never ran.
        print("Because either user not provided or ChemSpider security token is not valid, continuing with PubChem search...")
        return search(query, user=None, search_type=None, source="pubchem")

    return result_set