def _obtain_entry_api(self, search_text, name, output_format) -> Optional[str]:
    """Search PubChem by name and export the first hit that has a 3D record.

    What is produced depends on ``output_format`` (case-insensitive):

    * ``"smiles"`` - the canonical SMILES is printed, nothing is written.
    * ``"pdb"`` - the SDF is downloaded and converted with ``sdf_to_pdb``.
    * anything else - the record is downloaded directly in that format.

    Files are written to ``self.write_dir`` as ``<name>_<cid>.<ext>``.

    :param search_text: Text passed to the PubChem name search.
    :param name: Prefix for the generated file names.
    :param output_format: Requested output format.
    :return: The CID of the first hit as a string, or ``None`` when the
        search returned no CIDs.
    """
    matches = pcp.get_cids(search_text, "name", record_type="3d")
    if len(matches) == 0:
        print("No exact match found, please try the web search")
        return None

    cid = str(matches[0])
    fmt = output_format.lower()

    def target(extension):
        # Output path for this compound with the given file extension.
        return os.path.join(self.write_dir, name + "_" + cid + extension)

    if fmt == "smiles":
        compound = pcp.Compound.from_cid(int(cid))
        print("SMILES code:", compound.canonical_smiles)
    elif fmt == "pdb":
        sdf_file = target(".sdf")
        pdb_file = target(".pdb")
        pcp.download("SDF", sdf_file, cid, record_type="3d", overwrite=True)
        sdf_to_pdb(sdf_file, pdb_file)
    else:
        pcp.download(
            output_format.upper(),
            target("." + fmt),
            cid,
            record_type="3d",
            overwrite=True,
        )
    return cid
def get_pubchem_id(name):
    """Look up the first PubChem CID for a compound name.

    The name is echoed to stdout before the lookup. This is a best-effort
    helper: any lookup failure (no hit, HTTP error, ...) yields an empty
    string instead of raising.

    :param name: Compound name to search for.
    :return: Tuple ``(name, cid)``; ``cid`` is ``""`` when nothing was found
        or the request failed.
    """
    print(name)
    try:
        cpd_id = pcp.get_cids(name, "name")
        return name, cpd_id[0]
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` this
        # lets KeyboardInterrupt/SystemExit propagate.
        return name, ""
def scrape_super_rest(self, cids, match_isotopes=False, match_charges=False,
                      match_tautomers=False, rings_not_embedded=False,
                      single_double_bonds_match=True, chains_match_rings=True,
                      strip_hydrogen=False, stereo="ignore",
                      max_records=10000):
    """
    Run a PubChem superstructure search (molecules that contain the query
    molecule) for every CID in every category.

    The keyword arguments are forwarded unchanged to ``pcp.get_cids``; the
    parameter descriptions below are largely taken from
    http://pubchemdocs.ncbi.nlm.nih.gov/pug-rest

    :param cids: A dict {"category": [ids]}, where each category is a
        molecular type of interest.
    :param match_isotopes: Atoms must be of the same specified isotope.
    :param match_charges: Atoms must match the specified charge.
    :param match_tautomers: Allows matching with tautomers.
    :param rings_not_embedded: Rings may not be embedded in a larger system.
    :param single_double_bonds_match: In an aromatic compound, either single
        or double bonds may match the aromatic bonds.
    :param chains_match_rings: Chain bonds in the query may match rings in
        hits.
    :param strip_hydrogen: Remove explicit hydrogens before searching.
    :param stereo: How to handle stereoisomers: either "ignore", "exact",
        "relative", or "nonconflicting".
    :param max_records: Maximum number of hits.
    :return: A dict {"category": {id: [matches]}}, where each id in each
        category is stored along with its matches (which take the form of
        CIDs).
    """
    search_options = dict(
        match_isotopes=match_isotopes,
        match_charges=match_charges,
        match_tautomers=match_tautomers,
        rings_not_embedded=rings_not_embedded,
        single_double_bonds_match=single_double_bonds_match,
        chains_match_rings=chains_match_rings,
        strip_hydrogen=strip_hydrogen,
        stereo=stereo,
        max_records=max_records,
    )

    output = {}
    queries_run = 0
    for cat, members in cids.items():
        hits_by_cid = output[cat] = {}
        for cid in members:
            # check_queries receives the running count and returns the
            # value to carry into the next iteration.
            queries_run = self.check_queries(queries_run)
            hits_by_cid[cid] = pcp.get_cids(
                cid,
                namespace="cid",
                domain="compound",
                searchtype="superstructure",
                **search_options
            )
    return output
def form2():
    """Handle the PDF upload form and list PubChem matches for its terms.

    On POST the uploaded file ``rawtext`` is read with PyPDF2; when no text
    can be extracted the upload is OCR'ed through textract/tesseract instead.
    Tokens that are not in the NLTK English word list and start with an
    upper-case letter are treated as candidate chemical names; the first 50
    unique candidates are looked up on PubChem by name.

    :return: The rendered ``index.html`` with ``final_summary`` set to a list
        of ``(token, first_synonym)`` tuples. For non-POST requests nothing
        is returned (unchanged from the original behaviour).
    """
    if request.method == 'POST':
        rawtext = request.files['rawtext']
        words = set(nltk.corpus.words.words())

        reader = PyPDF2.PdfFileReader(rawtext)
        text = "".join(
            reader.getPage(page_no).extractText()
            for page_no in range(reader.numPages)
        )

        if text == "":
            # OCR fallback. textract needs a path on disk, so the upload is
            # spooled to a temporary file first. (The original passed the
            # builtin ``input`` function here, which could never work.)
            import os
            import tempfile

            fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
            os.close(fd)
            try:
                rawtext.stream.seek(0)  # PyPDF2 has already consumed the stream
                rawtext.save(tmp_path)
                text = textract.process(tmp_path, method='tesseract',
                                        language='eng')
            finally:
                os.remove(tmp_path)
            if isinstance(text, bytes):
                # The tokenizer below needs str, not bytes.
                text = text.decode("utf-8", "ignore")

        s = " ".join(w for w in nltk.wordpunct_tokenize(text)
                     if (w.lower() not in words and w.upper() not in words))
        candidates = [token for token in s.split() if token[0].isupper()]
        candidates = list(dict.fromkeys(candidates))  # de-duplicate, keep order

        freq = []
        for token in candidates[:50]:
            try:
                results = pc.get_cids(token, 'name')
                if not results:
                    continue
                c = pc.Compound.from_cid(results[0])
                freq.append((token, c.synonyms[0]))
            except Exception:
                # Best effort: a failed lookup just drops this candidate.
                continue
        return render_template('index.html', final_summary=freq)
def scrape_similar_rest(self, cids, threshold=90, max_records=10000):
    """
    Run a PubChem similarity search through ``pcp.get_cids`` for every CID
    in every category.

    :param cids: A dict {"category": [ids]}.
    :param threshold: Forwarded to ``pcp.get_cids`` as ``threshold``.
    :param max_records: Forwarded to ``pcp.get_cids`` as ``max_records``.
    :return: A dict {"category": {id: [matches]}}, where each id in each
        category is stored along with its matches (which take the form of
        CIDs).
    """
    output = {}
    queries_run = 0
    for cat, members in cids.items():
        hits_by_cid = output[cat] = {}
        for cid in members:
            # check_queries receives the running count and returns the
            # value to carry into the next iteration.
            queries_run = self.check_queries(queries_run)
            hits_by_cid[cid] = pcp.get_cids(
                cid,
                namespace="cid",
                domain="compound",
                searchtype="similarity",
                threshold=threshold,
                max_records=max_records,
            )
    return output
def get_structure(code, cutoff, dir_search, BRENDA_PARSER):
    """Download the PubChem JSON record for the chosen substrate of ``code``.

    The proteins for ``code`` are fetched from ``BRENDA_PARSER``,
    ``choose_substrate`` selects a substrate, its first CID is looked up on
    PubChem and the record is saved as ``<dir_search>/<CID>.json``.

    :param code: Identifier passed to ``BRENDA_PARSER.get_proteins``.
    :param cutoff: Size limit forwarded to ``check_size_of_substrate``; a
        falsy value disables the size check.
    :param dir_search: Directory the JSON file is written to.
    :param BRENDA_PARSER: Object providing ``get_proteins(code)``.
    :return: ``True`` when the record was downloaded; ``False`` when no
        substrate or CID was found, the size check failed, or the download
        raised.
    """
    proteins = BRENDA_PARSER.get_proteins(code)
    substrate, _counted = choose_substrate(proteins)
    if substrate is None:
        print("No suitable substrate found, skip..")
        return False
    print("\nmost common: ", substrate)

    try:
        CID = pcp.get_cids(substrate, 'name', 'substance',
                           list_return='flat')[0]
    except Exception:
        # Covers an empty result (IndexError) as well as request errors.
        print("CID not found..")
        return False

    # NOTE(review): the CID (not the substrate name) is handed to
    # check_size_of_substrate - confirm that is what the helper expects.
    if cutoff and not check_size_of_substrate(CID, cutoff):
        return False

    out_path = f'{dir_search}/{CID}.json'
    try:
        pcp.download('JSON', out_path, CID, 'cid')
    except Exception:
        return False
    return True
def search_pubchem(formula, output_file=None, timeout=999):
    """Collect the unique structures PubChem lists for a molecular formula.

    CIDs matching ``formula`` are fetched with PubChemPy, then their InChIKey
    and canonical SMILES are requested from PUG-REST in batches of 100 CIDs.
    Records that share a canonical SMILES are merged into one row whose
    ``PubChem`` field lists all their CIDs, separated by ``', '``.

    A space-separated, header-less ``index SMILES`` table is also written to
    the path returned by ``check_output_file(output_file)``.

    :param formula: Molecular formula to search for.
    :param output_file: Passed through ``check_output_file`` to obtain the
        output path.
    :param timeout: Timeout in seconds for each HTTP request.
    :return: ``pandas.DataFrame`` with columns ``InChIKey``, ``SMILES`` and
        ``PubChem``.
    """
    output_file = check_output_file(output_file)

    # get pubchem cid based on formula
    cids = pc.get_cids(formula, 'formula', list_return='flat')

    smiles = []
    inchikey = []
    all_cids = []
    row_of_smiles = {}  # SMILES -> row index, replaces a linear scan per hit

    batch_size = 100
    for start in range(0, len(cids), batch_size):
        batch = cids[start:start + batch_size]
        url = ("http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
               + ",".join(str(cid) for cid in batch)
               + "/property/InChIKey,CanonicalSMILES/JSON")
        response = requests.get(url, timeout=timeout)
        # Decode the JSON payload directly; routing it through an HTML
        # parser first could alter the text before it is parsed.
        properties = response.json()['PropertyTable']['Properties']

        for record in properties:
            record_smiles = record['CanonicalSMILES']
            row = row_of_smiles.get(record_smiles)
            if row is None:
                row_of_smiles[record_smiles] = len(smiles)
                smiles.append(record_smiles)
                inchikey.append(record['InChIKey'])
                all_cids.append(str(record['CID']))
            else:
                all_cids[row] = all_cids[row] + ', ' + str(record['CID'])

    result = pd.DataFrame({'InChIKey': inchikey, 'SMILES': smiles,
                           'PubChem': all_cids})
    output = pd.DataFrame({'ID': result.index, 'Smiles': result['SMILES']})
    output.to_csv(output_file, header=False, index=False, sep=' ')
    return result
def get_cid(dataframe, source_column, new_column):
    """Add a column holding the first PubChem CID for each chemical name.

    Every value of ``source_column`` is searched on PubChem by 'name'. The
    first CID returned is stored in ``new_column``; rows without any hit get
    the string "no cid found". The dataframe is modified in place and also
    returned.
    """
    # One PubChem name lookup per row, in row order.
    lookups = [
        pcp.get_cids(row[source_column], 'name', list_return='flat')
        for _, row in dataframe.iterrows()
    ]

    # Keep only the first CID of each lookup.
    dataframe[new_column] = [
        hits[0] if len(hits) >= 1 else "no cid found"
        for hits in lookups
    ]
    return dataframe
def query_similar(ligand_path):
    """Find PubChem compounds similar to the ligand stored in a PDB file.

    The ligand is converted to SMILES with the external ``obabel`` command
    and the SMILES is used for a PubChem similarity search.

    :param ligand_path: Path of the ligand PDB file.
    :return: Flat list of CIDs returned by the similarity search. (The
        original computed this list but discarded it.)
    """
    import subprocess

    # Argument list instead of a shell string: paths containing spaces or
    # shell metacharacters are passed to obabel verbatim.
    completed = subprocess.run(
        ['obabel', '-ipdb', str(ligand_path), '-osmi'],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    # obabel prints "<smiles>\t<title>"; keep only the SMILES field.
    smiles = completed.stdout.split('\t')[0].strip()
    cids = pcp.get_cids(smiles, 'smiles', searchtype='similarity',
                        list_return='flat')
    return cids
def get_molecular_weights(model, medium):
    """Look up PubChem molecular weights for the metabolites of a medium.

    Alternative to using the xialab api; the weights come from PubChem via
    pubchempy.

    :param model: Metabolic model (read in from Cobrapy); must provide
        ``model.reactions.get_by_id(<id>).name``.
    :param medium: List of exchange reaction ids, formatted like
        ``['EX_Lcyst(e)', 'EX_Lcystin(e)', 'EX_ade(e)', ...]``.
    :return: DataFrame with the columns ``["query", "BiGG_ID", "hit",
        "mol_weight"]``: ``query`` is the substance name searched for,
        ``BiGG_ID`` the medium id without its ``EX_`` prefix, ``hit`` the
        first CID found and ``mol_weight`` its molecular weight. ``hit`` and
        ``mol_weight`` are ``"NA"`` when no compound could be resolved.
    """
    import pubchempy as pcp
    import ssl

    # SECURITY NOTE(review): this turns off HTTPS certificate verification
    # for the whole process. Kept because the original relied on it as a
    # workaround, but it should be removed once certificates are fixed.
    ssl._create_default_https_context = ssl._create_unverified_context

    suffix = "exchange"
    prefix = "EX_"
    columns = ["query", "BiGG_ID", "hit", "mol_weight"]

    rows = []
    for compound_id in medium:
        query = model.reactions.get_by_id(compound_id).name
        # Remove the literal suffix/prefix. str.rstrip/lstrip treat their
        # argument as a *set of characters* and would also eat letters that
        # belong to the name itself.
        if query.endswith(suffix):
            query = query[:-len(suffix)].rstrip()
        if compound_id.startswith(prefix):
            bigg_id = compound_id[len(prefix):]
        else:
            bigg_id = compound_id

        # The query is a substance name; get_cids defaults to the 'cid'
        # namespace, so the namespace has to be given explicitly.
        cids = pcp.get_cids(query, 'name')
        try:
            hit = cids[0]
            mol_weight = pcp.Compound.from_cid(hit).molecular_weight
        except Exception:
            # No CID for this name, or the compound record was unavailable.
            hit = "NA"
            mol_weight = "NA"

        rows.append({'query': query, 'BiGG_ID': bigg_id,
                     'hit': hit, 'mol_weight': mol_weight})

    # Built in one go; DataFrame.append no longer exists in pandas 2.
    return pd.DataFrame(rows, columns=columns)
def change_smiles_pubchem_fingerprint(smiles_list):
    """Convert SMILES strings into PubChem CACTVS fingerprints.

    For each SMILES the CIDs are looked up with pubchempy, the resulting
    compound's ``cactvs_fingerprint`` is read, and every character of its
    string form is converted with ``int``.

    :param smiles_list: list of smiles strings
    :return: list with one list of ints per input SMILES
    """
    import pubchempy as pcp

    def fingerprint_bits(smiles):
        cids = pcp.get_cids(identifier=smiles, namespace="smiles")
        fingerprint = pcp.Compound.from_cid(cids).cactvs_fingerprint
        return [int(bit) for bit in str(fingerprint)]

    return [fingerprint_bits(smiles) for smiles in smiles_list]
def change_smiles_pubchem_fingerprint(smiles_list):
    """Return the PubChem CACTVS fingerprint of every SMILES as ints.

    Uses pubchempy: the SMILES is resolved to its CIDs, the compound is
    fetched, and each character of ``str(cactvs_fingerprint)`` is turned
    into an ``int``.

    :param smiles_list: list of smiles strings
    :return: list of lists of ints, in input order
    """
    import pubchempy as pcp

    encoded = []
    for smiles in smiles_list:
        matches = pcp.get_cids(identifier=smiles, namespace="smiles")
        compound = pcp.Compound.from_cid(matches)
        encoded.append(list(map(int, str(compound.cactvs_fingerprint))))
    return encoded
def searchFormulaInfo(formula):
    """Build a LaTeX-flavoured description of the compounds named ``formula``.

    Only CIDs returned by both the 'name' search and the 'formula' search
    are described. For each of them the text mentions the IUPAC name (or the
    first synonym when there is none), the molecular formula, up to three or
    four alternative names and, when set, complexity, exact mass, molecular
    weight and monoisotopic mass.

    :param formula: Text searched both as a name and as a formula.
    :return: The concatenated description, ``''`` when nothing matched.
    """
    by_name = set(pcp.get_cids(formula, 'name'))
    shared = by_name.intersection(pcp.get_cids(formula, 'formula'))

    # (sentence start, Compound attribute) pairs, in output order.
    numeric_facts = (
        ('Its complexity has value ', 'complexity'),
        ('The exact mass is ', 'exact_mass'),
        ('The molecular weight is ', 'molecular_weight'),
        ('The monoisotopic mass is ', 'monoisotopic_mass'),
    )

    pieces = []
    for cid in shared:
        c = pcp.Compound.from_cid(cid)
        name = c.iupac_name
        alternatives = c.synonyms[:4]

        # Fall back to the first synonym, and never list the chosen name
        # among the alternatives.
        if not name and alternatives:
            name = alternatives[0]
        if name in alternatives:
            alternatives.remove(name)

        if name:
            pieces.append('This formula has the IUPAC name \\textbf{' + name + '}')
            if c.molecular_formula:
                pieces.append(' and the molecular formula \\textbf{'
                              + c.molecular_formula + '}')
            pieces.append('. ')

        if len(alternatives) > 1:
            leading = ', '.join(str(n) for n in alternatives[:-1])
            pieces.append('Some alternative names are ' + leading
                          + ' and ' + alternatives[-1] + '. ')
        elif len(alternatives) == 1:
            pieces.append('An alternative name is ' + alternatives[0] + '. ')

        for sentence_start, attribute in numeric_facts:
            value = getattr(c, attribute)
            if value:
                pieces.append(sentence_start + str(value) + '. ')

    return ''.join(pieces)
def convert(input):
    """Extract candidate chemical names from a PDF and resolve them on PubChem.

    The PDF text is read with PyPDF2; when nothing can be extracted the file
    is OCR'ed through textract/tesseract instead. Tokens that are not in the
    NLTK English word list and start with an upper-case letter are treated
    as candidates; the first 10 unique candidates are looked up by name.

    :param input: Path of the PDF file.
    :return: Dict mapping each resolved token to the first synonym of its
        PubChem compound. Tokens whose lookup fails are left out.
    """
    words = set(nltk.corpus.words.words())

    # Pages are read lazily, so extraction must happen while the file is
    # open; the ``with`` block also closes it if PyPDF2 raises.
    with open(input, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)
        text = "".join(
            reader.getPage(page_no).extractText()
            for page_no in range(reader.numPages)
        )

    if text == "":
        text = textract.process(input, method='tesseract', language='eng')
        if isinstance(text, bytes):
            # The tokenizer below needs str, not bytes.
            text = text.decode("utf-8", "ignore")

    s = " ".join(w for w in nltk.wordpunct_tokenize(text)
                 if (w.lower() not in words and w.upper() not in words))
    candidates = [token for token in s.split() if token[0].isupper()]
    candidates = list(dict.fromkeys(candidates))  # de-duplicate, keep order

    freq = {}
    for token in candidates[:10]:
        try:
            results = pc.get_cids(token, 'name')
            if not results:
                continue
            c = pc.Compound.from_cid(results[0])
            freq[token] = c.synonyms[0]
        except Exception:
            # Best effort: a failed lookup just drops this candidate.
            continue
    return freq
def check_size_of_substrate(name, cutoff):
    """Check that a substrate has at most ``cutoff`` non-hydrogen atoms.

    ``name`` is searched on PubChem (namespace 'name', domain 'substance')
    and the atoms of the first hit are counted, ignoring hydrogens.

    :param name: Substrate identifier passed to the PubChem search.
    :param cutoff: Maximum allowed number of non-hydrogen atoms.
    :return: ``True`` when the count is within ``cutoff``; ``False`` when it
        exceeds it, nothing was found, or the search raised.
    """
    try:
        p = pcp.get_cids(name, 'name', 'substance', list_return='flat')
    except Exception:
        print("substrate not found..")
        return False
    if len(p) == 0:
        return False

    c = pcp.Compound.from_cid(p[0])
    c = c.to_dict(properties=['atoms', 'bonds', 'inchi'])

    # Compare by value: ``is not 'H'`` tested object identity, which is not
    # a reliable way to compare strings.
    heavy_atoms = sum(1 for atom in c['atoms'] if atom['element'] != 'H')
    if heavy_atoms > cutoff:
        return False
    return True
def get_cid(identifier: str, kind: str = None, verbose: bool = True,
            fix_smiles_on_error: bool = True, attempt=0) -> int:
    """
    Resolve an identifier (chemical name, CAS, SMILES, ...) to a PubChem CID.

    Greek letters alpha-delta are spelled out before the lookup. When
    ``kind`` is not given it is determined with ``get_kind``. A bad-request
    error is logged and treated as "no hit"; any other PubChem HTTP error is
    retried once after a 10 second pause and re-raised if it happens again.
    When a SMILES lookup yields nothing and ``fix_smiles_on_error`` is set,
    the lookup is repeated once with ``canonical_smiles(identifier)``.

    :return: The first CID found, or 0 when there is none (also 0 for a NaN
        ``identifier``).
    """
    if isinstance(identifier, float) and np.isnan(identifier):
        return 0

    greek_to_latin = {'α': 'alpha', 'β': 'beta', 'γ': 'gamma', 'δ': 'delta'}
    for greek, latin in greek_to_latin.items():
        identifier = identifier.replace(greek, latin)

    kind = get_kind(identifier) if kind is None else kind.lower()

    try:
        hits = pcp.get_cids(identifier, namespace=kind)
    except pcp.BadRequestError:
        logger.warning('Request Error for "%s"' % identifier)
        hits = []
    except pcp.PubChemHTTPError:
        if attempt != 0:
            raise
        # First failure: wait, then try exactly once more.
        import time
        time.sleep(10)
        return get_cid(identifier, kind, verbose, fix_smiles_on_error, 1)

    if len(hits) > 1 and verbose:
        logger.warning("Multiple CIDs for %s: %s" % (identifier, hits))
    cid = hits[0] if len(hits) else 0

    if cid or kind != "smiles" or not fix_smiles_on_error:
        return cid

    # Retry with canonical SMILES
    identifier = canonical_smiles(identifier)
    if identifier:
        cid = get_cid(identifier, kind=kind, verbose=verbose,
                      fix_smiles_on_error=False)
    return cid
def retrieve(
    name: Optional[str] = None,
    smiles: Optional[str] = None,
    inchi: Optional[str] = None,
    inchikey: Optional[str] = None,
) -> mtr.Structure:
    """Fetch a structure from PubChem for the first identifier provided.

    The identifiers are considered in the order name, SMILES, InChI,
    InChIKey; the first one that is not ``None`` is used. The 3D record of
    the first CID found is preferred; when PubChem has no 3D record the
    structure is generated from the isomeric SMILES instead.

    :raises ValueError: When no identifier is given or no CID can be found.
    """
    candidates = (
        (name, "name"),
        (smiles, "smiles"),
        (inchi, "inchi"),
        (inchikey, "inchikey"),
    )
    for identifier, identifier_type in candidates:
        if identifier is not None:
            break
    else:
        raise ValueError(
            "Provide name, SMILES, InChi, or InChiKey to retrieve structure."
        )

    try:
        # Only the first returned compound is used, on the assumption that
        # it is the "most relevant" one. Unpacking an empty result raises
        # ValueError, which is handled together with a zero CID below.
        cid, *_ = pcp.get_cids(identifier, identifier_type)
        if cid == 0:
            raise ValueError
    except (ValueError, OSError):
        raise ValueError(f"Structure retrieval for {identifier} failed.")

    try:
        compound = pcp.Compound.from_cid(cid, record_type="3d")
        return _structure_from_pubchem_compound(compound=compound)
    except pcp.NotFoundError:
        # No 3D structure on PubChem. A CID was found, so a 2D record must
        # exist; build the structure from its isomeric SMILES.
        [property_dict] = pcp.get_properties(
            properties="IsomericSMILES", identifier=cid, namespace="cid")
        return Structure.generate(smiles=property_dict["IsomericSMILES"])
def get_cid(
    identifier: str,
    kind: str = "name",
    verbose: bool = True,
    fix_smiles_on_error: bool = True
) -> int:
    """Resolve an identifier (chemical name, CAS, SMILES, ...) to a CID.

    A bad-request error from PubChem is logged and treated as "no hit".
    When a SMILES lookup yields nothing and ``fix_smiles_on_error`` is set,
    the lookup is repeated once with ``canonical_smiles(identifier)``.

    :return: The first CID found, or 0 when there is none.
    """
    kind = kind.lower()
    try:
        hits = pcp.get_cids(identifier, namespace=kind)
    except pcp.BadRequestError:
        logger.warning('Request Error for "%s"' % identifier)
        hits = []

    if len(hits) > 1 and verbose:
        logger.warning("Multiple CIDs for %s: %s" % (identifier, hits))
    cid = hits[0] if len(hits) else 0

    if cid or kind != "smiles" or not fix_smiles_on_error:
        return cid

    # Retry with canonical SMILES
    identifier = canonical_smiles(identifier)
    if identifier:
        cid = get_cid(identifier, kind=kind, verbose=verbose,
                      fix_smiles_on_error=False)
    return cid
def clean(self):
    """Validate the submitted IUPAC name against PubChem.

    Unless ``override_iupac`` is set, the name is searched on PubChem. The
    form is rejected when there is no hit, or when the IUPAC name of the
    first hit differs (case-insensitively) from the submitted one; in the
    latter case the error lists PubChem's suggestions.

    The check is skipped when ``iupac_name`` is absent from
    ``cleaned_data`` (i.e. the field already failed its own validation), so
    no lookup is attempted with ``None``.

    :return: ``self.cleaned_data``.
    :raises forms.ValidationError: When the name cannot be confirmed.
    """
    iupac_name = self.cleaned_data.get('iupac_name')
    override = self.cleaned_data.get('override_iupac')

    if not override and iupac_name:
        cids = pcp.get_cids(iupac_name, 'name')
        if not cids:
            raise forms.ValidationError(
                "No PubChem match on suggested iupac name. "
                "Are you sure this is a chemical? "
                "If so, check CAS, trivial name, "
                "molecular formula and then use override below. "
                "An email will be sent to notify admin.")

        # Fetch every hit once; the first one is reused below instead of
        # being requested again.
        compounds = [pcp.Compound.from_cid(cid) for cid in cids]
        first = compounds[0]

        # PubChem may have no IUPAC name for a record; treat that as a
        # mismatch rather than failing on None.
        registered_name = first.iupac_name or ""
        if registered_name.lower() != iupac_name.lower():
            out = "".join(compound.iupac_name + ", "
                          for compound in compounds if compound.iupac_name)
            trivial = first.synonyms[0] if first.synonyms else ""
            try:
                cas = str(chem.CAS_from_any(iupac_name))
            except ValueError:
                cas = "Non found"
            formula = first.isomeric_smiles or ""
            raise forms.ValidationError(
                "Iupac Name is not registered in Pubchem, try: " + out +
                ", suggested trivial name " + trivial + ", cas " + cas +
                ", suggested formula (SMILES) " + formula)
    return self.cleaned_data
def searchPubChem(self, searchterm='', filters=None, numresults=10,
                  randomized=True, save=True):
    '''
    Search PubChem for substructure matches of a SMILES and return the ids
    of the newly fetched molecules that pass all filters.

    CIDs already present in ``self.molecules`` are skipped. Every molecule
    that is fetched is saved with ``self.save`` before the filters run.

    Parameters:
        searchterm (str): SMILES used for the substructure search
        filters ([Filter]): filters to apply; each must provide
            ``check(mol)``. ``None`` (the default) means no filters.
        numresults (int): stop after this many passing molecules
        randomized (bool): shuffle the search results before processing
        save (bool): NOTE(review): currently unused - molecules are always
            saved; confirm the intended behaviour before honouring it.

    Returns:
        list of ``mol.id`` values of the molecules that passed
    '''
    # A shared mutable default ([]) is avoided; None stands for "no filters".
    active_filters = [] if filters is None else filters

    # Retrieve search results from PubChem
    searchcids = pcp.get_cids(searchterm, namespace='smiles',
                              searchtype='substructure', MaxRecords=10000,
                              record_type='3d')
    if randomized:
        random.shuffle(searchcids)

    results = []
    for cid in searchcids:
        if len(results) == numresults:
            break
        if str(cid) in self.molecules:
            continue

        print(f"fetching molecule {cid}")
        mol = Molecule.from_cid(cid)
        if mol is None:
            continue

        print("saving")
        self.save(mol)

        # all() stops at the first failing filter, like the original loop.
        if all(molfilter.check(mol) for molfilter in active_filters):
            print('passed')
            results.append(mol.id)
    return results
import math
import pubchempy as pcp
import matplotlib.pyplot as plt
import pandas as pd

# Query structure (SMILES) used for every similarity search below.
_QUERY_SMILES = 'CCCC1=NN(C2=C1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C'


def _similar_cids(threshold):
    """Run the similarity search at ``threshold`` and print the hit count."""
    hits = pcp.get_cids(_QUERY_SMILES, 'smiles', searchtype='similarity',
                        Threshold=threshold)
    print(f'len cids for {threshold}: {len(hits)}')
    return hits


cids_95 = _similar_cids(95)
cids_80 = _similar_cids(80)
cids_70 = _similar_cids(70)

# Plot the number of search results against the similarity threshold.
threshholds = [95, 80, 70]
threshholds_lengths = [len(cids_95), len(cids_80), len(cids_70)]
plt.plot(threshholds, threshholds_lengths)
""" import pubchempy as pcp import urllib.request import bs4 as BS products = {} with open('LabNetworkSearch.txt') as l: for line in l: information = line.split() productID = information[0] smile = information[1] products[productID] = {"SMILES": smile} try: getCID = str(pcp.get_cids(smile, 'smiles')) getCID = getCID.replace("[", "") getCID = getCID.replace("]", "") products[productID]["CID"] = getCID except: products[productID]["CID"] = "NA" print('Record completed, checking the next record') else: print('Record completed, checking the next record') print("CID added to the dictionary\n") partOne = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/' partTwo = '/XML?heading=LCSS' for product, product_info in products.items():
def get_cid_from_inchikey(inkey):
    """Search PubChem for an InChIKey.

    :return: The last CID of the result list, or ``None`` when the search
        returned nothing.
    """
    matches = pcp.get_cids(inkey, "inchikey")
    if not matches:
        return None
    return matches.pop()
#!/usr/bin/env python
import sys
import pubchempy as pcp

# Read the whitespace-separated drug table. The first line is skipped and
# rows with fewer than three fields are ignored.
with open("drugs.txt") as table:
    f = table.readlines()

drug_names = []
drug_smiles = []
drug_cns = []
for line in f[1:]:
    fields = line.split()
    if len(fields) > 2:
        drug_names.append(fields[0])
        drug_cns.append(fields[1])
        drug_smiles.append(fields[2].strip())

# Look up every drug name on PubChem and report the number of hits.
drug_cids = []
for drug_name in drug_names:
    cid_list = pcp.get_cids(drug_name, 'name', 'compound', list_return='flat')
    # Single parenthesised argument: prints the same text under Python 2
    # (print statement) and Python 3 (print function).
    print("%20s %5d hits" % (drug_name, len(cid_list)))
    drug_cids.append(cid_list)

# One line per drug: name, second column of the table, CID list, SMILES.
with open("drug_pubchem_index.txt", "w") as of:
    for row in zip(drug_names, drug_cns, drug_cids, drug_smiles):
        of.write("%20s, %6s, %s, %s\n" % row)
def getResults(query, queryType):
    """Return a ``pcp.Compound`` for every CID that matches ``query``.

    :param query: Identifier to search for.
    :param queryType: PubChem namespace of ``query``.
    :return: List of compounds, one per CID, in the order returned by the
        search (domain 'substance', flat list).
    """
    matching_cids = pcp.get_cids(query, queryType, 'substance',
                                 list_return='flat')
    return list(map(pcp.Compound.from_cid, matching_cids))
def pka_lookup_pubchem(identifier, namespace=None, domain='compound') -> Optional[str]:
    """Look up the pKa ("Dissociation Constants") of a compound on PubChem.

    The identifier is resolved to a CID with PubChemPy. For CAS, InChI and
    InChIKey identifiers the hit must match the input exactly. The pKa text
    and its reference are then read from the PUG-View XML record.

    Passing ``--debug``, ``--debug=True``, ``--debug=true`` or ``-d`` as the
    only command line argument switches the module-level ``debug`` flag on.

    :param identifier: Name, CAS number, SMILES, InChI or InChIKey.
    :param namespace: Kind of ``identifier``; when falsy it is determined
        with ``classify``. ``'cas'`` is searched in the 'name' namespace.
    :param domain: Not used by this function.
    :return: On success a dict with the keys ``source``, ``Pubchem_CID``,
        ``pKa``, ``reference``, ``Substance_CASRN`` plus the properties
        returned by PubChem (InChI, InChIKey, ``Canonical_SMILES``,
        ``Isomeric_SMILES``, ``IUPAC_Name``). ``None`` on any failure.
        NOTE(review): the annotation says ``Optional[str]`` but a dict is
        returned; left unchanged for interface stability.
    """
    global debug
    if len(sys.argv) == 2 and sys.argv[1] in ['--debug=True', '--debug=true', '--debug', '-d']:
        debug = True

    # Identify lookup source (Pubchem in this case)
    lookup_source = 'Pubchem'

    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}

        # Resolve the identifier to a CID. The lookup returns a list and is
        # an exact match by default.
        cids = []
        identifier_type = ''

        if not namespace:
            identifier_type = classify(identifier)
            # If the input is inchi, inchikey or smiles (this could be a
            # false smiles) search in that namespace, otherwise by name.
            if identifier_type in ['smiles', 'inchi', 'inchikey']:
                lookup = pcp.get_cids(identifier, namespace=identifier_type)
            else:
                lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])
        elif namespace == 'cas':
            cids = pcp.get_cids(identifier, namespace='name')
        else:
            cids = pcp.get_cids(identifier, namespace=namespace)

        # NOTE(review): the nesting of this fallback was reconstructed from
        # flattened source - confirm it applies to all three branches above.
        if not cids:
            lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])
            identifier_type = namespace

        # An empty list means PubChem does not know this chemical.
        if len(cids) == 0:
            raise RuntimeError('Compound not found in Pubchem.')

        # Use the first result of the list.
        cid = cids[0]
        exact_match = True

        synonyms = pcp.get_synonyms(cid)[0]['Synonym'] or []

        # Extract the CAS number from the list of synonyms.
        returned_cas = ''
        for synonym in synonyms:
            cas_nr = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
            if cas_nr:
                returned_cas = cas_nr.group()
                break

        lookup_result = pcp.get_properties(['inchi', 'inchikey',
                                            'canonical_smiles', 'isomeric_smiles',
                                            'iupac_name'],
                                           cid)

        if identifier_type == 'cas':
            # Double check the CAS number: it must be among the synonyms.
            exact_match = identifier in synonyms
        elif identifier_type == 'inchi':
            exact_match = (identifier == lookup_result[0].get('InChI', False))
        elif identifier_type == 'inchikey':
            exact_match = (identifier == lookup_result[0].get('InChIKey', False))

        if not exact_match:
            if debug:
                print(f'Exact match between input and Pubchem return value? {identifier in synonyms}')
            raise ValueError('This is not an exact match on Pubchem!')

        # PUG-View URL of the pKa section. 'XML' can be replaced with 'JSON'
        # but that is harder to parse later on. Output types:
        # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
        pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

        r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)

        # Require an OK status (200) that was not reached via a redirect.
        if not (r.status_code == 200 and len(r.history) == 0):
            raise RuntimeError('pKa not found in Pubchem.')

        tree = ET.fromstring(r.text)

        # Get the XML tree of <Information> only
        info_node = tree.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}Information')

        # Get the pKa reference:
        original_source = info_node.find('{http://pubchem.ncbi.nlm.nih.gov/pug_view}Reference').text
        # Get the pKa result:
        pka_result = info_node.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}String').text
        pka_result = re.sub(r'^pKa = ', '', pka_result)  # remove 'pka = ' part out of the string answer

        core_result = {
            'source': lookup_source,
            'Pubchem_CID': str(cid),
            'pKa': pka_result,
            'reference': original_source,
            'Substance_CASRN': returned_cas,
        }
        extra_info = lookup_result[0]
        extra_info.pop('CID', None)  # Remove 'CID': ... from lookup_result[0]

        # Merge 2 dict: https://treyhunner.com/2016/02/how-to-merge-dictionaries-in-python/
        result = {**core_result, **extra_info}
        # Rename some keys in the dict
        s = pd.Series(result)
        s = s.rename({
            'CanonicalSMILES': 'Canonical_SMILES',
            'IsomericSMILES': 'Isomeric_SMILES',
            'IUPACName': 'IUPAC_Name'
        })
        result = s.to_dict()
        return result

    except Exception as error:
        if debug:
            # Positional arguments: the ``etype`` keyword no longer exists on
            # Python 3.10+, the positional form works on every version.
            traceback_str = ''.join(traceback.format_exception(
                type(error), error, error.__traceback__))
            print(traceback_str)
        return None
def _mol_download_is_invalid(download_file):
    """Return True when the saved file looks binary or is an empty mol file."""
    # Read the probe bytes inside ``with`` so the handle is always closed.
    with open(download_file, 'rb') as handle:
        head = handle.read(1024)
    return is_binary_string(head) or is_empty_mol_file(download_file)


def extract_mol_from_pubchem(cas_nr):
    """Download the mol (SDF) file for a CAS number from PubChem.

    The file is stored as ``<download_path>/<cas_nr>.mol``. Compounds are
    tried first; when no compound is found, the substances whose synonyms
    contain the CAS number are tried instead.

    :param cas_nr: CAS number, also used as the file name stem.
    :return: ``-1`` when a non-empty file already exists, ``0`` when a file
        was downloaded successfully, and ``cas_nr`` itself when nothing
        usable could be downloaded (including any error).
    """
    global download_path
    headers = {
        'user-agent':
        'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }

    try:
        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Nothing to do when a non-empty file is already there; checked
        # before any network request is made.
        if download_file.exists() and os.stat(download_file).st_size != 0:
            return -1

        # By default this is an exact match; the result is a list, empty
        # when PubChem has no compound for the CAS number.
        cids = pcp.get_cids(cas_nr, 'name')

        if len(cids) > 0:
            # Use the first result of the list.
            cid = cids[0]

            # Double check the CAS number: it must be among the synonyms of
            # the compound that was found.
            synonyms = pcp.get_synonyms(cid)[0]['Synonym']
            if cas_nr not in synonyms:
                raise ValueError('\tThis is not an exact match!')

            # PUG-REST URL of the SDF record for this CID.
            get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                cid)
            r = requests.get(get_sdf_url, headers=headers, timeout=15)

            # Require an OK status (200) that was not reached via a redirect.
            if r.status_code == 200 and len(r.history) == 0:
                download_file.write_text(data=r.text)
                if _mol_download_is_invalid(download_file):
                    os.remove(download_file)  # remove the error mol file
                    return cas_nr
                return 0
        else:
            # No compound: try to find substances as well.
            # pcp.get_substances returns a list of Substances if found, ref:
            # https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L328
            substances = pcp.get_substances(cas_nr, 'name')
            if len(substances) == 0:
                raise ValueError(
                    'Could not find any compounds or substances with this CAS {} on Pubchem.'
                    .format(cas_nr))

            for substance in substances:
                # Only accept substances that list the same CAS number.
                if cas_nr not in substance.synonyms:
                    continue
                sdf = pcp.get_sdf(identifier=substance.sid,
                                  namespace='sid',
                                  domain='substance')
                if not sdf:  # pcp.get_sdf returns None if no SDF is found
                    continue
                download_file.write_text(data=sdf)
                if _mol_download_is_invalid(download_file):
                    os.remove(download_file)  # remove the error mol file
                else:
                    return 0

        # No usable file was obtained (bad HTTP status, or no substance with
        # a matching CAS number and a valid SDF): report the CAS number.
        return cas_nr

    except Exception as error:
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))
        return cas_nr
def get_pubchem_id(name):
    """Return the first PubChem CID for a compound name.

    This is a best-effort helper: any lookup failure (no hit, HTTP error,
    ...) yields an empty string instead of raising.

    :param name: Compound name to search for.
    :return: The first CID, or ``""`` when nothing was found or the request
        failed.
    """
    try:
        cpd_id = pcp.get_cids(name, "name")
        return cpd_id[0]
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` this
        # lets KeyboardInterrupt/SystemExit propagate.
        return ""
'isomeric_smiles', 'inchi', 'iupac_name', 'exact_mass')) tup_list = [item for item in legend.items()] lege = '' for tup in tup_list: lege += f'{str(tup)}\n' print(lege) image = MolsToImage([mol_formated_molecule], subImgSize=(1200, 800), fitimage=True, legends=[lege]) image.show() request = input('Search: molecule\n') responce = pcp.get_cids(request, 'name') if len(responce) < 1: print("I couldn't find what you were looking for") elif len(responce) == 1: print('Searching') compound = pcp.Compound.from_cid(responce[0]) describe_me(compound) else: print('Compound : CID') for responces in responce: print({pcp.Compound.from_cid(responces).synonyms[0]: responces}) choice = input('Select a CID to search\n') try: int(choice) compound = pcp.Compound.from_cid(int(choice))
# For every compound name: resolve it on PubChem, save its PNG depiction
# under images/ and report which functional groups were detected.
for cmp in cmps:
    print(cmp)

    # Only the first CID returned for the name is used.
    cid = pcp.get_cids(cmp, 'name')[0]
    c = pcp.Compound.from_cid(cid)
    print(c.cid)

    # File name: the compound name with spaces replaced by underscores.
    png_path = 'images/' + cmp.replace(" ", "_") + '.png'
    pcp.download('PNG', png_path, c.cid, 'cid', overwrite=True)

    m = Chem.MolFromInchi(c.inchi)
    print("Alcohol: ", id_fg.is_alcohol(m))
    print("COOH: ", id_fg.is_cooh(m))