def workOnCMG(CMGName):
    """Collect CASRNs and IUPAC names for every CID listed for one CMG.

    Reads one CID per line from ``CIDS_CMG/CIDS_<CMGName>.txt``, queries
    ``pcp.get_synonyms`` in batches and appends rows of the form
    ``CMGName,CASRN,IUPAC name`` to ``CIDS_Results/<CMGName>.csv``.

    :param CMGName: (str) name used to build both file paths and written as
        the first column of every output row.
    """
    source_filename = 'CIDS_CMG/CIDS_%s.txt' % (CMGName)
    target_filename = 'CIDS_Results/%s.csv' % (CMGName)
    # Compiled once; raw string so that ``\d`` is not an invalid str escape.
    casrn_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    def batchIndexes(list_size, batch_size=300):
        """Return half-open (begin, end) slice bounds covering ``list_size`` items.

        Batching avoids timeouts with pcp.get_synonyms. The bounds are meant
        to be used directly as ``items[begin:end]``; the previous inclusive
        bounds silently dropped the last item of the list when sliced.
        """
        return [(begin, min(begin + batch_size, list_size))
                for begin in range(0, list_size, batch_size)]

    def cidsList(source_filename):
        """Return the lines of ``source_filename`` with newlines removed."""
        with open(source_filename, 'r') as master:
            return [line.replace('\n', '') for line in master]

    cids = cidsList(source_filename)

    # ``with`` guarantees the results file is closed even if a request fails.
    with open(target_filename, 'a') as findings:
        for index in batchIndexes(len(cids)):
            # Single pre-formatted argument: prints identically on Python 2 and 3.
            print("Processing pcp.get_synonyms with cids batch  %s" % (index,))
            results = pcp.get_synonyms(cids[index[0]:index[1]])
            print("Finding CASRN matches in the synonyms ...")
            for result in results:
                synonyms = result.get('Synonym', [])
                if not synonyms:
                    # Nothing would be written, so skip the compound lookup.
                    continue
                # Fetched once per compound instead of once per synonym.
                iupac_name = pcp.Compound.from_cid(result.get('CID')).iupac_name
                for syn in synonyms:
                    match = casrn_pattern.match(syn)
                    if not match and not iupac_name:
                        continue
                    casrn = match.group(1) if match else ''
                    findings.write(CMGName + "," + casrn + "," + (iupac_name or '') + '\n')
def workOnCMG(CMGName):
    """Collect CASRNs and IUPAC names for every CID listed for one CMG.

    Reads one CID per line from ``CIDS_CMG/CIDS_<CMGName>.txt``, queries
    ``pcp.get_synonyms`` in batches and appends rows of the form
    ``CMGName,CASRN,IUPAC name`` to ``CIDS_Results/<CMGName>.csv``.

    :param CMGName: (str) name used to build both file paths and written as
        the first column of every output row.
    """
    source_filename = 'CIDS_CMG/CIDS_%s.txt' % (CMGName)
    target_filename = 'CIDS_Results/%s.csv' % (CMGName)
    # Compiled once; raw string so that ``\d`` is not an invalid str escape.
    casrn_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')

    def batchIndexes(list_size, batch_size=300):
        """Return half-open (begin, end) slice bounds covering ``list_size`` items.

        Batching avoids timeouts with pcp.get_synonyms. The bounds are meant
        to be used directly as ``items[begin:end]``; the previous inclusive
        bounds silently dropped the last item of the list when sliced.
        """
        return [(begin, min(begin + batch_size, list_size))
                for begin in range(0, list_size, batch_size)]

    def cidsList(source_filename):
        """Return the lines of ``source_filename`` with newlines removed."""
        with open(source_filename, 'r') as master:
            return [line.replace('\n', '') for line in master]

    cids = cidsList(source_filename)

    # ``with`` guarantees the results file is closed even if a request fails.
    with open(target_filename, 'a') as findings:
        for index in batchIndexes(len(cids)):
            # Single pre-formatted argument: prints identically on Python 2 and 3.
            print("Processing pcp.get_synonyms with cids batch  %s" % (index,))
            results = pcp.get_synonyms(cids[index[0]:index[1]])
            print("Finding CASRN matches in the synonyms ...")
            for result in results:
                synonyms = result.get('Synonym', [])
                if not synonyms:
                    # Nothing would be written, so skip the compound lookup.
                    continue
                # Fetched once per compound instead of once per synonym.
                iupac_name = pcp.Compound.from_cid(result.get('CID')).iupac_name
                for syn in synonyms:
                    match = casrn_pattern.match(syn)
                    if not match and not iupac_name:
                        continue
                    casrn = match.group(1) if match else ''
                    findings.write(CMGName + "," + casrn + "," + (iupac_name or '') + '\n')
def get_cas_pcp(cid):
    """Extract the CAS number from the PubChem synonyms of a compound.

    The synonyms returned by ``pcp.get_synonyms(cid, 'cid')`` are scanned in
    order and the first one that starts with a CAS-like pattern wins.

    :param cid: (int) CID from PubChem.
    :return: cas: (str) CAS Registry Number, or an empty string when no
        synonym matches.
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    cas_pattern = re.compile(r'(\d{2,7}-\d\d-\d)')
    for result in pcp.get_synonyms(cid, 'cid'):
        for syn in result.get('Synonym', []):
            match = cas_pattern.match(syn)
            if match:
                return match.group(1)
    return ''
Entrez.email = '*****@*****.**' handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids) results = Entrez.read(handle) return results for i in range(110,729): text = '' count = 0 for id in mydf[mydf[str(i)]==1].sample(frac=1).iterrows(): # print(id[1]['0_x']) try: k = pcp.get_synonyms(id[1]['0_x'], 'inchi') except: k = [] try: pubmedIDs = search(k[0]['Synonym'][0]) papers = fetch_details(pubmedIDs['IdList']) count += 1 except: print('NO RESULTS') papers = {} try: for x in papers['PubmedArticle']: print('#############################################')
def _mol_file_is_invalid(mol_path):
    """Return True when the mol file at ``mol_path`` looks binary or is an empty mol file."""
    # Read the probe bytes through a context manager so the handle is closed.
    with open(mol_path, 'rb') as mol_file:
        leading_bytes = mol_file.read(1024)
    return is_binary_string(leading_bytes) or is_empty_mol_file(mol_path)


def extract_mol_from_pubchem(cas_nr):
    """Download the structure file for a CAS number from PubChem.

    The file is written to ``<download_path>/<cas_nr>.mol``. Compounds are
    tried first (SDF fetched over PUG REST); when no compound is found for
    the CAS number, PubChem substances carrying that CAS number as a synonym
    are tried through ``pcp.get_sdf``.

    :param cas_nr: (str) CAS number used both as the PubChem name query and
        as the file name stem.
    :return: ``-1`` when a non-empty file already exists, ``0`` when a valid
        file was downloaded, otherwise ``cas_nr`` (lookup failed, no exact
        match, bad download, or any exception).
    """
    global download_path
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }

    try:
        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Checked before any network request: an existing non-empty file
        # needs no PubChem lookup at all.
        if download_file.exists() and os.stat(download_file).st_size != 0:
            return -1

        # The name search returns a list of CIDs; an empty list means PubChem
        # has no compound for this CAS number.
        cid = pcp.get_cids(cas_nr, 'name')

        if len(cid) > 0:
            # Use the first hit only.
            cid = cid[0]

            # Double check the hit: the CAS number must be among its synonyms.
            synonyms = pcp.get_synonyms(cid)[0]['Synonym']
            if cas_nr not in synonyms:
                raise ValueError('\tThis is not an exact match!')

            get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                cid)
            r = requests.get(get_sdf_url, headers=headers, timeout=15)

            # Accept only a direct OK answer (status 200 and no redirect).
            if r.status_code == 200 and len(r.history) == 0:
                download_file.write_text(data=r.text)
                if _mol_file_is_invalid(download_file):
                    os.remove(download_file)  # remove the error mol file
                    return cas_nr
                return 0
        else:
            # No compound found: try substances as well.
            # Ref: https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L328
            substances = pcp.get_substances(cas_nr, 'name')
            if len(substances) == 0:
                raise ValueError(
                    'Could not find any compounds or substances with this CAS {} on Pubchem.'
                    .format(cas_nr))

            for substance in substances:
                # Only substances that list the same CAS number qualify.
                if cas_nr not in substance.synonyms:
                    continue

                sdf = pcp.get_sdf(identifier=substance.sid,
                                  namespace='sid',
                                  domain='substance')
                # Per the original note, pcp.get_sdf returns None when no SDF is found.
                if not sdf:
                    continue

                download_file.write_text(data=sdf)
                if _mol_file_is_invalid(download_file):
                    os.remove(download_file)  # remove the error mol file
                else:
                    return 0
    except Exception as error:
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))

    # Every path that did not produce a usable file reports the CAS number
    # back, including a non-200 response and substances without a valid SDF.
    return cas_nr
def pka_lookup_pubchem(identifier, namespace=None, domain='compound') -> Optional[dict]:
    """Look up the pKa ("Dissociation Constants" heading) of a compound on PubChem.

    :param identifier: value to search for (name, CAS number, SMILES, InChI, ...).
    :param namespace: kind of ``identifier``. When falsy, ``classify(identifier)``
        decides; ``'cas'`` is searched through the PubChem ``'name'`` namespace;
        any other value is passed to ``pcp.get_cids`` unchanged.
    :param domain: accepted for interface compatibility; not used by this function.
    :return: dict with the keys ``source``, ``Pubchem_CID``, ``pKa``,
        ``reference`` and ``Substance_CASRN`` plus the properties returned by
        ``pcp.get_properties`` (SMILES/IUPAC keys renamed, ``CID`` removed),
        or ``None`` when the lookup fails for any reason.
    """
    global debug

    # Debug mode can be switched on from the command line of the running script.
    if len(sys.argv) == 2 and sys.argv[1] in ['--debug=True', '--debug=true', '--debug', '-d']:
        debug = True

    # Identify lookup source (Pubchem in this case)
    lookup_source = 'Pubchem'

    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}

        # Resolve the identifier to a list of CIDs.
        cids = []
        identifier_type = ''

        if not namespace:
            identifier_type = classify(identifier)
            # Structure identifiers (a "smiles" may be a false positive) are
            # searched in their own namespace, everything else by name.
            if identifier_type in ['smiles', 'inchi', 'inchikey']:
                lookup = pcp.get_cids(identifier, namespace=identifier_type)
            else:
                lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])
        elif namespace == 'cas':
            cids = pcp.get_cids(identifier, namespace='name')
        else:
            cids = pcp.get_cids(identifier, namespace=namespace)

        if not cids:
            # Last resort: plain name search.
            lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])
            # NOTE(review): identifier_type is only taken from ``namespace`` on
            # this fallback path, so an explicit namespace='cas' that resolves
            # on the first try skips the exact-match check below - confirm
            # this is intended.
            identifier_type = namespace

        # The API returns an empty list when nothing was found.
        if not cids:
            raise RuntimeError('Compound not found in Pubchem.')

        # Use the first hit only.
        cid = cids[0]
        exact_match = True

        # A record without a 'Synonym' entry is treated as having no synonyms.
        synonyms = pcp.get_synonyms(cid)[0].get('Synonym') or []

        # Extract the CAS number from the list of synonyms.
        returned_cas = ''
        for synonym in synonyms:
            cas_nr = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
            if cas_nr:
                returned_cas = cas_nr.group()
                break

        lookup_result = pcp.get_properties(['inchi', 'inchikey',
                                            'canonical_smiles', 'isomeric_smiles',
                                            'iupac_name'],
                                           cid)

        # Double check that PubChem returned exactly what was asked for.
        if identifier_type == 'cas':
            exact_match = identifier in synonyms
        elif identifier_type == 'inchi':
            exact_match = (identifier == lookup_result[0].get('InChI', False))
        elif identifier_type == 'inchikey':
            exact_match = (identifier == lookup_result[0].get('InChIKey', False))

        if not exact_match:
            if debug:
                print(f'Exact match between input and Pubchem return value? {identifier in synonyms}')
            raise ValueError('This is not an exact match on Pubchem!')

        # 'XML' can be replaced with 'JSON' but it is harder to parse later on.
        # PubChem output types:
        # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
        pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

        r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)

        # Accept only a direct OK answer (status 200 and no redirect).
        if not (r.status_code == 200 and len(r.history) == 0):
            raise RuntimeError('pKa not found in Pubchem.')

        tree = ET.fromstring(r.text)

        # Only the <Information> subtree is of interest.
        info_node = tree.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}Information')
        if info_node is None:
            # Explicit failure instead of an AttributeError on ``None``.
            raise RuntimeError('pKa not found in Pubchem.')

        # Get the pKa reference:
        original_source = info_node.find('{http://pubchem.ncbi.nlm.nih.gov/pug_view}Reference').text
        # Get the pKa result:
        pka_result = info_node.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}String').text
        pka_result = re.sub(r'^pKa = ', '', pka_result)  # remove 'pka = ' part out of the string answer

        core_result = {
            'source': lookup_source,
            'Pubchem_CID': str(cid),
            'pKa': pka_result,
            'reference': original_source,
            'Substance_CASRN': returned_cas,
        }

        # Merge in the PubChem properties: drop 'CID' (already reported as
        # 'Pubchem_CID') and rename three keys.
        renamed_keys = {
            'CanonicalSMILES': 'Canonical_SMILES',
            'IsomericSMILES': 'Isomeric_SMILES',
            'IUPACName': 'IUPAC_Name',
        }
        extra_info = {renamed_keys.get(key, key): value
                      for key, value in lookup_result[0].items()
                      if key != 'CID'}
        return {**core_result, **extra_info}

    except Exception as error:
        if debug:
            # Positional arguments: the ``etype`` keyword no longer exists on
            # Python 3.10+, where passing it raises TypeError.
            traceback_str = ''.join(traceback.format_exception(type(error), error, error.__traceback__))
            print(traceback_str)

        return None
def pcp_getter(drug, request='name'):
    """Return the PubChem synonyms of ``drug`` joined with ';'.

    :param drug: identifier passed to ``pcp.get_synonyms``.
    :param request: namespace of ``drug`` passed to ``pcp.get_synonyms``
        (defaults to ``'name'``).
    :return: (str) synonyms of the first result separated by ``;``, or an
        empty string when there is no result or it carries no synonyms.
    """
    p = pcp.get_synonyms(drug, request)
    if not p:
        return ""
    # ``.get`` keeps a record without a 'Synonym' key from raising KeyError.
    return ";".join(p[0].get("Synonym", []))
import pandas
import pubchempy as pcp

# Fetch the compound with PubChem CID 5090 and assign it to c.
c = pcp.Compound.from_cid(5090)

# Search PubChem compounds by the name 'Quercetin' and assign the result of
# the search to results.
results = pcp.get_compounds('Quercetin', 'name')
# Parenthesised single argument: valid, with identical output, on Python 2 and 3.
print(results)

# Get the synonyms PubChem returns for the name 'Quercetin' and assign them to Qsyn.
Qsyn = pcp.get_synonyms('Quercetin', 'name')
# For every "CMGName,listkey" line: fetch the CIDs stored under the PubChem
# list key, collect the CASRNs found in their synonyms and append them to
# Results/<CMGName>.csv. Any failure is logged to Results/<CMGName>_error.txt.
for line in keymaster:
    data = line.split(',', 2)
    CMGName = data[0]
    key = str(data[1]).replace('\n', '')
    print("Getting cids list for " + CMGName + " with key " + key)
    try:
        rr = requests.get('http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/listkey/%s/cids/JSON' % key)
        cids = rr.json()['IdentifierList']['CID']

        # Split the big list of cids into smaller lists
        list_length = len(cids)
        batch_indexes = batchIndexes(list_length)

        cas_rns = []
        # ``with`` closes (and flushes) the results file even when a batch fails.
        with open('Results/%s.csv' % CMGName, 'a') as findings:
            for index in batch_indexes:
                # Single pre-formatted argument: prints identically on Python 2 and 3.
                print("Processing pcp.get_synonyms with cids batch  %s" % (index,))
                # NOTE(review): index[1] is used as an exclusive slice end; if
                # batchIndexes returns inclusive ends, the last CID of each
                # range is skipped - verify against batchIndexes.
                results = pcp.get_synonyms(cids[index[0]:index[1]])
                print("Finding CASRN matches in the synonyms ...")
                for result in results:
                    for syn in result.get('Synonym', []):
                        match = re.match(r'(\d{2,7}-\d\d-\d)', syn)
                        if match:
                            cas_rns.append(match.group(1))

            print("Writing results to file ...")
            for element in cas_rns:
                findings.write(CMGName + ',' + element + '\n')

    except Exception as e:
        # str(e) works on Python 2 and 3; ``e.message`` exists on Python 2 only.
        error_message = str(e)
        print("Checking " + CMGName + " throws error:")
        print(error_message)
        with open('Results/%s_error.txt' % CMGName, 'a') as findings:
            findings.write(CMGName + ": something is wrong.\nError message is: " + error_message + '\n')