def test_parse_pubchem_compounds(filename):
    """
    Map every metabolite name of a metabolomics spreadsheet to PubChem substances.

    The 'Compound Method' column may hold several names separated by '/'.
    Each name is moved to its own row (lower-cased, in a new 'Metabolite'
    column) and then looked up by name in PubChem. Names whose lookup fails
    with KeyError or a timeout are skipped.

    :param filename: A string denoting the path to the metabolomics Excel file
    :return: all_compounds: A Pandas dataframe with the PubChem substance
        records of all metabolites that could be looked up; the queried name
        is stored in the 'Name' column. An empty dataframe is returned when
        no lookup succeeded.
    """
    fileData = pd.read_excel(filename)

    # One row per '/'-separated name; joining on the original row index
    # repeats the remaining columns for every name of that row.
    metabolite_names = (
        fileData['Compound Method'].str.split('/', expand=True)
        .stack()
        .reset_index(level=1, drop=True)
        .rename('Metabolite'))
    fileData = fileData.drop('Compound Method', axis=1).join(metabolite_names)
    fileData['Metabolite'] = fileData['Metabolite'].str.lower()

    all_compounds = []
    for metabolite in fileData['Metabolite'].tolist():
        try:
            df = pcp.get_substances(identifier=metabolite,
                                    namespace='name',
                                    as_dataframe=True)
            df['Name'] = metabolite
        except (KeyError, TimeoutError, pcp.TimeoutError):
            # Best effort: a metabolite that cannot be resolved is skipped.
            continue
        print(df)
        all_compounds.append(df)

    # pd.concat raises ValueError on an empty list, so handle "nothing found"
    # explicitly instead of crashing after all the lookups were done.
    if not all_compounds:
        return pd.DataFrame()
    return pd.concat(all_compounds)
def addinchikey(excelpathin, excelpathout):
    '''
    Add PubChem identifiers to a Compound Discoverer export and save it as Excel.

    Only the columns 'Structure', 'Name' and 'Formula' of the input are kept.
    Every non-empty 'Structure' cell (an SDF record) is sent to PubChem and
    three columns are appended: 'inchikey', 'source_name' and 'source_id'.
    Rows with an empty structure, or for which PubChem returns no compound /
    no substance, get NaN in the affected columns.

    :param excelpathin: The path to Excel file which is the output of the Compound Discoverer.
                        in the format r'path' - r'D:/BCDD/Documents/TalCompounds_export_test.xlsx"
    :param excelpathout: path to output Excel after the merge.
    :return: None. The result is written to excelpathout.
    '''
    CD = pd.read_excel(excelpathin)
    CD = pd.DataFrame(CD, columns=['Structure', 'Name', 'Formula'])

    inchikeys = []
    source_names = []
    source_ids = []
    for idx, sdf in enumerate(CD.Structure):
        # Pause every 50 rows so that PubChem does not block us.
        if idx % 50 == 0:
            time.sleep(3.25)

        inchikey = source_name = source_id = np.nan
        if not pd.isnull(sdf):
            comp = pcp.get_compounds(sdf, 'sdf')
            # An unknown structure yields no compound, or a Compound without a CID.
            if comp and comp[0].cid is not None:
                inchikey = comp[0].inchikey
                # NOTE(review): the compound CID is passed with the 'sid'
                # namespace, i.e. it is looked up as a substance ID - confirm
                # that this is intended.
                substance = pcp.get_substances(comp[0].cid, 'sid')
                if substance:
                    source_name = substance[0].source_name
                    source_id = substance[0].source_id

        inchikeys.append(inchikey)
        source_names.append(source_name)
        source_ids.append(source_id)

    # Build the new columns on CD's own index so that they line up row by row.
    pubchem_cols = pd.DataFrame(
        {
            'inchikey': inchikeys,
            'source_name': source_names,
            'source_id': source_ids,
        },
        index=CD.index)
    CD = pd.concat([CD, pubchem_cols], axis=1, sort=False)

    # Export the merged data; the context manager saves and closes exactly once.
    with pd.ExcelWriter(excelpathout, engine='xlsxwriter') as writer:
        CD.to_excel(writer, header=True)
def queryPubChem(data):
    """
    Look up every metabolite name of a metabolomics dataframe in PubChem and
    append the matching substance records (one row per synonym) to a CSV file.

    The names are taken from the 'Compound Method' column, where several names
    may be separated by '/'. Each name is lower-cased and queried by name
    through the PubChem API. Names whose lookup raises KeyError or a timeout
    are skipped.

    :param data: A Pandas Dataframe of the metabolomics data with the common
        metabolite identifiers under the 'Compound Method' column
    :return: None. The results are appended (without header and index) to
        '~/Data/Mappings/ME1/pubmed_me1_query.csv'.
    """
    import pubchempy as pcp

    # Give every '/'-separated name of 'Compound Method' its own row.
    without_method = data.drop('Compound Method', axis=1)
    split_names = data['Compound Method'].str.split('/', expand=True)
    metabolite_col = split_names.stack().reset_index(
        level=1, drop=True).rename('Metabolite')
    data = without_method.join(metabolite_col)
    data['Metabolite'] = data['Metabolite'].str.lower()
    queries = data['Metabolite'].tolist()

    print(
        "Mapping metabolite names to PubChem database for synonym matching and ID retrieval."
    )

    # The results are too large to keep in memory, so each hit table is
    # appended to the CSV file right away.
    for metabolite in queries:
        try:
            hits = pcp.get_substances(identifier=metabolite,
                                      namespace='name',
                                      as_dataframe=True)
            hits['Name'] = metabolite
            hits = hits.applymap(str)
            # Same row expansion as above, this time on the stringified
            # 'synonyms' column split at ','.
            other_cols = hits.drop('synonyms', axis=1)
            synonym_col = hits['synonyms'].str.split(',', expand=True)
            synonym_col = synonym_col.stack().reset_index(
                level=1, drop=True).rename('synonyms')
            hits = other_cols.join(synonym_col)
            hits.to_csv('~/Data/Mappings/ME1/pubmed_me1_query.csv',
                        mode='a',
                        header=False,
                        index=False)
        except (KeyError, TimeoutError, pcp.TimeoutError):
            continue

    print("Finished metabolite common name -> identifier synoynm matching!")
def request_pubchem(cas, name, en_name, trans):
    """
    Resolve a chemical to a PubChem compound record.

    The lookup tries, in this order: the PubChem search page with the CAS
    number (or the translated name when no CAS is given), the search page with
    the translated name, the PubChem compound-by-name API, and finally the
    substance-by-name API.

    :param cas: CAS number as string; may be empty or whitespace only
    :param name: original name, used (capitalized) when en_name is empty
    :param en_name: English name, if known
    :param trans: passed through to _translate
    :return: tuple (data, structure, en_name): the compound as dict ({} when
        nothing was found), the result of _get_structure ('' when nothing was
        found) and the translated name that was used for searching
    """
    print(cas, name, en_name)
    en_name = _translate(en_name if en_name else name.capitalize(), trans)
    cas = cas.strip()
    # print(name, '-->', en_name, '(en), CAS: {}'.format(cas))

    # First the CAS search (name search without CAS), then the same with the
    # translated name. A hit shows up as a compound URL after redirection.
    # NOTE(review): without a CAS number both searches use the same term.
    first_term = 'CAS-{}'.format(cas) if cas else en_name
    for term in (first_term, en_name):
        response = requests.get(PC_SEARCH, params={'term': term})
        match = PC_COMPOUND_re.search(response.url)
        if match is not None:
            cid = match.group(1)
            compound = pcp.Compound.from_cid(int(cid))
            return compound.to_dict(), _get_structure(cid), en_name

    data = {}
    structure = ''
    try:
        compound = pcp.get_compounds(en_name, 'name')[0]
        data = compound.to_dict()
        structure = _get_structure(str(compound.cid))
    except IndexError:
        # Try to find as substance
        try:
            substance = pcp.get_substances(en_name, 'name')[0]
            compound = pcp.Compound.from_cid(substance.cids[0])
            data = compound.to_dict()
            structure = _get_structure(str(compound.cid))
        except IndexError:
            pass
    return data, structure, en_name
def search_pubchem_id(generic):
    """
    Search PubChem by name in both the substance and the compound domain.

    The IDs are cut out of the textual form of the result objects
    (characters 10..-1 for substances, 9..-1 for compounds).

    :param generic: name to search for
    :return: tuple (comp_ids, sub_ids); comp_ids is a list of
        [cid string, inchikey] pairs or None when no compound was found,
        sub_ids is a (possibly empty) list of sid strings
    """
    substances = pcp.get_substances(generic, 'name')
    compounds = pcp.get_compounds(generic, 'name')

    sub_ids = []
    if substances:
        for substance in substances:
            sid_text = str(substance)[10:-1]
            print("sid:", substance, sid_text)
            sub_ids.append(sid_text)

    comp_ids = []
    if compounds:
        print(compounds)
        for compound in compounds:
            cid_text = str(compound)[9:-1]
            print("cid:", compound, cid_text)
            comp_ids.append([cid_text, compound.inchikey])

    # An empty compound list is reported as None.
    return (comp_ids or None), sub_ids
def pubchem(compound, isCompound=True, report=False):
    """
    Collect drug information for a name from PubChem.

    A compound with that name is looked up first; when one exists, its Entrez
    'pccompound' summary is returned as a dict. Otherwise the name is searched
    among the substances and the synonym lists of all hits are returned.

    :param compound: name to search for
    :param isCompound: not used by this function
    :param report: when equal to False the findings are printed while
        searching; when equal to True only an "Importing ..." line is printed
    :return: dict with the keys weight, formula, names, donors, acceptors,
        rotatable, XLogP and action for a compound hit; a list with the
        synonyms of every substance hit; "" when nothing was found
    """
    def drug_form(compound, report=False):
        '''Fetch the Entrez summary of a pcp compound, print it unless a report is built, and return it'''
        cid = compound.cid
        Entrez.email = import_email()
        handle = Entrez.esummary(db="pccompound", id=cid)  # Obtain data
        summary = Entrez.read(handle)[0]

        wanted = ('PharmActionList', 'MolecularWeight', 'MolecularFormula',
                  'HydrogenBondDonorCount', 'HydrogenBondAcceptorCount',
                  'XLogP', 'SynonymList', 'RotatableBondCount')
        (action, weight, formula, donors, acceptors, XLogP, names,
         rotatable) = (summary[key] for key in wanted)

        if report == False:
            # Slicing stops at the end of the list, so short lists need no
            # special case here.
            print("Names: " + str(names[0:5]))
            print("Weight: " + str(weight))
            print("Formula: " + str(formula))
            counts = ", ".join(str(n) for n in (donors, acceptors, rotatable))
            print("Donors, acceptors and rotatables: " + counts)
            print("XLogP: " + str(XLogP))
            shown_actions = action[0:10] if len(action) >= 10 else action
            print("Actions: " + str(shown_actions))

        return {
            "weight": weight,
            "formula": formula,
            "names": names,
            "donors": donors,
            "acceptors": acceptors,
            "rotatable": rotatable,
            "XLogP": XLogP,
            'action': action
        }

    # Deliberately an equality test: only a value equal to False enables the
    # progress output.
    chatty = report == False

    if report == True:
        print("Importing relevant data from the query {}".format(compound))

    hits = pcp.get_compounds(compound, 'name')
    if len(hits) > 0:  # there is a compound with the name "compound"
        return drug_form(hits[0], report)

    if chatty:
        print(
            "No results have been found using get_compounds, proceeding with substance search..."
        )
    hits = pcp.get_substances(compound, 'name')
    if len(hits) == 0:
        if chatty:
            print("No drug information has been found")
        return ""

    # Substances were found: collect the synonyms of each of them.
    if chatty:
        print("Substances found: " + str(hits))
        print("\nFinding all synonyms...")
    info = []
    for substance in hits:
        if chatty:
            print(substance.synonyms)
        info.append(substance.synonyms)
    return info
def addcols_joindata(excelpathin, jsonpathin, excelpathout, row_start=1000,
                     row_stop=2001):
    '''
    Add PubChem identifiers to a Compound Discoverer export and merge it with parsed HMDB data.

    The function takes the input Excel file and keeps the relevant columns -
    'Structure', 'Name', 'Formula'. From 'Structure' it creates another 3
    columns - InChIKey, source_name, source_id. Then it takes the JSON file
    with the parsed HMDB data and merges it with the modified Excel data:
    first by InChIKey, then (fuzzy) by name and finally by chemical formula.
    Rows without any match are kept, with empty HMDB columns.
    This function combines the addinchikey and joindata functions.

    :param excelpathin: The path to Excel file which is the output of the Compound Discoverer.
                        in the format r'path' - r'D:/BCDD/Documents/TalCompounds_export_test.xlsx"
    :param jsonpathin: Path to JSON file - parsed XML file from HMDB
    :param excelpathout: Path to output Excel after the merge.
    :param row_start: position of the first input row to process (None = from the first row)
    :param row_stop: position after the last input row to process (None = to the last row)
    :return: Dataframe with the columns of the Excel file plus the HMDB
             columns from the JSON; the same data is written to excelpathout
    '''
    start_time = time.time()
    CD = pd.read_excel(excelpathin)
    CD = pd.DataFrame(CD[row_start:row_stop],
                      columns=['Structure', 'Name', 'Formula'])

    # Query PubChem for every structure; NaN where nothing can be looked up.
    inchikeys = []
    source_names = []
    source_ids = []
    for idx, sdf in enumerate(CD.Structure):
        print(idx)
        # Pause every 50 rows so that PubChem does not block us.
        if idx % 50 == 0:
            print("--- %s seconds --f-time to %s rows" %
                  ((time.time() - start_time), idx))
            time.sleep(3.25)

        inchikey = source_name = source_id = np.nan
        if not pd.isnull(sdf):
            comp = pcp.get_compounds(sdf, 'sdf')
            # An unknown structure yields no compound, or Compound() whose cid is None.
            if comp and comp[0].cid is not None:
                # NOTE(review): the compound CID is passed with the 'sid'
                # namespace, i.e. it is looked up as a substance ID - confirm
                # that this is intended.
                substance = pcp.get_substances(comp[0].cid, 'sid')
                inchikey = comp[0].inchikey
                if len(substance) > 0:
                    source_name = substance[0].source_name
                    source_id = substance[0].source_id

        inchikeys.append(inchikey)
        source_names.append(source_name)
        source_ids.append(source_id)

    # The new columns must carry CD's index: CD is a slice of the sheet, and
    # concat(axis=1) aligns on the index, so a fresh 0-based index would
    # attach the values to the wrong rows.
    pubchem_cols = pd.DataFrame(
        {
            'InChIKey': inchikeys,
            'source_name': source_names,
            'source_id': source_ids,
        },
        index=CD.index)
    CD = pd.concat([CD, pubchem_cols], axis=1, sort=False)
    print("--- %s seconds --f-add 3 cols" % (time.time() - start_time))

    # From here is the joindata function with modification
    # Load the parsed HMDB file
    with open(jsonpathin, 'r') as read_file:
        data = json.load(read_file)
    start_time = time.time()

    # Create a data frame from the list of dictionaries. drop() returns a new
    # frame, so the result has to be assigned for the columns to go away.
    df_hmdb = pd.DataFrame(data)
    df_hmdb = df_hmdb.drop(
        ['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'],
        axis=1)
    df_excel = CD

    # Merge by inchikey
    joindata_by_inchikey = pd.merge(left=df_excel,
                                    right=df_hmdb,
                                    how='inner',
                                    left_on='InChIKey',
                                    right_on='inchikey')
    print("--- %s seconds --f-merge by inchikey " % (time.time() - start_time))
    start_time = time.time()

    # Reduce both data sets to the rows that were NOT matched by inchikey
    df_hmdb_reduce_byinchik = df_hmdb.loc[
        ~df_hmdb['inchikey'].isin(df_excel['InChIKey'])]
    df_excel_reduce_byinchik = df_excel.loc[
        ~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]

    joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik,
                                                    df_hmdb_reduce_byinchik,
                                                    left_on="Name",
                                                    right_on="name")
    # Keep only matches with best_match_score > 0.55; the threshold may need adjusting
    joindata_by_name = joindata_by_name[
        joindata_by_name['best_match_score'] > 0.55]
    # Drop the helper columns added by fuzzymatcher
    joindata_by_name = joindata_by_name.drop(
        ['best_match_score', '__id_left', '__id_right'], axis=1)
    print("--- %s seconds --f-merge by name" % (time.time() - start_time))
    start_time = time.time()

    # Reduce both data sets to the rows matched neither by inchikey nor by name
    df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[
        ~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
    # Work on a copy so that the 'Formula' clean-up below does not write
    # into a view of df_excel (source of the former pandas warning).
    df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
        ~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])].copy()
    # Remove spaces between letters on 'Formula'
    df_excel_reduce_byname['Formula'] = df_excel_reduce_byname[
        'Formula'].str.replace(' ', '')

    # Merge by chemical_formula
    joindata_by_CF = pd.merge(left=df_excel_reduce_byname,
                              right=df_hmdb_reduce_byname,
                              how='inner',
                              left_on='Formula',
                              right_on='chemical_formula')

    # Rows from the original EXCEL file without any match (neither by
    # inchikey nor by name nor by chemical formula)
    df_excel_reduce_byCF = df_excel_reduce_byname.loc[
        ~df_excel_reduce_byname['Formula'].isin(
            joindata_by_CF['chemical_formula'])]

    # The merge result lists the Excel columns first; everything after them
    # comes from the HMDB JSON data.
    colnames = joindata_by_inchikey.columns[len(df_excel.columns):]
    # Add those names as empty columns to the unmatched rows.
    reducedata = df_excel_reduce_byCF.reindex(
        columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])

    # Stack all the data sets (DataFrame.append no longer exists in pandas 2)
    out = pd.concat(
        [joindata_by_inchikey, joindata_by_name, joindata_by_CF, reducedata],
        sort=False)
    print("--- %s seconds --f-merge by CF" % (time.time() - start_time))

    # Export the merged data; the context manager saves and closes exactly once.
    with pd.ExcelWriter(excelpathout, engine='xlsxwriter') as writer:
        out.to_excel(writer, header=True)
    return out
def insert_interaction(interaction_name):
    """
    Return the ID of an interaction, inserting it into the database if needed.

    Resolution order, first hit wins:
      1. an existing entry (get_interaction),
      2. a known composition (get_composition) -> inserted with flag "0",
      3. a known therapeutic name (get_therapeutic_from_names) -> flag "1",
      4. a PubChem substance with the translated name -> flag "0",
      5. otherwise the drug class returned by get_drug_class_from_drugs_site
         is matched against the bundled drug class files -> flag "1".
    If anything in steps 4/5 raises, a lookup via get_code/check is tried
    before step 5 is repeated.

    NOTE(review): the meaning of the flags "0"/"1" is presumably
    "substance" vs. "therapeutic class" - confirm against insert_interction_db.

    :param interaction_name: name of the interaction to look up / insert
    :return: interaction_id[0][0] for an existing entry, otherwise whatever
        insert_interction_db returns
    """
    print(interaction_name)
    translated = translate(interaction_name, 'en', 'ar')
    translated_spelled = get_google_spelling(translated)
    interaction_id = get_interaction(interaction_name)
    if interaction_id:
        return interaction_id[0][0]
    comp_id = get_composition(interaction_name)
    if comp_id:
        interaction_id = insert_interction_db(interaction_name, translated_spelled, "0")
        return interaction_id
    thera_id = get_therapeutic_from_names(interaction_name, translated_spelled)
    if thera_id:
        thera_id = insert_interction_db(interaction_name, translated_spelled, "1")
        return thera_id
    try:
        results = pcp.get_substances(translated_spelled, 'name')
        if len(results) > 0:
            print("from pcp")
            interaction_id = insert_interction_db(interaction_name, translated_spelled, "0")
            return interaction_id
        else:
            # Reduce the translated name to letters and spaces before asking
            # the drug class site.
            res = re.sub(" - ", "", translated_spelled)
            res = re.sub("-", "", res)
            # NOTE(review): this pattern only matches whitespace enclosed in
            # literal '/' characters - looks like a JavaScript-style regex
            # literal; confirm the intent.
            res = re.sub(r"/\s+/", " ", res)
            res = re.sub(r"[^a-zA-Z ]", "", res)
            res = res.strip()
            print(res)
            thera = get_drug_class_from_drugs_site(res)
            if len(thera) > 0:
                print("thera now " , thera[0])
                thera_ar = get_therapeutic_arabic_name(thera[0])
                # NOTE(review): thera_id is always falsy here (a truthy value
                # returned above), so this branch never runs - presumably
                # thera_ar was meant; confirm.
                if thera_id:
                    interaction_id = insert_interction_db(thera_ar, translated_spelled, "1")
                    return interaction_id
                else:
                    # NOTE(review): neither file handle is closed.
                    file_en_dir = "utils/drug_classes_drug.txt"
                    file_en_dir = pkg_resources.resource_filename(__name__, file_en_dir)
                    file_en = open(file_en_dir, 'r', encoding="utf-8")
                    parts_en = file_en.read()
                    file_ar_dir = "utils/drug_classes_ar.txt"
                    file_ar_dir = pkg_resources.resource_filename(__name__, file_ar_dir)
                    parts_en = re.split('[\n]', parts_en)
                    file_ar = open(file_ar_dir, 'r', encoding="utf-8")
                    # NOTE(review): parts_ar stays one string (it is not split
                    # into lines like parts_en), so parts_ar[i] below is a
                    # single character - confirm.
                    parts_ar = file_ar.read()
                    # Strip filler words so that class names compare equal.
                    drug_site_class = re.sub(" agents", " ", thera[0])
                    drug_site_class = re.sub("agents ", " ", drug_site_class)
                    drug_site_class = re.sub(" for ", " ", drug_site_class)
                    drug_site_class = re.sub(" drugs", " ", drug_site_class)
                    drug_site_class = re.sub("drugs ", " ", drug_site_class)
                    drug_site_class = re.sub("-", "", drug_site_class)
                    for i in range(len(parts_en)):
                        # Same normalization for every line of the English file
                        # (only the text before the first '.').
                        parts_en[i] = parts_en[i].split('.')[0]
                        parts_en[i] = re.sub(" agents", " ", parts_en[i])
                        parts_en[i] = re.sub("agents ", " ", parts_en[i])
                        parts_en[i] = re.sub(" for ", " ", parts_en[i])
                        parts_en[i] = re.sub(" drugs", " ", parts_en[i])
                        parts_en[i] = re.sub("drugs ", " ", parts_en[i])
                        parts_en[i] = re.sub("-", "", parts_en[i])
                        if drug_site_class.strip().lower() == parts_en[i].strip().lower():
                            print("matched with thera", parts_ar[i])
                            interaction_id = insert_interction_db(parts_ar[i], translated_spelled, "1")
                            return interaction_id
                    print("couldnt match with thera ")
                    print(interaction_name, translated_spelled)
                    interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
                    return interaction_id
            else:
                print("no thera came back")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
                return interaction_id
    # NOTE(review): this also catches errors raised by the helpers and by
    # insert_interction_db inside the try block, not only PubChem failures.
    except Exception as e:
        results, found = get_code(interaction_name)
        if found and len(results) > 0:
            categories = ["Biologically Active Substance", "Pharmacologic Substance", "Element, Ion, or Isotope", "Organic Chemical", "Antibiotic"]
            found_translation = check(results[0][0], categories)
            if found_translation:
                print("from umls")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "0")
                return interaction_id
        # Same drug class fallback as in the try block.
        res = re.sub(" - ", "", translated_spelled)
        res = re.sub("-", "", res)
        res = re.sub(r"/\s+/", " ", res)
        res = re.sub(r"[^a-zA-Z ]", "", res)
        res = res.strip()
        print(res)
        thera = get_drug_class_from_drugs_site(res)
        if len(thera) > 0:
            print("thera now ", thera[0])
            thera_ar = get_therapeutic_arabic_name(thera[0])
            # NOTE(review): thera_id is always falsy here as well (see above).
            if thera_id:
                print("from thera")
                interaction_id = insert_interction_db(thera_ar, translated_spelled, "1")
                return interaction_id
            else:
                file_en_dir = "utils/drug_classes_drug.txt"
                file_en_dir = pkg_resources.resource_filename(__name__, file_en_dir)
                file_en = open(file_en_dir, 'r', encoding="utf-8")
                parts_en = file_en.read()
                file_ar_dir = "utils/drug_classes_ar.txt"
                file_ar_dir = pkg_resources.resource_filename(__name__, file_ar_dir)
                parts_en = re.split('[\n]', parts_en)
                file_ar = open(file_ar_dir, 'r', encoding="utf-8")
                parts_ar = file_ar.read()
                for i in range(len(parts_en)):
                    # NOTE(review): unlike in the try block, the lines of
                    # parts_en are not normalized here; only the class name is.
                    drug_site_class = re.sub(" agents", " ", thera[0])
                    drug_site_class = re.sub("agents ", " ", drug_site_class)
                    drug_site_class = re.sub(" for ", " ", drug_site_class)
                    drug_site_class = re.sub(" drugs", " ", drug_site_class)
                    drug_site_class = re.sub("drugs ", " ", drug_site_class)
                    drug_site_class = re.sub("-", "", drug_site_class)
                    if drug_site_class.strip().lower() == parts_en[i].strip().lower():
                        print("matched with thera", parts_ar[i])
                        interaction_id = insert_interction_db(parts_ar[i], translated_spelled, "1")
                        return interaction_id
                print("couldnt match with thera ")
                interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
                return interaction_id
        else:
            print("no thera came back")
            interaction_id = insert_interction_db(interaction_name, translated_spelled, "1")
            return interaction_id
def _mol_download_is_invalid(mol_path):
    """Return True when the file at mol_path is binary garbage or an empty mol file."""
    # Read the head through a context manager so the handle is closed before
    # the caller possibly removes the file.
    with open(mol_path, 'rb') as mol_file:
        head = mol_file.read(1024)
    return is_binary_string(head) or is_empty_mol_file(mol_path)


def extract_mol_from_pubchem(cas_nr):
    """
    Download the structure (mol/SDF) file of a CAS number from PubChem.

    The CAS number is first searched among the compounds. The hit is accepted
    only when the CAS number is one of its synonyms. If no compound exists,
    the substances carrying the CAS number as synonym are tried one by one.
    The file is stored as '<cas_nr>.mol' in download_path.

    :param cas_nr: CAS number as string
    :return: -1 when a non-empty file was already downloaded earlier,
        0 when the file was downloaded now, and cas_nr itself when no valid
        file could be obtained (not found, no exact match, failed download
        or any error).
    """
    global download_path
    headers = {
        'user-agent':
        'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }
    try:
        # Using pubchem api for python.
        # Getting CID number, the result of this, by default is exact match.
        # The result is returned as a list.
        cid = pcp.get_cids(cas_nr, 'name')
        file_name = cas_nr + '.mol'
        download_file = Path(download_path) / file_name

        # Nothing to do when a non-empty file is already there.
        if download_file.exists() and os.stat(download_file).st_size != 0:
            return -1

        # The api returns an empty list if it cannot find cas_nr.
        if len(cid) > 0:
            # If Pubchem found the result, get the first result of the list
            cid = cid[0]
            # To double check if the CAS number is correct: get the list of
            # synonyms (a list of dict) and look for the CAS number in it.
            synonyms = pcp.get_synonyms(cid)[0]['Synonym']
            if cas_nr not in synonyms:
                raise ValueError('\tThis is not an exact match!')

            get_sdf_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/sdf'.format(
                cid)
            r = requests.get(get_sdf_url, headers=headers, timeout=15)
            # Accept only an OK status (200) that was not reached by redirect
            if r.status_code == 200 and len(r.history) == 0:
                download_file.write_text(data=r.text)
                if not _mol_download_is_invalid(download_file):
                    return 0
                os.remove(download_file)  # remove the error mol file
            # Failed or invalid download: report the CAS number like every
            # other failure instead of falling through and returning None.
            return cas_nr

        # No compound: try to find substances as well.
        # pcp.get_substances(cas_nr, 'name') returns a list of Substances.
        substances = pcp.get_substances(cas_nr, 'name')
        if len(substances) == 0:
            raise ValueError(
                'Could not find any compounds or substances with this CAS {} on Pubchem.'
                .format(cas_nr))

        for substance in substances:
            # substance.synonyms example:
            # ['12259-21-1', 'Iron oxide (Fe2O3), hydrate', 'Ferric oxide hydrate', ...]
            # A substance without synonyms must not abort the whole scan.
            substance_synonyms = substance.synonyms or []
            # Check to make sure the substance has the same CAS#
            if cas_nr not in substance_synonyms:
                continue
            sdf = pcp.get_sdf(identifier=substance.sid,
                              namespace='sid',
                              domain='substance')
            if not sdf:  # pcp.get_sdf returns None if no SDF was found
                continue
            download_file.write_text(data=sdf)
            if not _mol_download_is_invalid(download_file):
                return 0
            os.remove(download_file)  # remove the error mol file

        # None of the Substances has the same CAS and a valid SDF (mol) file
        return cas_nr
    except Exception as error:
        # Best effort: every failure is reported to the caller as the CAS number.
        if debug:
            print('Error during search structure in Pubchem:\n\t{}'.format(
                error))
        return cas_nr