def save_model(self, request, obj, form, change):
    """Save ``obj`` after refreshing its PubChem display name.

    When the submitted form carries a truthy ``pubchem_id``, the compound is
    fetched with ``Compound.from_cid`` and its first synonym is stored in
    ``obj.pubchem_name``; otherwise the name is cleared.  ``obj.save()`` is
    called in every case.

    Parameters
    ----------
    request, change
        Unused by this implementation.
    obj
        Object being saved; must expose ``pubchem_name`` and ``save()``.
    form
        Form whose ``cleaned_data`` contains the key ``'pubchem_id'``.
    """
    pubchem_id = form.cleaned_data['pubchem_id']
    if pubchem_id:
        synonyms = Compound.from_cid(pubchem_id).synonyms
        # A compound may have no synonyms; indexing [0] unconditionally would
        # raise IndexError and abort the save.
        obj.pubchem_name = synonyms[0] if synonyms else None
    else:
        obj.pubchem_name = None
    obj.save()
def do_search_api():
    """Run a structure search for the current request and return SMILES.

    Request arguments (parsed with ``reqparse``):

    * ``Molecular`` (str): query molecule handed to ``do_search``.
    * ``Type`` (str, default ``'similarity'``): ``'similarity'``,
      ``'substructure'`` or ``'superstructure'``; selects the table searched.
    * ``Cid`` (int): optional PubChem CID.  When given, the compound's
      isomeric SMILES replaces ``Molecular``.

    Returns
    -------
    ``(jsonify({"Smiles": ...}), 200)`` on success; a plain message string
    when no molecule was supplied or the search failed; ``(message, 400)``
    when ``Type`` is not one of the supported values.
    """
    args = (
        reqparse.RequestParser()
        .add_argument("Molecular", type=str)
        .add_argument("Type", type=str, default='similarity')
        .add_argument("Cid", type=int)
        .parse_args()
    )
    molecular_name = args['Molecular']
    search_type = args['Type']
    cid_num = args['Cid']

    tables = {
        'similarity': SIM_TABLE,
        'substructure': SUB_TABLE,
        'superstructure': SUPER_TABLE,
    }
    if search_type not in tables:
        # Previously an unknown type left the table name unbound and the
        # resulting error was reported as "no results".
        return ("Type must be one of: similarity, substructure, "
                "superstructure"), 400
    table_name = tables[search_type]
    top_k = NUM

    if cid_num:
        from pubchempy import Compound
        molecular_name = Compound.from_cid(cid_num).isomeric_smiles

    if not molecular_name:
        return "no molecular"

    try:
        res_smi = do_search(table_name, molecular_name, top_k)
    except Exception:
        # Best-effort: any search failure is reported with the same message,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return "There has no results, please input the correct molecular and ensure the table has data."

    return jsonify({"Smiles": res_smi}), 200
def get_smiles(idx, names=None, cids=None, sids=None, binding_db=None):
    """Resolve a SMILES string for entry ``idx`` using several fallbacks.

    The lookups are tried in this order and the first one that succeeds wins:

    1. ``binding_db`` row whose ``'PubChem CID'`` equals ``int(cids[idx])``.
    2. ``binding_db`` row whose ``'PubChem SID'`` equals ``int(sids[idx])``.
    3. ``Compound.from_cid(int(cids[idx])).isomeric_smiles`` (as ``str``).
    4. The ``source_id`` of ``Substance.from_sid(sids[idx])``, resolved via
       ``CompoundResource().get(...)["smiles"]`` or, failing that, the
       ``binding_db`` row whose ``'ChEMBL ID of Ligand'`` matches.
    5. First hit of ``get_compounds(names[idx], 'name')``.

    Parameters
    ----------
    idx
        Index into ``names``, ``cids`` and ``sids``.
    names, cids, sids
        Indexable collections of compound names, PubChem CIDs and PubChem
        SIDs.  Any of them may be ``None``; the strategies that need it then
        simply fail over to the next one.
    binding_db
        Table indexable by the column names above and providing ``.loc``
        (presumably a pandas DataFrame -- confirm against the caller).

    Returns
    -------
    The SMILES string, or ``np.nan`` when every strategy fails.
    """

    def _binding_db_smiles(column, value):
        rows = binding_db.loc[binding_db[column] == value]
        return rows["Ligand SMILES"].values[0]

    def _local_by_cid():
        return _binding_db_smiles('PubChem CID', int(cids[idx]))

    def _local_by_sid():
        return _binding_db_smiles('PubChem SID', int(sids[idx]))

    def _pubchem_by_cid():
        return str(Compound.from_cid(int(cids[idx])).isomeric_smiles)

    def _via_substance_source():
        chembl_id = Substance.from_sid(sids[idx]).source_id
        print(chembl_id)
        try:
            return CompoundResource().get(chembl_id)["smiles"]
        except Exception:
            return _binding_db_smiles("ChEMBL ID of Ligand", chembl_id)

    def _pubchem_by_name():
        return get_compounds(names[idx], 'name')[0].isomeric_smiles

    strategies = (
        _local_by_cid,
        _local_by_sid,
        _pubchem_by_cid,
        _via_substance_source,
        _pubchem_by_name,
    )
    for strategy in strategies:
        try:
            return strategy()
        except Exception:
            # Deliberate best-effort chain: any ordinary failure falls through
            # to the next lookup.  Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit stop a long run.
            continue
    return np.nan
def from_id(cid):
    """Return ``pubchempy.Compound.from_cid(cid)``.

    A ``float`` CID is converted to ``int`` first, because the identifier
    sometimes arrives as a float; every other type is passed through as-is.
    """
    normalized_cid = int(cid) if isinstance(cid, float) else cid
    return Compound.from_cid(normalized_cid)
def scrape_pubchem_for_inchi():
    """Fill in the InChI code of molecules that have a PubChem CID but no InChI.

    Iterates over every ``Molecule`` with a non-null ``pubchem_id`` (ordered
    by that id).  Molecules whose ``inchi_code`` is the empty string get it
    from ``Compound.from_cid(...).inchi`` and are saved.  ``BadRequestError``
    and ``NotFoundError`` are logged and the molecule is skipped.
    """
    from pubchempy import BadRequestError, Compound, NotFoundError

    with_cid = Molecule.objects.filter(pubchem_id__isnull=False)
    for molecule in with_cid.order_by('pubchem_id'):
        if molecule.inchi_code != "":
            continue  # already populated
        try:
            record = Compound.from_cid(molecule.pubchem_id)
            molecule.inchi_code = record.inchi
            molecule.save()
        except (BadRequestError, NotFoundError):
            logging.error('Invalid PubChem CID: {}'.format(molecule.pubchem_id))
def __call__(self, representation):
    """Fetch the `pubchempy.Compound` for a CID, then pause.

    Parameters
    ----------
    representation : int
        CID integer as per the `pubchempy` documentation for `from_cid`.

    Returns
    -------
    compound : pubchempy.Compound
        Pubchempy representation of the molecule, retrieved from the
        PubChem database.

    """
    fetched = Compound.from_cid(representation)  # calls the PubChem API
    # Wait `self.crawl_delay` seconds after the request so that consecutive
    # calls are spaced out (good scraping practice).
    time.sleep(self.crawl_delay)
    return fetched
def decagon_preprocess(cfg, cid_dict):
    """Convert a drug-pair CSV into ``smiles1, smiles2, side_effect`` rows.

    Reads ``cfg['DATASET']['COMBO_PATH']`` (header row skipped; each data row
    is ``stitch1, stitch2, side_effect, side_effect_name``) and writes one row
    to ``cfg['DATASET']['PROCESSED_PATH']`` for every pair whose side effect is
    in the label list returned by ``filter_interactions``.

    Parameters
    ----------
    cfg
        Configuration mapping; the keys ``COMBO_PATH``, ``PROCESSED_PATH`` and
        ``DATA_NAME`` under ``'DATASET'`` are read here.
    cid_dict
        Mapping from STITCH id to SMILES.  Ids missing from it are fetched
        from PubChem using ``int(stitch[4:])`` as the CID and memoised for the
        rest of the run.
    """
    dataset_cfg = cfg['DATASET']
    in_file = dataset_cfg['COMBO_PATH']
    out_file = dataset_cfg['PROCESSED_PATH']

    # get list of labels that have more interactions than NUM_MIN_INTERACTION
    label_list, _ = filter_interactions(in_file, cfg)

    # Count the data rows up front for the progress bar.  The handle is closed
    # deterministically, and the header line is excluded so the final call
    # reports total/total instead of stopping one short.
    with open(in_file, 'r') as count_handle:
        total = max(sum(1 for _ in count_handle) - 1, 0)

    progress_label = '| Processing {} pairs...'.format(dataset_cfg['DATA_NAME'])
    unknown_dict = {}  # SMILES fetched from PubChem during this run

    with open(in_file, 'r') as csv_in, open(out_file, 'w') as csv_out:
        csv_reader = csv.reader(csv_in, delimiter=',')
        next(csv_reader, None)  # ignore Header
        csv_writer = csv.writer(csv_out)

        for line_idx, line in enumerate(csv_reader):
            printProgress(line_idx + 1, total, progress_label, ' ', 1, 50)
            # _ : string name for side effects
            stitch1, stitch2, side_effect, _ = line
            if side_effect not in label_list:
                continue

            row_string = []
            for stitch in (stitch1, stitch2):
                try:
                    smiles = cid_dict[stitch]
                except KeyError:  # Key doesn't exist in dictionary
                    if stitch in unknown_dict:
                        # already retrieved from pubchempy
                        smiles = unknown_dict[stitch]
                    else:
                        if stitch1[3] == '1' or stitch2[3] == '1':
                            print("Pair has stereo : [%s,%s]" %
                                  (stitch1, stitch2))
                        print(">>> Adding {} to dictionary...".format(stitch))
                        # Obtain smiles from pubchempy
                        smiles = Compound.from_cid(
                            int(stitch[4:])).canonical_smiles
                        # update unknown dict for future reference
                        unknown_dict[stitch] = smiles
                row_string.append(smiles)
            row_string.append(side_effect)
            csv_writer.writerow(row_string)
def main(argv):
    """Main entry point of program.

    Parses the command line, fetches every requested CID from PubChem and
    prints the raw record followed by a summary of each compound.

    Parameters
    ----------
    argv
        Not consulted: ``parse_args()`` is called without arguments, so
        argparse reads ``sys.argv[1:]``.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Retrieves compounds from PubChem using either a list of CIDs or'
                    ' identifier and type, e.g. Glucose name .\n',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('cid', metavar='cid', nargs='+', action='store',
                        help='CIDs to fetch.')
    # parser.add_argument('output_dir', metavar='output_dir', action='store',
    #                     help='Output directory to place csvs into.')
    # NOTE: the -s option is accepted but the search is not implemented below.
    parser.add_argument('-s', nargs=2, metavar=('identifier', 'type'),
                        action='store', dest='search',
                        help='Identifier and type to search for: e.g. -s Glucose name\n'
                             ' Types supported: cid, name, smiles, sdf, inchi, inchikey, formula.\n')
    args = parser.parse_args()
    # output_dir = args.output_dir

    # ``cid`` is declared with nargs='+', so it is a list: fetch each entry on
    # its own instead of passing the whole list to a single from_cid() call.
    # Single-argument print() calls behave the same on Python 2 and 3.
    for cid in args.cid:
        c = Compound.from_cid(cid)
        pprint(c.record)
        print("Synonyms " + ";".join(c.synonyms))
        print(c.iupac_name)
        # alternate name
        # metabolic_network_id
        print(c.cid)
        # chebi
        # kegg
        # bigg
        # HMDB
        # DrubBank
        print(c.inchi)
        print(c.inchikey)
        print(c.canonical_smiles)
        # protein associations
        print(c.exact_mass)
        print(c.molecular_weight)
        print(c.molecular_formula)
        print(c.charge)
def save_model(self, request, obj, form, change):
    """Save ``obj`` after refreshing its ChEBI and PubChem display names.

    ChEBI: when the form has a truthy ``chebi_id``, the entity is looked up;
    if it reports a parent id, the first run of digits in that id replaces
    ``chebi_id`` and the parent entity is used instead.  ``obj.chebi_id`` and
    ``obj.chebi_name`` are then set.  Without a ``chebi_id`` the name is
    cleared.

    PubChem: when the form has a truthy ``pubchem_id``, the first synonym of
    the compound becomes ``obj.pubchem_name``; otherwise the name is cleared.

    ``obj.save()`` is called in every case.

    Parameters
    ----------
    request, change
        Unused by this implementation.
    obj
        Object being saved; must expose ``save()``.
    form
        Form whose ``cleaned_data`` contains ``'chebi_id'`` and
        ``'pubchem_id'``.
    """
    chebi_id = form.cleaned_data['chebi_id']
    if chebi_id:
        chebi_comb = ChebiEntity('CHEBI:' + str(chebi_id))
        parent_id = chebi_comb.get_parent_id()
        if parent_id:
            digits = re.search(r'\d+', parent_id)
            # Only switch to the parent when its id actually contains a
            # number; indexing findall()[0] blindly could raise IndexError.
            if digits:
                chebi_id = int(digits.group(0))
                chebi_comb = ChebiEntity('CHEBI:' + str(chebi_id))
        obj.chebi_id = chebi_id
        obj.chebi_name = chebi_comb.get_name()
    else:
        obj.chebi_name = None

    pubchem_id = form.cleaned_data['pubchem_id']
    if pubchem_id:
        synonyms = Compound.from_cid(pubchem_id).synonyms
        # A compound may have no synonyms; avoid IndexError on [0].
        obj.pubchem_name = synonyms[0] if synonyms else None
    else:
        obj.pubchem_name = None
    obj.save()
from numpy import random
from pubchempy import get_compounds, Compound

# comp = Compound.from_cid(6602565)
# print(comp.isomeric_smiles)

# comps = get_compounds('Aspirin', 'name')
# print(comps[0].xlogp)
# # 1.2

import csv

# Collect the identifier (column 2) of every row labelled "Active" (column 3).
with open('inhibitor_all.csv', 'r') as inhibitor_all:
    reader = csv.reader(inhibitor_all, delimiter=',', quotechar='"')
    # next(reader, 5) # skip the headers
    # Rows with fewer than four columns are skipped instead of raising
    # IndexError.
    Actives = [compound[2] for compound in reader
               if len(compound) > 3 and compound[3] == "Active"]

# Write one "1, <id>, <isomeric SMILES>" row per active compound.
with open('inhibitor_active.csv', 'w') as inhibitor_active:
    # One writer for the whole file; it used to be rebuilt on every iteration.
    writer = csv.writer(inhibitor_active, delimiter=',')
    # writer.writerow(["Active", "mol_id", "smiles"]) # write header
    for active in Actives:
        comp = Compound.from_cid(active)
        print(comp.isomeric_smiles)
        writer.writerow([1, active, comp.isomeric_smiles])

# The original printed len(Active), a name that is never defined.
print(len(Actives))
# break

# with open('inhibitor_all.csv', 'r') as inhibitor_all:
#     reader = csv.reader(inhibitor_all, delimiter=',', quotechar='"')
#     # next(reader, 5) # skip the headers
#     Inactive = [compound[2] for compound in reader if compound[3]=="Inactive"]
#     for x in xrange(12):
#         Inactive = random.choice(Inactive, 3880)
#     print(len(Inactive))
#! /home/caleb/.anaconda3/bin/python
import sys
from pubchempy import get_compounds, Compound


def format_compound_row(cid):
    """Return the tab-separated database row for PubChem compound ``cid``.

    Columns: CID, CAS (always empty here), formula, molecular weight,
    isomeric SMILES, InChI, InChIKey, IUPAC name, then every synonym.
    """
    pc = Compound.from_cid(cid)
    iupac_name = pc.iupac_name
    if iupac_name is None:
        iupac_name = ''
    names = pc.synonyms
    smi = pc.isomeric_smiles
    inchi_val = pc.inchi
    inchikey = pc.inchikey
    # The weight may be delivered as a string; '%g' needs a number.
    mw = float(pc.molecular_weight)
    formula = pc.molecular_formula
    CAS = ''
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (
        cid, CAS, formula, mw, smi, inchi_val, inchikey, iupac_name)
    s += '\t'.join(names)
    return s


def main(argv=None):
    """Print one row per CID given in ``argv`` (default: ``sys.argv[1:]``)."""
    if argv is None:
        argv = sys.argv[1:]
    for raw_cid in argv:
        print(format_compound_row(int(raw_cid)))


if __name__ == '__main__':
    main()

# Example output row (tab-separated):
#4 78-96-6 C3H9NO 75.10966 CC(CN)O C3H9NO/c1-3(5)2-4/h3,5H,2,4H2,1H3 HXKKHQJGJAFBHI-UHFFFAOYSA-N 1-azanylpropan-2-ol 1-amino-2-propanol
import pickle
from rdkit.Chem import MolFromSmiles
from pubchempy import Compound

# --- Reference features ------------------------------------------------------
# Expected feature values, loaded once from the pickled fixture file and split
# by feature family.
with open('tests/files/features.pkl', 'rb') as f:
    REF_FEATURES = pickle.load(f)

STD_FEATURES, MAP4_FEATURES, PUB_FEATURES = (
    REF_FEATURES[family] for family in ('standard', 'map4', 'pubchem')
)

# --- RDKit test molecule -----------------------------------------------------
SMILES = 'O=C1C(=Cc2ccccc2)CCCC1=Cc1ccccc1'
MOL = MolFromSmiles(SMILES)
assert MOL, 'Error creating the testing mol object.'

# --- PubChem test compound ---------------------------------------------------
# Fetched from PubChem at import time.
cid = 5090
PUB_MOL = Compound.from_cid(cid)
assert isinstance(
    PUB_MOL, Compound), 'Error creating the testing pubchem Compound object.'
# kernelspec: # display_name: Python 3 # language: python # name: python3 # --- # %% import pubchempy as pcp import pandas from pubchempy import Compound, get_compounds import logging logging.basicConfig(level=logging.DEBUG) #Fetch a compound with cid #c = pcp.Compound.from_cid(5090) c = Compound.from_cid(1423) cs1 = get_compounds('Aspirin', 'name') cs2 = get_compounds('C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1', 'smiles') # %% #To get 3d information about compounds cs1 = pcp.get_compounds('Aspirin', 'name', record_type='3d') cs1 # %% c.to_dict() # %% #Fetch a list of compounds based on cids and print their metadata (schema) cs_list = [] blocksize = 10
import pandas as pd
from pubchempy import Compound

# Load the assay data table and show the first rows.
df = pd.read_csv('AID_1706_datatable_all.csv')
print(df.head())

# Look up the isomeric SMILES of every row's PubChem CID (one request per row).
smiles_column = []
for cid in df['PUBCHEM_CID']:
    smiles_column.append(Compound.from_cid(cid).isomeric_smiles)
df['SMILES'] = smiles_column

# Overwrite the input file with the extended table and show the result.
df.to_csv('AID_1706_datatable_all.csv', index=False)
print(df.head())
def parse_f(f):
    """Print one tab-separated identifier row for the mol file ``f``.

    The CAS number is taken from the file name: the second ``/``-separated
    component of ``f`` (or ``f`` itself when it contains no ``/``), cut at the
    first ``.``.  Structure data comes from the mol file when it can be read
    and no override is configured; otherwise from the user-supplied
    ``syn_data`` entry (optionally via PubChem).  Names are merged from
    PubChem, ``pdf_data`` and ``syn_data``.

    Relies on names defined outside this function: ``ignored_CASs``,
    ``syn_data``, ``pdf_data``, ``mycache``, ``dup_names``, ``all_names``,
    ``all_user_names`` and ``serialize_formula``.

    Returns
    -------
    None
        Always; the row is written with ``print``.  Nothing is printed for an
        ignored CAS or when no data source is available.
    """
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    # Stage 1: try to obtain a molecule from the mol file.  Every failure --
    # including the deliberate ``raise``/``1 / 0`` statements below -- lands
    # in the outer ``except``, which switches to the user-supplied data.
    try:
        if CAS in syn_data:
            d = syn_data[CAS]
            # A configured PubChem id or formula takes precedence over the
            # mol file, so bail out to the fallback branch on purpose.
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        try:
            mol = Chem.MolFromMolFile(f)
            assert mol is not None
        except:
            print('Cannot read %s' % f)
            1 / 0  # force a ZeroDivisionError to reach the outer except
        try:
            inchi_val = inchi.MolToInchi(mol)
        except:
            print('BAILING ON %s' % f)
            1 / 0  # force a ZeroDivisionError to reach the outer except
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            1 / 0  # force a ZeroDivisionError to reach the outer except
    except:
        # Fallback: the mol file is unusable or overridden.
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                # Cached PubChem records are keyed by the CID as a string.
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula
                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                # Purely user-supplied record; 'MW' is required, every other
                # field defaults to an empty value.
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None
    # Stage 2: when the mol file was usable, derive all identifiers from it.
    if not failed_mol:
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        # for i in mol.GetAtoms():
        #     if i.GetIsotope():
        #         mw = Descriptors.ExactMolWt(mol)
        #         break
        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    # Stage 3: for mol-file molecules, look up CID, IUPAC name and synonyms on
    # PubChem by InChIKey.  Results -- including misses -- are cached under
    # the InChIKey.
    try:
        if not failed_mol:
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except:
                    # Remember the miss; the current cid/iupac_name/names keep
                    # the values assigned earlier in this function.
                    mycache[str(inchikey)] = (-1, '', [''])
    except:
        cid = -1
        iupac_name = ''
        names = ['']
    # Stage 4: merge in names and alternate CAS numbers from pdf_data.
    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        if 'Other Names' in d:
            syns = d['Other Names']
        else:
            syns = []
        # The pdf name becomes the IUPAC name when none is known yet;
        # otherwise it is placed first among the synonyms.
        if not iupac_name:
            iupac_name = name
        else:
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])
        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    # Stage 5: apply the user's synonym assignments.
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to
            # this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            else:
                # Discard it otherwise
                pass
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any syns for this cas number, add those names if the
        # name hasn't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)
    # Drop empty names.
    actual_names = [i for i in actual_names if i]
    # Strip the standard InChI prefix from the value that is printed.
    if inchi_val is not None:
        inchi_val = inchi_val.replace('InChI=1S/', '')
    formula = serialize_formula(formula)
    # Emit the row: eight fixed columns followed by every remaining name.
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)
    s += '\t'.join(actual_names)
    print(s)
    return None
"pd_idx": pd_idx, "pd_weight": pd_weight } directory = os.path.abspath(os.getcwd()) + "/output/pkl" if not os.path.exists(directory): os.makedirs(directory) with open(directory + "/{}.pkl".format(fig_name), "wb") as f: pickle.dump(out, f) ######################################### from pubchempy import Compound import pandas as pd cids = [ int(data.drug_idx_to_id[i][3:]) for i in range(len(data.drug_idx_to_id)) ] drugs = [Compound.from_cid(int(cid)) for cid in cids] drug_ids = pd.DataFrame([[ data.drug_id_to_idx['CID{}'.format(d.cid)], d.cid, 'NA' if len(d.synonyms) == 0 else d.synonyms[0], d.iupac_name ] for d in drugs], columns=["drug_idx", 'CID', 'synonym', 'iupac_name']) drug_ids.to_csv('./index-map/drug-map.csv', index=False) from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum geneid2symbol = {v.GeneID: v.Symbol for k, v in GeneID2nt_hum.items()} genes = [ int(data.prot_idx_to_id[i][6:]) for i in range(len(data.prot_idx_to_id)) ] gene_ids = pd.DataFrame([[ data.prot_id_to_idx['GeneID{}'.format(gene)], gene,
def _page_intro():
    """Render the landing page: description text and overview image."""
    st.subheader("Intro")
    st.write(
        """ This is a bioinformatics web app made with Python and Streamlit. Use the left panel dropdown to choose the various features to use."""
    )
    image = Image.open("overviewpicture.png")
    st.image(image, use_column_width=True)


def _page_sequence_analysis():
    """Render the DNA analysis page for one uploaded FASTA record."""
    st.subheader("DNA Sequence Analysis")
    seq_file = st.file_uploader("Upload FASTA File", type=["fasta", "fa"])
    if seq_file is None:
        return

    dna_record = SeqIO.read(seq_file, "fasta")
    # st.write(dna_record)
    dna_seq = dna_record.seq
    details = st.radio("Details", ("Description", "Sequence"))
    if details == "Description":
        st.write(dna_record.description)
    elif details == "Sequence":
        st.write(dna_record.seq)

    # Nucleotide Frequencies
    st.subheader("Nucleotide Frequency")
    dna_freq = Counter(dna_seq)
    st.write(dna_freq)
    # Keyed by nucleotide letter; the pickers render in this order.
    base_colors = {
        "A": st.beta_color_picker("Adenine Color"),
        "T": st.beta_color_picker("thymine Color"),
        "G": st.beta_color_picker("Guanine Color"),
        "C": st.beta_color_picker("cytosil Color"),
    }
    if st.button("Plot Freq"):
        bases = list(dna_freq.keys())
        barlist = plt.bar(bases, list(dna_freq.values()))
        # Colour each bar by the nucleotide it represents.  Bars follow the
        # order in which symbols first occur in the sequence, so fixed
        # positional indices picked the wrong bars and raised IndexError for
        # sequences with fewer than four distinct symbols.
        for base, bar in zip(bases, barlist):
            color = base_colors.get(str(base).upper())
            if color is not None:
                bar.set_color(color)
        st.pyplot()

    st.subheader("DNA Composition")
    gc_score = utils.gc_content(str(dna_seq))
    at_score = utils.at_content(str(dna_seq))
    st.json({"GC Content": gc_score, "AT Content": at_score})

    # Nucleotide Count
    nt_count = st.text_input("Enter Nucleotide Here",
                             "Type Nucleotide Alphabet")
    st.write("Number of {} Nucleotide is ::{}".format(
        (nt_count), str(dna_seq).count(nt_count)))

    # Protein Synthesis
    st.subheader("Protein Synthesis")
    p1 = dna_seq.translate()
    aa_freq = Counter(str(p1))

    # The elif chain is intentional UI behaviour: a checkbox is only rendered
    # while every checkbox before it is unchecked.
    if st.checkbox("Transcription"):
        st.write(dna_seq.transcribe())
    elif st.checkbox("Translation"):
        st.write(dna_seq.translate())
    elif st.checkbox("Complement"):
        st.write(dna_seq.complement())
    elif st.checkbox("AA Frequency"):
        st.write(aa_freq)
    elif st.checkbox("Plot AA Frequency"):
        aa_color = st.beta_color_picker("Pick An Amino Acid Color")
        # barlist = plt.bar(aa_freq.keys(),aa_freq.values(),color=aa_color)
        # barlist[2].set_color(aa_color)
        plt.bar(aa_freq.keys(), aa_freq.values(), color=aa_color)
        st.pyplot()
    elif st.checkbox("Full Amino Acid Name"):
        aa_name = str(p1).replace("*", "")
        aa3 = utils.convert_1to3(aa_name)
        st.write(aa_name)
        st.write("=====================")
        st.write(aa3)
        st.write("=====================")
        st.write(utils.get_acid_name(aa3))


def _page_protein_search():
    """Render the paper search page backed by the EBI Proteins API."""
    st.subheader("Search for Papers Related to a Protein")
    st.write(""" Try entering ACE2 and coronavirus!""")
    ace2 = st.text_input("Query Protein")
    disease = st.text_input(
        "Query Specifier (more specific thing to narrow down papers with)")
    if not (ace2 and disease is not None):
        return

    # The organism filter previously read "h**o%20sapiens".
    protein = req.get(
        'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=10&gene=' +
        ace2 + '&organism=homo%20sapiens',
        headers={'Accept': "application/json"})
    entries = protein.json()  # parse the response once, not per reference
    if not isinstance(entries, list) or not entries:
        return

    # Initialised outside the loop so the cap below can actually trigger;
    # it used to be reset to 1 on every iteration.
    counter = 1
    for reference in entries[0].get('references', []):
        citation = reference.get('citation') or {}
        title = citation.get('title')
        if not isinstance(title, str):
            continue  # reference without a usable title
        if counter == 10:
            break
        if title.find(disease) != -1:
            st.write(title)
            counter += 1


def _page_dot_plot():
    """Render the dot plot page for two uploaded FASTA records."""
    st.subheader("Generate Dot Plot For Two Sequences")
    seq_file1 = st.file_uploader("Upload 1st FASTA File",
                                 type=["fasta", "fa"])
    seq_file2 = st.file_uploader("Upload 2nd FASTA File",
                                 type=["fasta", "fa"])
    if not seq_file1 or seq_file2 is None:
        return

    dna_record1 = SeqIO.read(seq_file1, "fasta")
    dna_record2 = SeqIO.read(seq_file2, "fasta")
    # st.write(dna_record)
    dna_seq1 = dna_record1.seq
    dna_seq2 = dna_record2.seq
    details = st.radio("Details", ("Description", "Sequence"))
    if details == "Description":
        st.write(dna_record1.description)
        st.write("=====================")
        st.write(dna_record2.description)
    elif details == "Sequence":
        st.write(dna_record1.seq)
        st.write("=====================")
        st.write(dna_record2.seq)
    cus_limit = st.number_input("Select Max number of Nucleotide", 10, 200,
                                50)
    if st.button("Dot Plot"):
        st.write(
            "Comparing the first {} Nucleotide of the Two Sequences".format(
                cus_limit))
        dotplotx(dna_seq1[0:cus_limit], dna_seq2[0:cus_limit])
        st.pyplot()


def _page_molecule_visualizer():
    """Render the 3D molecule viewer component."""
    st.subheader(
        "Look at a molecule! Pre-loaded example is the Covid-19 Spike Protein. Thank you to: https://github.com/napoles-uach/streamlit_3dmol"
    )
    component_3dmol()


def _page_chemical_search():
    """Render the PubChem name search page."""
    st.title(
        "Search for chemicals and get info. Pre-loaded example: imatinib")
    user_compound = st.text_input("Enter compound name", 'imatinib')
    if not user_compound:
        # An empty box is not a valid PubChem query.
        return
    # get_compounds() already returns Compound objects, so the properties are
    # read from them directly instead of re-fetching each CID.  str() keeps a
    # missing (None) property from raising TypeError on concatenation.
    for compound in pcp.get_compounds(user_compound, 'name'):
        st.write('Compound ID: ' + str(compound.cid))
        st.write('SMILES: ' + str(compound.isomeric_smiles))
        st.write('Molecular Formula: ' + str(compound.molecular_formula))
        st.write('Molecular Weight: ' + str(compound.molecular_weight))
        st.write('IUPAC Name: ' + str(compound.iupac_name))
        st.write('xlogp value: ' + str(compound.xlogp))


def main():
    """Entry point of the Streamlit app: draw the sidebar and selected page."""
    st.title("Bioinformatics App")
    st.set_option('deprecation.showfileUploaderEncoding', False)
    activity = [
        'Intro', 'SequenceAnalysis', 'DotPlot', 'ProteinSearch',
        "MoleculeVisualizer", "ChemicalSearch"
    ]
    choice = st.sidebar.selectbox("Select Activity", activity)

    pages = {
        'Intro': _page_intro,
        'SequenceAnalysis': _page_sequence_analysis,
        'ProteinSearch': _page_protein_search,
        'DotPlot': _page_dot_plot,
        'MoleculeVisualizer': _page_molecule_visualizer,
        'ChemicalSearch': _page_chemical_search,
    }
    render = pages.get(choice)
    if render is not None:
        render()
def find_pubchem_from_ids(pubchem=None, CASRN=None, inchi=None, inchikey=None,
                          smiles=None, use_cache=True):
    '''Cached query of pubchem database, based on one of many identifiers.

    The identifiers are tried in this order of preference: `pubchem`,
    `inchikey`, `inchi`, `smiles`, `CASRN` (searched as a name); only the
    first one supplied is used for the query. The result is written to a JSON
    file in the cache directory after every lookup, including failed ones.

    Parameters
    ----------
    pubchem : int, optional
        PubChem ID; prefered lookup, [-]
    CASRN : str, optional
        CAS number, [-]
    inchi : str, optional
        InChI identification string as given in Common Chemistry (there can be
        multiple valid InChI strings for a compound), [-]
    inchikey : str, optional
        InChI key identification string (meant to be unique to a compound), [-]
    smiles : str, optional
        SMILES identification string, [-]
    use_cache : bool, optional
        Whether or not to read an existing cache entry, [-]

    Returns
    -------
    cid : int
        PubChem ID, [-]
    iupac_name : str
        IUPAC name as given in pubchem, [-]
    MW : float
        Molecular weight, [g/mol]
    InChI : str
        InChI identification string as given in Common Chemistry (there can be
        multiple valid InChI strings for a compound), [-]
    InChI_key : str
        InChI key identification string (meant to be unique to a compound), [-]
    smiles : str
        SMILES identification string, [-]
    formula : str
        Formula, [-]
    synonyms : list[str]
        List of synonyms of the compound, [-]

    Raises
    ------
    ValueError
        If no identifier at all is provided.

    Notes
    -----
    The eight values are returned as a list, both for cached and for fresh
    lookups; all eight are None when nothing was found.

    TODO: try to use rdkit here to check the correct inchikey is found.

    Examples
    --------
    >>> find_pubchem_from_ids(pubchem=962)[0]
    962
    >>> find_pubchem_from_ids(pubchem=962)[1]
    'oxidane'
    >>> find_pubchem_from_ids(pubchem=962)[2]
    18.015
    >>> find_pubchem_from_ids(pubchem=962)[3]
    'InChI=1S/H2O/h1H2'
    >>> find_pubchem_from_ids(pubchem=962)[4]
    'XLYOFNOQVPJJNP-UHFFFAOYSA-N'
    >>> find_pubchem_from_ids(pubchem=962)[5]
    'O'
    >>> find_pubchem_from_ids(pubchem=962)[6]
    'H2O'
    >>> len(find_pubchem_from_ids(pubchem=962)[7]) > 100
    True

    >>> find_pubchem_from_ids(CASRN="53850-36-5")[0]
    56951715

    >>> find_pubchem_from_ids(CASRN="54084-70-7") # Nihonium is missing
    [None, None, None, None, None, None, None, None]

    >>> find_pubchem_from_ids(inchi='InChI=1S/Cl', inchikey="ZAMOUSCENKQFHK-UHFFFAOYSA-N")[0]
    5360523
    >>> find_pubchem_from_ids(inchi='InChI=1S/H2O/h1H2', inchikey="XLYOFNOQVPJJNP-UHFFFAOYSA-N")[0]
    962
    >>> find_pubchem_from_ids(inchi='InChI=1S/I2/c1-2')[0:5]
    [807, 'molecular iodine', 253.8089, 'InChI=1S/I2/c1-2', 'PNDPGZBMCMUPRI-UHFFFAOYSA-N']
    >>> find_pubchem_from_ids(inchi='InChI=1S/H2/h1H')[0:5]
    [783, 'molecular hydrogen', 2.016, 'InChI=1S/H2/h1H', 'UFHFLCQGNIYNRP-UHFFFAOYSA-N']
    '''
    key = (pubchem, CASRN, inchi, inchikey, smiles)
    hash_key = deterministic_hash(str(key))
    key_file = os.path.join(pubchem_cache_dir, hash_key)
    if use_cache and os.path.exists(key_file):
        with open(key_file, 'r') as f:
            return json.load(f)

    compound = None
    if pubchem is not None:
        compound = Compound.from_cid(pubchem)
    else:
        if inchikey is not None:
            # Dup for chlorine atomic here
            # find_pubchem_from_ids(inchikey='ZAMOUSCENKQFHK-UHFFFAOYSA-N')[0]
            compounds = get_compounds(inchikey, 'inchikey')
        elif inchi is not None:
            # chlorine search "InChI=1S/Cl" finds HCl
            compounds = get_compounds(inchi, 'inchi')
        elif smiles is not None:
            compounds = get_compounds(smiles, 'smiles')
        elif CASRN is not None:
            compounds = get_compounds(CASRN, 'name')
        else:
            # Previously fell through and failed with UnboundLocalError.
            raise ValueError('At least one of pubchem, CASRN, inchi, inchikey '
                             'or smiles must be provided')
        # maybe sort by ID in the future
        if compounds:
            compound = compounds[0]

    if compound is None or compound.cid is None:
        ans = [None]*8
    else:
        ans = [
            compound.cid,
            compound.iupac_name,
            float(compound.molecular_weight),
            compound.inchi,
            compound.inchikey,
            compound.canonical_smiles,
            compound.molecular_formula,
            compound.synonyms,
        ]

    with open(key_file, 'w') as f:
        json.dump(ans, f)
    return ans