Ejemplo n.º 1
0
 def save_model(self, request, obj, form, change):
     """Set ``obj.pubchem_name`` from the submitted PubChem ID, then save.

     When the form's ``pubchem_id`` is truthy the compound is fetched with
     ``Compound.from_cid`` and its first synonym becomes the stored name.
     The name is cleared (``None``) when no ID was submitted or when the
     compound has no synonyms; indexing ``synonyms[0]`` unconditionally
     would raise ``IndexError`` in that case and abort the save.
     """
     pubchem_id = form.cleaned_data['pubchem_id']
     pubchem_name = None
     if pubchem_id:
         synonyms = Compound.from_cid(pubchem_id).synonyms
         if synonyms:
             pubchem_name = synonyms[0]
     obj.pubchem_name = pubchem_name
     obj.save()
Ejemplo n.º 2
0
def do_search_api():
    """Run a molecular search for the current request and return the hits.

    Request arguments (parsed with ``reqparse``):
        Molecular: the query molecule string.
        Type: ``similarity`` (default), ``substructure`` or
            ``superstructure``; selects the table that is searched.
        Cid: optional PubChem CID. When given, the query molecule is
            replaced by that compound's isomeric SMILES.

    Returns:
        ``(jsonify({"Smiles": ...}), 200)`` on success; a plain message
        string (no explicit status code) when no molecule is available or
        the search itself fails; a message with status 400 when ``Type`` is
        not one of the supported values.
    """
    args = (reqparse.RequestParser()
            .add_argument("Molecular", type=str)
            .add_argument("Type", type=str, default='similarity')
            .add_argument("Cid", type=int)
            .parse_args())

    molecular_name = args['Molecular']
    search_type = args['Type']
    cid_num = args['Cid']

    tables_by_type = {
        'similarity': SIM_TABLE,
        'substructure': SUB_TABLE,
        'superstructure': SUPER_TABLE,
    }
    top_k = NUM

    if cid_num:
        from pubchempy import Compound
        molecular_name = Compound.from_cid(cid_num).isomeric_smiles

    if not molecular_name:
        return "no molecular"

    # Previously an unknown type left the table name unbound and the resulting
    # UnboundLocalError was hidden behind the generic "no results" message.
    if search_type not in tables_by_type:
        return "unsupported search type: {}".format(search_type), 400

    try:
        res_smi = do_search(tables_by_type[search_type], molecular_name, top_k)
    except Exception:
        return "There has no results, please input the correct molecular and ensure the table has data."
    return jsonify({"Smiles": res_smi}), 200
Ejemplo n.º 3
0
def get_smiles(idx, names=None, cids=None, sids=None, binding_db=None):
    """Return a SMILES string for entry ``idx``, trying several sources in order.

    The lookups are attempted in this order and the first one that succeeds
    wins:

    1. ``binding_db`` row whose ``'PubChem CID'`` equals ``cids[idx]``.
    2. ``binding_db`` row whose ``'PubChem SID'`` equals ``sids[idx]``.
    3. PubChem compound ``cids[idx]`` (isomeric SMILES, converted to ``str``).
    4. The source ID of PubChem substance ``sids[idx]`` (printed, then used
       as a ChEMBL ID): first via ``CompoundResource().get(...)["smiles"]``,
       then via the ``binding_db`` column ``"ChEMBL ID of Ligand"``.
    5. First PubChem compound found by name ``names[idx]``.

    Parameters
    ----------
    idx : int
        Position of the entry in ``names`` / ``cids`` / ``sids``.
    names, cids, sids : sequence, optional
        Parallel sequences of compound names, PubChem CIDs and PubChem SIDs.
    binding_db : pandas.DataFrame, optional
        Table indexed with the columns named above and ``"Ligand SMILES"``.

    Returns
    -------
    The SMILES value found, or ``np.nan`` when every lookup fails.

    Notes
    -----
    Failures are caught with ``except Exception`` rather than a bare
    ``except`` so that ``KeyboardInterrupt`` / ``SystemExit`` still stop a
    long-running batch instead of being treated as "lookup failed".
    """

    def _binding_db_smiles(column, value):
        # First "Ligand SMILES" among rows where `column` == `value`; raises
        # when binding_db is None or no row matches.
        return binding_db.loc[binding_db[column] ==
                              value]["Ligand SMILES"].values[0]

    def _local_by_cid():
        return _binding_db_smiles('PubChem CID', int(cids[idx]))

    def _local_by_sid():
        return _binding_db_smiles('PubChem SID', int(sids[idx]))

    def _pubchem_by_cid():
        return str(Compound.from_cid(int(cids[idx])).isomeric_smiles)

    def _chembl_by_sid():
        chembl_id = Substance.from_sid(sids[idx]).source_id
        print(chembl_id)
        try:
            return CompoundResource().get(chembl_id)["smiles"]
        except Exception:
            return _binding_db_smiles("ChEMBL ID of Ligand", chembl_id)

    def _pubchem_by_name():
        return get_compounds(names[idx], 'name')[0].isomeric_smiles

    lookups = (_local_by_cid, _local_by_sid, _pubchem_by_cid, _chembl_by_sid,
               _pubchem_by_name)
    for lookup in lookups:
        try:
            return lookup()
        except Exception:
            continue
    return np.nan
Ejemplo n.º 4
0
def from_id(cid):
    """Return ``pubchempy.Compound.from_cid(cid)``.

    A ``float`` CID is converted to ``int`` before the call; any other
    value is passed through untouched.
    """
    normalized_cid = int(cid) if isinstance(cid, float) else cid
    return Compound.from_cid(normalized_cid)
Ejemplo n.º 5
0
def scrape_pubchem_for_inchi():
    """Fill in empty InChI codes for molecules that carry a PubChem CID.

    Molecules with a non-null ``pubchem_id`` are visited in ``pubchem_id``
    order. For each one whose ``inchi_code`` is the empty string, the
    compound is fetched from PubChem, its InChI is stored and the molecule
    is saved. A ``BadRequestError`` or ``NotFoundError`` is logged as an
    invalid CID and the loop moves on.
    """
    from pubchempy import BadRequestError, Compound, NotFoundError

    candidates = Molecule.objects.filter(pubchem_id__isnull=False)
    for molecule in candidates.order_by('pubchem_id'):
        if molecule.inchi_code != "":
            # Already populated; nothing to fetch.
            continue
        try:
            compound = Compound.from_cid(molecule.pubchem_id)
            molecule.inchi_code = compound.inchi
            molecule.save()
        except (BadRequestError, NotFoundError):
            logging.error('Invalid PubChem CID: {}'.format(
                molecule.pubchem_id))
Ejemplo n.º 6
0
    def __call__(self, representation):
        """Create `pubchempy.Compound` object from passed CID representation.

        The crawl delay is applied after every request, including requests
        that raise, so a caller that catches the error and retries is still
        rate-limited.

        Parameters
        ----------
        representation : int
            CID integer as per `pubchempy` documentation for `from_cid`.

        Returns
        -------
        compound : pubchempy.Compound
            Pubchempy representation of molecule, accessed from the PubChem database.

        Raises
        ------
        Exception
            Whatever `Compound.from_cid` raises is propagated once the crawl
            delay has elapsed.
        """
        try:
            return Compound.from_cid(representation)  # calls PubChem API
        finally:
            # Sleep even on failure: the request still hit the API.
            time.sleep(self.crawl_delay)  # ensure good scraping practice
Ejemplo n.º 7
0
def decagon_preprocess(cfg, cid_dict):
    """Convert a drug-combination CSV of STITCH ids into SMILES pairs.

    Reads ``cfg['DATASET']['COMBO_PATH']`` (header skipped; each row is
    ``stitch1, stitch2, side_effect, side_effect_name``) and writes
    ``cfg['DATASET']['PROCESSED_PATH']`` with one row
    ``smiles1, smiles2, side_effect`` for every input row whose side effect
    is among the labels returned by ``filter_interactions``.

    Parameters
    ----------
    cfg : mapping
        Configuration; the keys ``COMBO_PATH``, ``PROCESSED_PATH`` and
        ``DATA_NAME`` under ``'DATASET'`` are read here.
    cid_dict : mapping
        Known STITCH id -> SMILES lookups. Ids missing from it are fetched
        from PubChem (CID taken from ``stitch[4:]``) and remembered for the
        rest of the run.
    """
    in_file = cfg['DATASET']['COMBO_PATH']
    out_file = cfg['DATASET']['PROCESSED_PATH']

    # Labels that have more interactions than NUM_MIN_INTERACTION. A set makes
    # the per-row membership test O(1).
    label_list, _ = filter_interactions(in_file, cfg)
    labels = set(label_list)
    unknown_dict = {}  # SMILES fetched from PubChem during this run

    # Count data rows for the progress bar. The handle is closed right away,
    # and the header line is excluded so the bar can reach 100%.
    with open(in_file, 'r') as line_counter:
        total = max(sum(1 for _ in line_counter) - 1, 0)

    progress_prefix = '| Processing {} pairs...'.format(
        cfg['DATASET']['DATA_NAME'])

    with open(in_file, 'r') as csv_in, open(out_file, 'w') as csv_out:
        csv_reader = csv.reader(csv_in, delimiter=',')
        next(csv_reader, None)  # ignore Header
        csv_writer = csv.writer(csv_out)

        for line_idx, line in enumerate(csv_reader):
            printProgress(line_idx + 1, total, progress_prefix, ' ', 1, 50)
            stitch1, stitch2, side_effect, _ = line  # _ : string name for side effects
            if side_effect not in labels:
                continue

            row_string = []
            for stitch in (stitch1, stitch2):
                try:
                    smiles = cid_dict[stitch]
                except KeyError:
                    if stitch in unknown_dict:
                        # already retrieved from pubchempy
                        smiles = unknown_dict[stitch]
                    else:
                        if stitch1[3] == '1' or stitch2[3] == '1':
                            print("Pair has stereo : [%s,%s]" %
                                  (stitch1, stitch2))  # ?
                        print(">>> Adding {} to dictionary...".format(stitch))
                        # Obtain smiles from pubchempy and keep it for
                        # future reference.
                        smiles = Compound.from_cid(
                            int(stitch[4:])).canonical_smiles
                        unknown_dict[stitch] = smiles
                row_string.append(smiles)
            row_string.append(side_effect)
            csv_writer.writerow(row_string)
Ejemplo n.º 8
0
def main(argv):
    """Main entry point of program.

    Fetches every CID given on the command line from PubChem and prints its
    full record followed by synonyms, IUPAC name, CID, InChI, InChIKey,
    canonical SMILES, exact mass, molecular weight, molecular formula and
    charge.

    Previously the whole list of CIDs was handed to a single
    ``Compound.from_cid`` call (together with ``as_dataframe``, which is not
    a ``from_cid`` option), so at most one compound was reported. Each CID
    is now fetched on its own. Output uses single-argument ``print(...)``
    calls, which behave the same on Python 2 and Python 3.

    NOTE(review): ``argv`` is accepted but, as before, not forwarded to
    ``parse_args``; arguments are read from ``sys.argv``. Confirm what
    callers pass before changing that. The ``-s`` option is parsed but not
    acted upon yet.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Retrieves compounds from PubChem using either a list of CIDs or'
                    ' identifier and type, e.g. Glucose name .\n',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('cid', metavar='cid', nargs='+', action='store',
                        help='CIDs to fetch.')
    parser.add_argument('-s', nargs=2, metavar=('identifier', 'type'),
                        action='store', dest='search',
                        help='Identifier and type to search for: e.g. -s Glucose name\n'
                             '  Types supported: cid, name, smiles, sdf, inchi, inchikey, formula.\n')

    args = parser.parse_args()

    for cid in args.cid:
        c = Compound.from_cid(cid)

        pprint(c.record)
        print("Synonyms " + ";".join(c.synonyms))

        print(c.iupac_name)
        print(c.cid)
        print(c.inchi)
        print(c.inchikey)
        print(c.canonical_smiles)
        print(c.exact_mass)
        print(c.molecular_weight)
        print(c.molecular_formula)
        print(c.charge)
Ejemplo n.º 9
0
 def save_model(self, request, obj, form, change):
     """Resolve ChEBI and PubChem names for ``obj`` from the form, then save.

     ChEBI: when ``chebi_id`` is submitted, the entity is looked up; if it
     reports a parent ID, the numeric part of that parent ID replaces the
     submitted ID and the parent entity is used instead. ``obj.chebi_id``
     and ``obj.chebi_name`` are then set. Without a ``chebi_id`` only
     ``obj.chebi_name`` is cleared.

     PubChem: when ``pubchem_id`` is submitted, the compound's first
     synonym becomes ``obj.pubchem_name``. The name is cleared (``None``)
     when no ID was submitted or the compound has no synonyms; indexing
     ``synonyms[0]`` unconditionally would raise ``IndexError`` and abort
     the save.
     """
     chebi_id = form.cleaned_data['chebi_id']
     if chebi_id:
         chebi_comb = ChebiEntity('CHEBI:' + str(chebi_id))
         parent_id = chebi_comb.get_parent_id()
         if parent_id:
             # Switch to the parent entry, identified by the first run of
             # digits in the reported parent ID.
             chebi_id = int(re.findall(r'\d+', parent_id)[0])
             chebi_comb = ChebiEntity('CHEBI:' + str(chebi_id))
         obj.chebi_id = chebi_id
         obj.chebi_name = chebi_comb.get_name()
     else:
         obj.chebi_name = None

     pubchem_id = form.cleaned_data['pubchem_id']
     pubchem_name = None
     if pubchem_id:
         synonyms = Compound.from_cid(pubchem_id).synonyms
         if synonyms:
             pubchem_name = synonyms[0]
     obj.pubchem_name = pubchem_name
     obj.save()
from numpy import random
from pubchempy import get_compounds, Compound
# comp = Compound.from_cid(6602565)
# print(comp.isomeric_smiles)
# comps = get_compounds('Aspirin', 'name')
# print(comps[0].xlogp)
# # 1.2

import csv
# Collect the third column of every row whose fourth column is "Active";
# these values are used as PubChem CIDs below.
with open('inhibitor_all.csv', 'r') as inhibitor_all:
    reader = csv.reader(inhibitor_all, delimiter=',', quotechar='"')
    # next(reader, 5)  # skip the headers
    Actives = [compound[2] for compound in reader if compound[3] == "Active"]

# Write one "1, <cid>, <isomeric SMILES>" row per active compound. The writer
# is created once instead of on every iteration.
with open('inhibitor_active.csv', 'w') as inhibitor_active:
    writer = csv.writer(inhibitor_active, delimiter=',')
    # writer.writerow(["Active", "mol_id", "smiles"])  # write header
    for active in Actives:
        comp = Compound.from_cid(active)
        smiles = comp.isomeric_smiles
        print(smiles)
        writer.writerow([1, active, smiles])

# The original printed len(Active), an undefined name (NameError).
print(len(Actives))

# with open('inhibitor_all.csv', 'r') as inhibitor_all:
# 	reader = csv.reader(inhibitor_all, delimiter=',', quotechar='"')
# 	# next(reader, 5)  # skip the headers
# 	Inactive = [compound[2] for compound in reader if compound[3]=="Inactive"]
# 	for x in xrange(12):
# 		Inactive = random.choice(Inactive, 3880)
# 	print(len(Inactive))
#! /home/caleb/.anaconda3/bin/python
import sys

from pubchempy import get_compounds, Compound

# Every command-line argument after the script name is a PubChem CID. For
# each one, print a tab-separated record:
# cid, CAS (always empty), formula, molecular weight, SMILES, InChI,
# InChIKey, IUPAC name, followed by all synonyms.
arg = sys.argv[1:]
for raw_cid in arg:
    cid = int(raw_cid)
    pc = Compound.from_cid(cid)

    iupac_name = pc.iupac_name
    if iupac_name is None:
        iupac_name = ''
    names = pc.synonyms
    smi = pc.isomeric_smiles
    inchi_val = pc.inchi
    inchikey = pc.inchikey
    mw = pc.molecular_weight
    formula = pc.molecular_formula
    CAS = ''

    fixed_columns = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (
        cid, CAS, formula, mw, smi, inchi_val, inchikey, iupac_name)
    s = fixed_columns + '\t'.join(names)
    print(s)

#4	78-96-6	C3H9NO	75.10966	CC(CN)O	C3H9NO/c1-3(5)2-4/h3,5H,2,4H2,1H3	HXKKHQJGJAFBHI-UHFFFAOYSA-N	1-azanylpropan-2-ol	1-amino-2-propanol
Ejemplo n.º 12
0
import pickle
from rdkit.Chem import MolFromSmiles
from pubchempy import Compound

# ----------------------------------------------------------------------------------------------------------------------

# Reference feature sets used by the tests, loaded from a pickle that ships
# with the test suite.
with open('tests/files/features.pkl', 'rb') as f:
    REF_FEATURES = pickle.load(f)

STD_FEATURES, MAP4_FEATURES, PUB_FEATURES = (
    REF_FEATURES[name] for name in ('standard', 'map4', 'pubchem'))

# ----------------------------------------------------------------------------------------------------------------------

# Molecule built from a fixed SMILES string.
SMILES = 'O=C1C(=Cc2ccccc2)CCCC1=Cc1ccccc1'
MOL = MolFromSmiles(SMILES)
assert MOL, 'Error creating the testing mol object.'

# ----------------------------------------------------------------------------------------------------------------------

# PubChem compound fetched by CID at import time.
cid = 5090
PUB_MOL = Compound.from_cid(cid)
assert isinstance(PUB_MOL, Compound), \
    'Error creating the testing pubchem Compound object.'

# ----------------------------------------------------------------------------------------------------------------------
Ejemplo n.º 13
0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# %%
import pubchempy as pcp
import pandas
from pubchempy import Compound, get_compounds
import logging
# Configure the root logger at DEBUG level for everything that follows.
logging.basicConfig(level=logging.DEBUG)

# Fetch a single compound by its CID, then search by name and by SMILES.
#c = pcp.Compound.from_cid(5090)
c = Compound.from_cid(1423)
cs1 = get_compounds('Aspirin', 'name')
cs2 = get_compounds('C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1', 'smiles')

# %%
# Repeat the name search requesting 3D records. This rebinds cs1, replacing
# the result of the earlier name search; the bare `cs1` expression displays
# the value when the cell is run in a notebook.
cs1 = pcp.get_compounds('Aspirin', 'name', record_type='3d')
cs1

# %%
# Dictionary view of the compound fetched by CID above.
c.to_dict()

# %%
# Fetch a list of compounds based on cids and print their metadata (schema).
# Only the setup is visible here: an empty result list and a block size of 10.
cs_list = []
blocksize = 10
Ejemplo n.º 14
0
import pandas as pd
from pubchempy import Compound

# Load the assay table, add a SMILES column looked up from PubChem one CID at
# a time, and write the result back over the same file.
df = pd.read_csv('AID_1706_datatable_all.csv')
print(df.head())

smiles_column = []
for cid in df['PUBCHEM_CID']:
    compound = Compound.from_cid(cid)
    smiles_column.append(compound.isomeric_smiles)
df['SMILES'] = smiles_column

df.to_csv('AID_1706_datatable_all.csv', index=False)
print(df.head())
Ejemplo n.º 15
0
def parse_f(f):
    """Print one tab-separated identifier record for the mol file ``f``.

    The CAS number is taken from the file name: the second ``/``-separated
    component when ``f`` contains a slash (otherwise ``f`` itself), cut at
    the first ``.``.  Identifiers are then gathered from one of two routes:

    * the mol file itself (read, converted to InChI and back), with CID,
      IUPAC name and synonyms looked up on PubChem by InChIKey; or
    * the user-supplied ``syn_data`` entry for the CAS number, used when the
      entry specifies ``'pubchem'`` or ``'formula'`` or when reading the mol
      file fails.  A ``'pubchem'`` entry is resolved with
      ``Compound.from_cid``.

    PubChem results are memoised in ``mycache``.  Names from ``pdf_data`` and
    ``syn_data`` are merged into the synonym list before printing.

    The printed columns are: cid, CAS, serialized formula, molecular weight,
    SMILES, InChI (without the ``InChI=1S/`` prefix), InChIKey, IUPAC name,
    then every remaining name.

    Parameters
    ----------
    f : str
        Path of the mol file.

    Returns
    -------
    None
        Always; the record is written to stdout.  Nothing is printed when
        the CAS number is in ``ignored_CASs`` or when the mol file cannot be
        used and ``syn_data`` has no entry for it.
    """
    # Defaults that survive when no PubChem lookup succeeds.
    names = ['']
    cid = -1
    CAS = f.split('/')[1] if '/' in f else f
    CAS = CAS.split('.')[0]
    if CAS in ignored_CASs:
        return None
    failed_mol = False
    try:
        # A user entry naming a PubChem id or a formula overrides the mol
        # file: raising here jumps straight to the fallback handler below.
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                raise Exception(
                    'Pubchem specified, not trying to use the mol file')
            elif 'formula' in d:
                raise Exception(
                    'Formula specified, not trying to use the mol file')
        # Each `1 / 0` below deliberately raises ZeroDivisionError so that
        # control reaches the outer bare `except` after the message is
        # printed.
        try:
            mol = Chem.MolFromMolFile(f)
            assert mol is not None
        except:
            print('Cannot read %s' % f)
            1 / 0
        try:
            inchi_val = inchi.MolToInchi(mol)
        except:
            print('BAILING ON %s' % f)
            1 / 0
        mol = inchi.MolFromInchi(inchi_val)  # Works better for ions
        if mol is None:
            print('BAILING ON reconversion to mol %s' % f)
            1 / 0
    except:
        # Fallback route: take everything from the user-supplied data.
        failed_mol = True
        if CAS in syn_data:
            d = syn_data[CAS]
            if 'pubchem' in d:
                # Cache entries keyed by a PubChem id hold the full 8-tuple.
                if str(d['pubchem']) in mycache:
                    cid, iupac_name, names, mw, smi, inchi_val, inchikey, formula = mycache[
                        str(d['pubchem'])]
                else:
                    pc = Compound.from_cid(d['pubchem'])
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mw = pc.molecular_weight
                    smi = pc.canonical_smiles
                    inchi_val = pc.inchi
                    inchikey = pc.inchikey
                    formula = pc.molecular_formula

                    mycache[str(d['pubchem'])] = (cid, iupac_name, names, mw,
                                                  smi, inchi_val, inchikey,
                                                  formula)
            else:
                # No PubChem id: use whatever fields the entry provides.
                # NOTE(review): 'MW' is read unconditionally, so an entry
                # without it raises KeyError out of this function.
                cid = -1
                names = d['synonyms'] if 'synonyms' in d else ['']
                mw = float(d['MW'])
                smi = d['smiles'] if 'smiles' in d else ''
                formula = d['formula'] if 'formula' in d else ''
                inchi_val = d['inchi'] if 'inchi' in d else ''
                inchikey = d['inchikey'] if 'inchikey' in d else ''
                iupac_name = ''
        else:
            print('FAILED on %s and no custom data was available either' % CAS)
            return None

    if not failed_mol:
        # Mol-file route: derive every structural identifier from `mol`.
        smi = Chem.MolToSmiles(mol, True)
        inchi_val = inchi.MolToInchi(mol)
        inchikey = inchi.InchiToInchiKey(inchi_val)
        mw = Descriptors.MolWt(mol)
        #        for i in mol.GetAtoms():
        #            if i.GetIsotope():
        #                mw = Descriptors.ExactMolWt(mol)
        #                break

        formula = CalcMolFormula(mol, True, True)
        iupac_name = ''
    try:
        if not failed_mol:
            # Cache entries keyed by an InChIKey hold a 3-tuple only.
            if str(inchikey) in mycache:
                cid, iupac_name, names = mycache[str(inchikey)]
            else:
                try:
                    pc = get_compounds(inchikey, 'inchikey')[0]
                    cid = pc.cid
                    iupac_name = pc.iupac_name
                    names = pc.synonyms
                    mycache[str(inchikey)] = (cid, iupac_name, names)
                except:
                    # Remember the miss; cid / iupac_name / names keep the
                    # values assigned before the lookup was attempted.
                    mycache[str(inchikey)] = (-1, '', [''])
    except:
        cid = -1
        iupac_name = ''
        names = ['']

    # Merge names and alternate CAS numbers from pdf_data, if present.
    other_CAS = []
    if CAS in pdf_data:
        d = pdf_data[CAS]
        name = d['Name']
        if 'Other Names' in d:
            syns = d['Other Names']
        else:
            syns = []
        if not iupac_name:
            iupac_name = name
        else:
            # NOTE(review): when 'Other Names' exists, `syns` is the list
            # stored in pdf_data, so this insert modifies pdf_data in place.
            syns.insert(0, name)
        if 'Deleted CAS' in d:
            other_CAS.extend(d['Deleted CAS'])
        if 'Alternate CAS' in d:
            other_CAS.extend(d['Alternate CAS'])

        syns = [i for i in syns if i not in dup_names]
        names = syns + [i for i in names if i not in all_names] + other_CAS
    actual_names = []
    for name in names:
        if name in all_user_names:
            # If the name is in the user db, only add it if it corresponds to this CAS number
            if CAS in syn_data and 'synonyms' in syn_data[
                    CAS] and name in syn_data[CAS]['synonyms']:
                actual_names.append(name)
            else:
                # Discard it otherwise
                pass
        else:
            # If the name is not in the user db we're all good
            actual_names.append(name)
    if CAS in syn_data and 'synonyms' in syn_data[CAS]:
        # If the user has any synonyms for this CAS number, add those names if the name hasn't already been added
        for n in syn_data[CAS]['synonyms']:
            if n not in actual_names:
                actual_names.append(n)

    # Drop empty names (e.g. the [''] placeholder).
    actual_names = [i for i in actual_names if i]

    if inchi_val is not None:
        inchi_val = inchi_val.replace('InChI=1S/', '')

    formula = serialize_formula(formula)
    s = '%d\t%s\t%s\t%g\t%s\t%s\t%s\t%s\t' % (cid, CAS, formula, mw, smi,
                                              inchi_val, inchikey, iupac_name)

    s += '\t'.join(actual_names)
    print(s)
    return None
Ejemplo n.º 16
0
    "pd_idx": pd_idx,
    "pd_weight": pd_weight
}
directory = os.path.abspath(os.getcwd()) + "/output/pkl"
if not os.path.exists(directory):
    os.makedirs(directory)
with open(directory + "/{}.pkl".format(fig_name), "wb") as f:
    pickle.dump(out, f)

#########################################
from pubchempy import Compound
import pandas as pd
cids = [
    int(data.drug_idx_to_id[i][3:]) for i in range(len(data.drug_idx_to_id))
]
drugs = [Compound.from_cid(int(cid)) for cid in cids]
drug_ids = pd.DataFrame([[
    data.drug_id_to_idx['CID{}'.format(d.cid)], d.cid,
    'NA' if len(d.synonyms) == 0 else d.synonyms[0], d.iupac_name
] for d in drugs],
                        columns=["drug_idx", 'CID', 'synonym', 'iupac_name'])

drug_ids.to_csv('./index-map/drug-map.csv', index=False)

from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
geneid2symbol = {v.GeneID: v.Symbol for k, v in GeneID2nt_hum.items()}
genes = [
    int(data.prot_idx_to_id[i][6:]) for i in range(len(data.prot_idx_to_id))
]
gene_ids = pd.DataFrame([[
    data.prot_id_to_idx['GeneID{}'.format(gene)], gene,
Ejemplo n.º 17
0
def main():
    """Render the Streamlit bioinformatics app.

    The sidebar select box chooses one activity per rerun:

    * ``Intro`` - static description and overview picture.
    * ``SequenceAnalysis`` - upload a FASTA file; show description or
      sequence, nucleotide frequencies (optionally plotted with one colour
      per nucleotide), GC/AT content, a nucleotide count and several
      protein-synthesis views.
    * ``DotPlot`` - upload two FASTA files and dot-plot their first N
      nucleotides.
    * ``ProteinSearch`` - query the EBI Proteins API for a gene and list up
      to 10 reference titles containing the specifier text.
    * ``MoleculeVisualizer`` - embed the 3D molecule component.
    * ``ChemicalSearch`` - look a compound name up on PubChem and print its
      identifiers and basic properties.

    Fixes relative to the earlier version: the organism query parameter is
    spelled ``homo%20sapiens`` again; the gene name is URL-quoted; the title
    limit actually applies (the counter used to be reset on every
    iteration); ``a and b is not None`` conditions test what they were meant
    to; bar colours follow the nucleotide letter instead of a hard-coded bar
    index; PubChem results are used directly instead of being fetched a
    second time; ``None`` properties no longer break string concatenation.
    """
    from urllib.parse import quote  # only needed by the ProteinSearch branch

    st.title("Bioinformatics App")
    st.set_option('deprecation.showfileUploaderEncoding', False)

    activity = [
        'Intro', 'SequenceAnalysis', 'DotPlot', 'ProteinSearch',
        "MoleculeVisualizer", "ChemicalSearch"
    ]
    choice = st.sidebar.selectbox("Select Activity", activity)
    if choice == 'Intro':
        st.subheader("Intro")
        st.write(
            """ This is a bioinformatics web app made with Python and Streamlit. Use the left panel dropdown to choose the various features to use."""
        )
        image = Image.open("overviewpicture.png")
        st.image(image, use_column_width=True)

    elif choice == "SequenceAnalysis":
        st.subheader("DNA Sequence Analysis")

        seq_file = st.file_uploader("Upload FASTA File", type=["fasta", "fa"])

        if seq_file is not None:
            dna_record = SeqIO.read(seq_file, "fasta")
            dna_seq = dna_record.seq

            details = st.radio("Details", ("Description", "Sequence"))
            if details == "Description":
                st.write(dna_record.description)
            elif details == "Sequence":
                st.write(dna_record.seq)

            # Nucleotide Frequencies
            st.subheader("Nucleotide Frequency")
            dna_freq = Counter(dna_seq)
            st.write(dna_freq)
            adenine_color = st.beta_color_picker("Adenine Color")
            thymine_color = st.beta_color_picker("thymine Color")
            guanine_color = st.beta_color_picker("Guanine Color")
            cytosil_color = st.beta_color_picker("cytosil Color")

            if st.button("Plot Freq"):
                nucleotides = list(dna_freq.keys())
                barlist = plt.bar(nucleotides, list(dna_freq.values()))
                # Colour each bar by the nucleotide it represents. Bar order
                # follows first occurrence in the sequence, so fixed indices
                # gave wrong colours (or IndexError with < 4 symbols).
                picked_colors = {
                    "A": adenine_color,
                    "T": thymine_color,
                    "G": guanine_color,
                    "C": cytosil_color,
                }
                for bar, nucleotide in zip(barlist, nucleotides):
                    color = picked_colors.get(str(nucleotide).upper())
                    if color is not None:
                        bar.set_color(color)

                st.pyplot()

            st.subheader("DNA Composition")
            gc_score = utils.gc_content(str(dna_seq))
            at_score = utils.at_content(str(dna_seq))
            st.json({"GC Content": gc_score, "AT Content": at_score})

            # Nucleotide Count
            nt_count = st.text_input("Enter Nucleotide Here",
                                     "Type Nucleotide Alphabet")
            st.write("Number of {} Nucleotide is ::{}".format(
                (nt_count),
                str(dna_seq).count(nt_count)))

            # Protein Synthesis
            st.subheader("Protein Synthesis")
            p1 = dna_seq.translate()
            aa_freq = Counter(str(p1))

            # Only the first ticked box in this chain is rendered.
            if st.checkbox("Transcription"):
                st.write(dna_seq.transcribe())

            elif st.checkbox("Translation"):
                st.write(dna_seq.translate())

            elif st.checkbox("Complement"):
                st.write(dna_seq.complement())

            elif st.checkbox("AA Frequency"):
                st.write(aa_freq)

            elif st.checkbox("Plot AA Frequency"):
                aa_color = st.beta_color_picker("Pick An Amino Acid Color")
                plt.bar(aa_freq.keys(), aa_freq.values(), color=aa_color)
                st.pyplot()

            elif st.checkbox("Full Amino Acid Name"):
                aa_name = str(p1).replace("*", "")
                aa3 = utils.convert_1to3(aa_name)
                st.write(aa_name)
                st.write("=====================")
                st.write(aa3)

                st.write("=====================")
                st.write(utils.get_acid_name(aa3))

    elif choice == "ProteinSearch":
        st.subheader("Search for Papers Related to a Protein")
        st.write(""" Try entering ACE2 and coronavirus!""")

        ace2 = st.text_input("Query Protein")
        disease = st.text_input(
            "Query Specifier (more specific thing to narrow down papers with)")

        # As before, the search runs once a protein is entered; an empty
        # specifier matches every title.
        if ace2:
            protein = req.get(
                'https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=10&gene='
                + quote(ace2, safe='') + '&organism=homo%20sapiens',
                headers={'Accept': "application/json"})
            records = protein.json()  # parse the response body once
            if isinstance(records, list) and records:
                references = records[0].get('references', [])
            else:
                references = []

            max_titles = 10
            shown = 0
            for reference in references:
                if shown == max_titles:
                    break
                try:
                    title = reference['citation']['title']
                except (KeyError, TypeError):
                    # Reference without a citation title: skip it.
                    continue
                if isinstance(title, str) and (disease or '') in title:
                    st.write(title)
                    shown += 1

    elif choice == "DotPlot":
        st.subheader("Generate Dot Plot For Two Sequences")
        seq_file1 = st.file_uploader("Upload 1st FASTA File",
                                     type=["fasta", "fa"])
        seq_file2 = st.file_uploader("Upload 2nd FASTA File",
                                     type=["fasta", "fa"])

        if seq_file1 is not None and seq_file2 is not None:
            dna_record1 = SeqIO.read(seq_file1, "fasta")
            dna_record2 = SeqIO.read(seq_file2, "fasta")
            dna_seq1 = dna_record1.seq
            dna_seq2 = dna_record2.seq

            details = st.radio("Details", ("Description", "Sequence"))
            if details == "Description":
                st.write(dna_record1.description)
                st.write("=====================")
                st.write(dna_record2.description)
            elif details == "Sequence":
                st.write(dna_record1.seq)
                st.write("=====================")
                st.write(dna_record2.seq)

            cus_limit = st.number_input("Select Max number of Nucleotide", 10,
                                        200, 50)
            if st.button("Dot Plot"):
                st.write(
                    "Comparing the first {} Nucleotide of the Two Sequences".
                    format(cus_limit))
                dotplotx(dna_seq1[0:cus_limit], dna_seq2[0:cus_limit])

                st.pyplot()

    elif choice == "MoleculeVisualizer":
        st.subheader(
            "Look at a molecule! Pre-loaded example is the Covid-19 Spike Protein. Thank you to: https://github.com/napoles-uach/streamlit_3dmol"
        )

        component_3dmol()

    elif choice == "ChemicalSearch":
        st.title(
            "Search for chemicals and get info. Pre-loaded example: imatinib")
        user_compound = st.text_input("Enter compound name", 'imatinib')
        # Skip the PubChem query while the text box is empty.
        if user_compound:
            results = pcp.get_compounds(user_compound, 'name')
            for compound in results:
                # get_compounds already returns full Compound records, so no
                # second request per hit is needed. str() guards against
                # properties that PubChem leaves unset (None).
                st.write('Compound ID: ' + str(compound.cid))
                st.write('SMILES: ' + str(compound.isomeric_smiles))
                st.write('Molecular Formula: ' +
                         str(compound.molecular_formula))
                st.write('Molecular Weight: ' +
                         str(compound.molecular_weight))
                st.write('IUPAC Name: ' + str(compound.iupac_name))
                st.write('xlogp value: ' + str(compound.xlogp))
Ejemplo n.º 18
0
def find_pubchem_from_ids(pubchem=None, CASRN=None, inchi=None, inchikey=None,
                          smiles=None, use_cache=True):
    '''Cached query of pubchem database, based on one of many identifiers.

    Identifiers are tried in this order and only the first one given is
    used: `pubchem`, `inchikey`, `inchi`, `smiles`, `CASRN` (searched as a
    name). The result is written to the on-disk cache after every query,
    whether or not `use_cache` is set; `use_cache` only controls whether an
    existing cache entry is read.

    Parameters
    ----------
    pubchem : int, optional
        PubChem ID; prefered lookup, [-]
    CASRN : str, optional
        CAS number, [-]
    inchi : str, optional
        InChI identification string as given in Common Chemistry (there can be multiple
        valid InChI strings for a compound), [-]
    inchikey : str, optional
        InChI key identification string (meant to be unique to a compound), [-]        
    smiles : str, optional
        SMILES identification string, [-]
    use_cache : bool, optional
        Whether or not to use the cache, [-]
    
    Returns
    -------
    cid : int
        PubChem ID, [-]
    iupac_name : str
        IUPAC name as given in pubchem, [-]
    MW : float
        Molecular weight, [g/mol]
    InChI : str
        InChI identification string as given in Common Chemistry (there can be multiple
        valid InChI strings for a compound), [-]
    InChI_key : str
        InChI key identification string (meant to be unique to a compound), [-]        
    smiles : str
        SMILES identification string, [-]
    formula : str
        Formula, [-]
    synonyms : list[str]
        List of synonyms of the compound, [-]

    The eight values are returned together as one list, both for fresh
    queries and for cache hits; every entry is None when nothing was found.

    Raises
    ------
    ValueError
        If no identifier at all is provided.
        
    Examples
    --------
    
    >>> find_pubchem_from_ids(pubchem=962)[0]
    962
    >>> find_pubchem_from_ids(pubchem=962)[1]
    'oxidane'
    >>> find_pubchem_from_ids(pubchem=962)[2]
    18.015
    >>> find_pubchem_from_ids(pubchem=962)[3]
    'InChI=1S/H2O/h1H2'
    >>> find_pubchem_from_ids(pubchem=962)[4]
    'XLYOFNOQVPJJNP-UHFFFAOYSA-N'
    >>> find_pubchem_from_ids(pubchem=962)[5]
    'O'
    >>> find_pubchem_from_ids(pubchem=962)[6]
    'H2O'
    >>> len(find_pubchem_from_ids(pubchem=962)[7]) > 100
    True
    
    >>> find_pubchem_from_ids(CASRN="53850-36-5")[0]
    56951715
    
    >>> find_pubchem_from_ids(CASRN="54084-70-7") # Nihonium is missing
    [None, None, None, None, None, None, None, None]
    
    try to use rdkit here to check the correct inchikey is found.
    
    >>> find_pubchem_from_ids(inchi='InChI=1S/Cl', inchikey="ZAMOUSCENKQFHK-UHFFFAOYSA-N")[0]
    5360523
    >>> find_pubchem_from_ids(inchi='InChI=1S/H2O/h1H2', inchikey="XLYOFNOQVPJJNP-UHFFFAOYSA-N")[0]
    962
    
    >>> find_pubchem_from_ids(inchi='InChI=1S/I2/c1-2')[0:5]
    [807, 'molecular iodine', 253.8089, 'InChI=1S/I2/c1-2', 'PNDPGZBMCMUPRI-UHFFFAOYSA-N']
    >>> find_pubchem_from_ids(inchi='InChI=1S/H2/h1H')[0:5]
    [783, 'molecular hydrogen', 2.016, 'InChI=1S/H2/h1H', 'UFHFLCQGNIYNRP-UHFFFAOYSA-N']
    '''
    key = (pubchem, CASRN, inchi, inchikey, smiles)
    hash_key = deterministic_hash(str(key))
    key_file = os.path.join(pubchem_cache_dir, hash_key)
    if use_cache and os.path.exists(key_file):
        with open(key_file, 'r') as f:
            return json.load(f)

    if pubchem is not None:
        compound = Compound.from_cid(pubchem)
    else:
        if inchikey is not None:
            # Dup for chlorine atomic here
            # find_pubchem_from_ids(inchikey='ZAMOUSCENKQFHK-UHFFFAOYSA-N')[0]
            compounds = get_compounds(inchikey, 'inchikey')
        elif inchi is not None:
            # chlorine search "InChI=1S/Cl" finds HCl
            compounds = get_compounds(inchi, 'inchi')
        elif smiles is not None:
            compounds = get_compounds(smiles, 'smiles')
        elif CASRN is not None:
            compounds = get_compounds(CASRN, 'name')
        else:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError('At least one identifier must be provided')
        # maybe sort by ID in the future
        compound = compounds[0] if compounds else None

    if compound is None or compound.cid is None:
        ans = [None]*8
    else:
        ans = [
            compound.cid,
            compound.iupac_name,
            float(compound.molecular_weight),
            compound.inchi,
            compound.inchikey,
            compound.canonical_smiles,
            compound.molecular_formula,
            compound.synonyms,
        ]

    # A list (not a tuple) is returned so that fresh results have the same
    # type as results read back from the JSON cache.
    with open(key_file, 'w') as f:
        json.dump(ans, f)
    return ans