Python get_properties Examples, pubchempy.get_properties Python Examples

Example #1

0

Show file

File: pchem.py Project: poponta1218/Bot.ppnt

 async def popochem(self, ctx, *, name: str):
     """Look up a molecule on PubChem and post its summary and 2D structure.

     The user-supplied ``name`` is first translated to English, then used for
     a PubChem name search.  When nothing is found an explanatory message is
     sent.  Otherwise the IUPAC name (if any), molecular formula and
     molecular weight are sent together with a PNG rendering of the
     structure.

     Args:
         ctx: Invocation context; only ``ctx.send`` is used here.
         name: Molecule name, in any language the translator understands.
     """
     properties = [
         "IUPACName", "MolecularFormula", "MolecularWeight",
         "CanonicalSMILES"
     ]
     translator = Translator()
     name = translator.translate(text=name, dest="en").text
     rslt = pcp.get_properties(properties, name, "name")
     if not rslt:
         await ctx.send("検索した分子は見つかりませんでした．\nタイプミス等の可能性があります．")
         return

     record = rslt[0]
     iupac = record.get("IUPACName")
     iupac_name = "IUPAC名: " + iupac + "\n" if iupac is not None else ""
     # str.format instead of "+": the weight may arrive as a number or a
     # string, and plain concatenation raises TypeError for numbers.
     mol_info = "分子式 (分子量): {} ({})".format(
         record.get("MolecularFormula"), record.get("MolecularWeight"))
     outstr = iupac_name + mol_info

     smiles = record.get("CanonicalSMILES")
     mol = Chem.MolFromSmiles(smiles) if smiles else None
     if mol is None:
         # No drawable structure: still report the textual information.
         await ctx.send(outstr)
         return

     view = rdMolDraw2D.MolDraw2DCairo(330, 300)
     options = view.drawOptions()
     options.legendFontSize = 24
     options.multipleBondOffset = 0.1
     options.useBWAtomPalette()
     struct = rdMolDraw2D.PrepareMolForDrawing(mol)
     view.DrawMolecule(struct)
     view.FinishDrawing()
     view.WriteDrawingText("structure.png")
     try:
         await ctx.send(outstr, file=discord.File("structure.png"))
     finally:
         # Always clean up the temporary image, even if sending failed.
         os.remove("structure.png")

Example #2

0

Show file

    def get_mol_data(self, molecule):
        """Return the first PubChem property record for ``molecule``.

        The lookup is a complete-name search requesting molecular weight,
        molecular formula and IUPAC name.  An ``IndexError`` propagates when
        the search yields no records.
        """
        records = pcp.get_properties(
            'MolecularWeight,MolecularFormula,IUPACName',
            molecule,
            namespace='name',
            name_type='complete',
        )
        first_record = records[0]
        return first_record

Example #3

0

Show file

def pull_smiles(cid_list):
    """Return the canonical SMILES of every compound in ``cid_list``.

    The strings come from a single ``pcp.get_properties`` call and are
    returned in the order of the records it yields.
    """
    records = pcp.get_properties('CanonicalSMILES', cid_list)
    smiles = []
    for record in records:
        smiles.append(record['CanonicalSMILES'])
    return smiles

Example #4

0

Show file

def pcp_getSmilesFromPubchem(CIDList, renameDict=None, addCol=None):
    """Fetch isomeric SMILES for ``CIDList`` as a DataFrame.

    Args:
        CIDList: Compound identifier(s) passed straight to
            ``pubchempy.get_properties``.
        renameDict: Optional ``{old_column: new_column}`` mapping applied to
            the result columns.
        addCol: Optional ``{column: value}`` mapping; each entry is added as
            a column of the result.

    Returns:
        The DataFrame produced by ``get_properties(..., as_dataframe=True)``
        with the optional rename / extra columns applied.
    """
    import pubchempy as pcp

    smilesDF = pcp.get_properties('IsomericSMILES', CIDList, as_dataframe=True)
    if renameDict:
        smilesDF = smilesDF.rename(columns=renameDict)
        # The former ``smilesDF.drop(smilesDF.index[0])`` here discarded its
        # result (a no-op) yet raised IndexError on an empty frame; removed.
    if addCol:
        for column, value in addCol.items():
            smilesDF[column] = value

    return smilesDF

Example #5

0

Show file

File: Bioservices_magic_august11.py Project: PerlJam/1KM

def retrieve_enzyme_substrates(enzyme_name):
    """Collect substrate names and SMILES for the reactions of an enzyme.

    The enzyme entry is fetched and parsed, every reaction id (token starting
    with ``R``) listed under ``all_reac`` is fetched, and for each reaction
    the compound ids (tokens starting with ``C``) left of the ``<...``
    arrow on its EQUATION line are gathered.  The greatest of those ids (by
    string comparison) is looked up for its NAME, and that name is sent to
    PubChem for a canonical SMILES.

    Reactions without a parsable equation or reactant name are skipped.

    Returns:
        ``(real_name, smiles)``: two lists.  They are not guaranteed to be
        aligned: a name is kept even if its SMILES lookup fails, and the
        name ``'Reduced'`` is not recorded although its lookup is attempted.
    """
    s = Kegg()
    compound_search = s.get(str(enzyme_name))
    d = KeggParser().parse(compound_search)

    # --- reaction ids handled by this enzyme ---------------------------
    all_rxns = []
    all_reac = d['all_reac']
    if all_reac[0] == 'R':
        # A bare string (first character 'R') means a single reaction.
        all_rxns.append(all_reac)
    else:
        for entry in all_reac:
            for token in entry.split():
                if token[0] == 'R':
                    if token[-1] == ';':
                        token = token[:-1]
                    all_rxns.append(token)

    # --- substrate names and SMILES ------------------------------------
    smiles = []
    real_name = []

    for rxn_id in all_rxns:
        reactants = []
        for line in s.get(rxn_id).split('\n'):
            if 'EQUATION' not in line:
                continue
            tokens = line.split()
            arrow = None
            for idx, token in enumerate(tokens):
                if token[0] == '<':
                    arrow = idx  # the last arrow-like token wins
            if arrow is None:
                continue
            reactants.extend(t for t in tokens[:arrow] if t[0] == 'C')
        if not reactants:
            continue  # nothing usable on the left-hand side
        reactant = max(reactants)

        reactantname = None
        for line in s.get(reactant).split('\n'):
            if 'NAME' in line:
                fields = line.split()
                if len(fields) > 1:
                    reactantname = fields[1]
        if reactantname is None:
            continue
        if reactantname[-1] == ';':
            reactantname = reactantname[:-1]
        if reactantname != 'Reduced':
            real_name.append(reactantname)
        try:
            test = pcp.get_properties('CanonicalSMILES', reactantname, 'name')
            smiles.append(str(test[0]['CanonicalSMILES']))
        except Exception:
            # Best effort: a failed or empty PubChem lookup is ignored, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            pass

    return real_name, smiles

Example #6

0

Show file

File: nametosmiles_csv.py Project: hori1537/chemical-data-science

def apply_get_compounds(mol_name):
    """Resolve ``mol_name`` to an isomeric SMILES via a PubChem name search.

    Prints the outcome and returns the SMILES of the first hit, or ``nan``
    when the search returns an empty list.
    """
    hits = get_properties(['IsomericSMILES'], mol_name, 'name')

    if hits == []:
        print(mol_name, " can't convert")
        return nan

    # hits is a list of dicts such as
    # [{'CID': 9890, 'IsomericSMILES': 'C(C(F)F)F'}]; only the first is used.
    first_hit = hits[0]
    iso_smiles = first_hit['IsomericSMILES']
    print(mol_name, ' : ', iso_smiles)
    return iso_smiles

Example #7

0

Show file

File: get_similar_hba.py Project: jrodguez/des-basis-set

def get_similar_hba(dataframe, source_column):
    """Find compounds similar to each CID and keep those passing the HBA screen.

    For every row, the CID in ``source_column`` is sent to the PubChem
    ``fastsimilarity_2d`` endpoint (threshold 80, at most 3000 records).
    Each returned CID is looked up for its canonical SMILES, the collected
    results are passed to ``hba_screen``, and the surviving entries are
    gathered into a frame with columns ``HBA_cid`` and ``HBA_smiles``.

    Args:
        dataframe: Input frame, iterated row by row.
        source_column: Name of the column holding the query CIDs.

    Returns:
        The per-row result frames concatenated in row order (each keeps its
        own 0-based index), or an empty DataFrame when there are no rows.
    """
    # Collected and concatenated once: DataFrame.append was removed in
    # pandas 2.0 and was quadratic anyway.
    frames = []

    for _, row in dataframe.iterrows():
        # source column contains the cid's for chemical similarity search.
        cid = row[source_column]

        request_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/cid/%s/cids/JSON?Threshold=80&MaxRecords=3000" % str(
            cid)

        request_json = requests.get(request_url).json()
        similarity_list = request_json['IdentifierList']['CID']

        # One property lookup per similar compound; each lookup result is
        # appended as returned (hba_screen receives them unchanged).
        smiles_list = [
            pcp.get_properties('canonical_smiles', similar_cid)
            for similar_cid in similarity_list
        ]

        # screening the smiles list for HBA criteria
        screened_list = hba_screen(smiles_list)

        # Positional access is kept because the container type returned by
        # hba_screen is not visible here.
        positions = range(len(screened_list))
        temp_dataframe = pd.DataFrame()
        temp_dataframe['HBA_cid'] = [
            screened_list[k]['CID'] for k in positions
        ]
        temp_dataframe['HBA_smiles'] = [
            screened_list[k]['CanonicalSMILES'] for k in positions
        ]

        print(temp_dataframe)
        frames.append(temp_dataframe)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

Example #8

0

Show file

File: API_functions.py Project: PerlJam/1KM

def retrieve_enzyme_substrates(enzyme_name):
    """Collect substrate names and SMILES for the reactions of an enzyme.

    The enzyme entry is fetched and parsed, every reaction id (token starting
    with ``R``) listed under ``all_reac`` is fetched, and for each reaction
    the compound ids (tokens starting with ``C``) left of the ``<...``
    arrow on its EQUATION line are gathered.  The greatest of those ids (by
    string comparison) is looked up for its NAME, and that name is sent to
    PubChem for a canonical SMILES.

    Reactions without a parsable equation or reactant name are skipped.

    Returns:
        ``(real_name, smiles)``: two lists.  They are not guaranteed to be
        aligned: a name is kept even if its SMILES lookup fails, and the
        name ``'Reduced'`` is not recorded although its lookup is attempted.
    """
    s = Kegg()
    compound_search = s.get(str(enzyme_name))
    d = KeggParser().parse(compound_search)

    # --- reaction ids handled by this enzyme ---------------------------
    all_rxns = []
    all_reac = d['all_reac']
    if all_reac[0] == 'R':
        # A bare string (first character 'R') means a single reaction.
        all_rxns.append(all_reac)
    else:
        for entry in all_reac:
            for token in entry.split():
                if token[0] == 'R':
                    if token[-1] == ';':
                        token = token[:-1]
                    all_rxns.append(token)

    # --- substrate names and SMILES ------------------------------------
    smiles = []
    real_name = []

    for rxn_id in all_rxns:
        reactants = []
        for line in s.get(rxn_id).split('\n'):
            if 'EQUATION' not in line:
                continue
            tokens = line.split()
            arrow = None
            for idx, token in enumerate(tokens):
                if token[0] == '<':
                    arrow = idx  # the last arrow-like token wins
            if arrow is None:
                continue
            reactants.extend(t for t in tokens[:arrow] if t[0] == 'C')
        if not reactants:
            continue  # nothing usable on the left-hand side
        reactant = max(reactants)

        reactantname = None
        for line in s.get(reactant).split('\n'):
            if 'NAME' in line:
                fields = line.split()
                if len(fields) > 1:
                    reactantname = fields[1]
        if reactantname is None:
            continue
        if reactantname[-1] == ';':
            reactantname = reactantname[:-1]
        if reactantname != 'Reduced':
            real_name.append(reactantname)
        try:
            test = pcp.get_properties('CanonicalSMILES', reactantname, 'name')
            smiles.append(str(test[0]['CanonicalSMILES']))
        except Exception:
            # Best effort: a failed or empty PubChem lookup is ignored, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            pass

    return real_name, smiles

Example #9

0

Show file

File: tasks.py Project: zaidalam/cspade

def process_smiles(pk):
    """Attach SMILES strings and PubChem CIDs to a project's compound table.

    The project's data file is read with the ``Compound`` column as index.
    For each compound, a SMILES already present in a ``Smiles`` column is
    kept (with no CID); otherwise the compound name is searched on PubChem
    and the first hit is used.  Compounds that cannot be resolved get no
    entry.  The result is joined back onto the table as columns ``Smiles``
    and ``PubChem_CID``, and stored on the project as JSON together with a
    comma-separated header.

    Args:
        pk: Primary key of the ``Project`` to process.
    """
    project = Project.objects.get(pk=pk)
    df = pd.read_table(project.data_file, index_col='Compound')
    has_smiles = 'Smiles' in df.columns

    # compound name -> {'CanonicalSMILES': ..., 'CID': ...}
    smiles = {}
    for compound in df.index.values:
        print(compound)
        if has_smiles and not pd.isnull(df.loc[compound]['Smiles']):
            smiles[compound] = {
                'CanonicalSMILES': df.loc[compound]['Smiles'],
                'CID': None
            }
            continue
        hits = get_properties('CanonicalSMILES', compound, 'name')
        if len(hits) != 0:
            smiles[compound] = hits[0]

    # Rename by key, not by position: the key order of the records differs
    # between PubChem hits and the hand-built dicts above, so a positional
    # relabel could swap the two columns.  reindex also guarantees both
    # columns exist when nothing was resolved.
    dfsmi = (pd.DataFrame.from_dict(smiles, orient='index')
             .rename(columns={'CanonicalSMILES': 'Smiles',
                              'CID': 'PubChem_CID'})
             .reindex(columns=['Smiles', 'PubChem_CID']))
    if has_smiles:
        df = df.drop(['Smiles'], axis=1)
    # Merge Smiles column with the data frame
    dfcomp = df.join(dfsmi, how='left')
    dfcomp.index.name = 'Compound'
    project.header = ','.join(dfcomp.columns.tolist())
    project.smiles_data = dfcomp.to_json(orient='index')
    project.save()

Example #10

0

Show file

File: structure.py Project: kijanac/materia

    def retrieve(
        name: Optional[str] = None,
        smiles: Optional[str] = None,
        inchi: Optional[str] = None,
        inchikey: Optional[str] = None,
    ) -> mtr.Structure:
        """Retrieve a structure from PubChem by one identifier.

        The first argument that is not ``None`` (checked in the order name,
        smiles, inchi, inchikey) is used for the lookup.  The 3D record is
        preferred; when PubChem has none, the structure is generated from
        the compound's isomeric SMILES instead.

        Raises:
            ValueError: If no identifier is given, or if no CID can be
                obtained for it.
        """
        candidates = [
            (name, "name"),
            (smiles, "smiles"),
            (inchi, "inchi"),
            (inchikey, "inchikey"),
        ]
        for identifier, identifier_type in candidates:
            if identifier is not None:
                break
        else:
            raise ValueError(
                "Provide name, SMILES, InChi, or InChiKey to retrieve structure."
            )

        try:
            # Only the first returned CID is used, on the assumption that it
            # is the most relevant match; an empty result fails the
            # unpacking with ValueError, handled below.
            found_cids = pcp.get_cids(identifier, identifier_type)
            cid, *_ = found_cids
            if cid == 0:
                raise ValueError
        except (ValueError, OSError):
            raise ValueError(f"Structure retrieval for {identifier} failed.")

        try:
            compound_3d = pcp.Compound.from_cid(cid, record_type="3d")
            return _structure_from_pubchem_compound(compound=compound_3d)
        except pcp.NotFoundError:
            # No 3D record: fall back to the 2D description, which should
            # exist because a CID was found.
            records = pcp.get_properties(properties="IsomericSMILES",
                                         identifier=cid,
                                         namespace="cid")
            (property_dict,) = records
            return Structure.generate(smiles=property_dict["IsomericSMILES"])

Example #11

0

Show file

def drug_properties(docked_struct_directory):
    """Collect PubChem drug-likeness properties for docked ligands.

    Every ``*_output.pdbqt`` file in ``docked_struct_directory`` is opened,
    the CID is taken from the text after ``' = '`` on its third line, and
    the properties below are requested from PubChem for that CID.

    Args:
        docked_struct_directory: Directory containing the docking outputs.

    Returns:
        One DataFrame with a row per file, the former index moved into a
        regular column by ``reset_index``.  Empty when no file matches.
    """
    property_names = [
        'iupac_name', 'molecular_formula', 'molecular_weight',
        'h_bond_acceptor_count', 'h_bond_donor_count', 'xlogp', 'tpsa'
    ]
    pdbqt = [
        f for f in os.listdir(docked_struct_directory)
        if f.endswith('_output.pdbqt')
    ]

    frames = []
    for file_name in pdbqt:
        # Open from the directory that was listed (previously a hard-coded,
        # Windows-only 'docked_struct\\' prefix was used instead).
        path = os.path.join(docked_struct_directory, file_name)
        with open(path) as handle:
            cid = handle.readlines()[2].split(' = ')[1].split('\n')[0]
        frames.append(
            pd.DataFrame(
                pcp.get_properties(property_names,
                                   cid,
                                   'cid',
                                   as_dataframe=True)))

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    df = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()
    df.reset_index(level=0, inplace=True)
    return df

Example #12

0

Show file

def get_properties(dataframe, properties_list, source_column, name_prefix):
    """Append PubChem properties, looked up by CID, to ``dataframe``.

    Each row's value in ``source_column`` is sent to
    ``pcp.get_properties``; the results are stacked in row order, the
    ``CID`` column is dropped, the remaining columns are prefixed with
    ``name_prefix`` and placed side by side with the original columns.

    Args:
        dataframe: Input frame; not modified.
        properties_list: Properties to request from PubChem.
        source_column: Column of ``dataframe`` that holds the CIDs.
        name_prefix: Prefix for the names of the added columns.

    Returns:
        A new DataFrame: the original columns followed by the prefixed
        property columns.  A frame without rows is returned as a copy.
    """
    frames = []
    for _, row in dataframe.iterrows():
        # make sure the source column contains the cid's you want to obtain
        # properties for
        cids = row[source_column]
        frames.append(
            pcp.get_properties(properties_list,
                               cids,
                               listkey_count=3,
                               as_dataframe=True))

    if not frames:
        # Nothing to look up; previously this path raised KeyError('CID').
        return dataframe.copy()

    # Concatenating once, in row order, replaces the removed
    # DataFrame.append and the prepend-then-reverse trick, which reversed
    # the rows of any multi-row lookup.
    props = pd.concat(frames).reset_index()
    props = props.drop(['CID'], axis=1).add_prefix(name_prefix)

    # concatenating to original dataframe
    return pd.concat([dataframe, props], axis=1)

Example #13

0

Show file

File: name2smiles.py Project: z-gong/il-sim

#!/usr/bin/env python3

import os
import time
from ilthermo.models import *
from pubchempy import get_compounds, get_properties
from chemspipy import ChemSpider

# Resolve a SMILES string (and, where available, an IUPAC name) for each ion
# that has none yet: PubChem is asked first, ChemSpider is the fallback.

# The ChemSpider API token is read from the environment; a missing variable
# raises KeyError here.
cs_token = os.environ['CHEMSPIDER_TOKEN']
cs = ChemSpider(cs_token)

# NOTE(review): `session` and `Ion` are not defined in this fragment;
# presumably they come from the `ilthermo.models` star import - confirm.
ions = session.query(Ion)
n = 0
# NOTE(review): `== None` looks like a query-expression comparison rather
# than a plain Python test, so it is presumably intentional - confirm.
for ion in ions.filter(Ion.smiles == None):
    MATCHED = False
    # Name search on PubChem requesting the IUPAC name and both SMILES forms.
    pc_d_list = get_properties('IUPACName,IsomericSMILES,CanonicalSMILES',
                               ion.name, 'name')
    print('PubChem: ', ion, pc_d_list)
    # Only an unambiguous (single-record) answer is accepted.
    if len(pc_d_list) == 1:
        d = pc_d_list[0]
        iupac = d.get('IUPACName')
        # Prefer the isomeric SMILES, falling back to the canonical one.
        smiles = d.get('IsomericSMILES') or d.get('CanonicalSMILES')
        MATCHED = True

    if not MATCHED:
        # Fallback: search ChemSpider by name and wait for the results.
        cs_results = cs.search(ion.name)
        cs_results.wait()
        print('ChemSpider: ', ion, cs_results)
        # Again, only a single result is accepted; no IUPAC name is taken
        # from ChemSpider.
        if len(cs_results) == 1:
            result = cs_results[0]
            iupac = None
            smiles = result.smiles

Example #14

0

Show file

import pubchempy as pcp
import pandas as pd
from collections import deque

# Read the molecule list (no header row) and look each entry up on PubChem.
data = pd.read_csv("list_of_molecules.csv", header=None)

# NOTE(review): assumes the molecules are in the first column of the CSV
# (label 0 because header=None) - confirm against the actual file.
list_of_compounds = data[0].tolist()

list_of_smiles = {}

for i in list_of_compounds:
    try:
        # NOTE(review): the query is the literal SMILES 'CC' for every entry,
        # so each iteration returns the same superstructure hits; `i` was
        # presumably meant to be the identifier - confirm the intended
        # namespace before changing it.
        p = pcp.get_properties('IsomericSMILES', 'CC', 'smiles',
                               searchtype='superstructure')
        list_of_smiles[i] = p
        print(p)
    except Exception:
        # Best effort: report the entry whose lookup failed and carry on.
        print(i)

# One row per molecule, the lookup result in the value column.
pd.Series(list_of_smiles).to_csv("smiles.csv")

Example #15

0

Show file

File: getPubChem.py Project: edu159/molecpy

import pubchempy as pcp
import json
#hex = pcp.request(11006, namespace='cid', domain='compound', operation='classification', output='JSON')
#text = json.loads(hex.read())
#print text["Hierarchies"]["Hierarchy"][1]["Node"][1]["Information"]["Name"]
 #print "AKI"
# Word-search PubChem compound names for "meth", asking the server to keep
# the hit list under a list key instead of returning the CIDs directly.
listkey_response = pcp.get("meth", namespace='name', domain='compound', operation='cids', output='JSON', name_type='word', list_return='listkey')
listkey_name = json.loads(listkey_response)['IdentifierList']['ListKey']

# Fetch properties for the first 200 compounds stored under that list key.
records = pcp.get_properties('MolecularWeight,MolecularFormula,IUPACName', listkey_name, namespace='listkey', listkey_count=200)

# Keep IUPAC names starting with 'meth'; records without an IUPAC name are
# skipped instead of raising KeyError.
namelist = sorted(
    c["IUPACName"] for c in records
    if (c.get("IUPACName") or "").startswith('meth'))
# Parenthesised so the line is valid on Python 3 and prints the same list
# on Python 2.
print(namelist)
#hex = json.loads(hex)
#compounds = [pcp.Compound(r) for r in hex['PC_Compounds']] if hex else []
#print len(compounds)
#print "AKI"
#print hex
#hex = pcp.get(297, namespace='cid', domain='compound', output='PNG')
#hex = pcp.get_cids('meth', 'name', name_type='word', listkey_start=0,listkey_count=2)
#print hex.molecular_formula
#print hex.to_dict(properties=["atoms","bonds"])

Example #16

0

Show file

File: pka_lookup_pubchem.py Project: khoivan88/pka_lookup

def pka_lookup_pubchem(identifier, namespace=None, domain='compound') -> Optional[dict]:
    """Look up the pKa of a compound on PubChem.

    The identifier is resolved to a CID, the CAS number is extracted from
    the compound's synonyms, basic structure properties are fetched, and the
    "Dissociation Constants" section of the PUG View record is parsed for
    the pKa value and its reference.

    Args:
        identifier: Name, CAS number, SMILES, InChI, InChIKey, ... of the
            compound.
        namespace: Kind of ``identifier``.  When falsy it is guessed with
            ``classify``; ``'cas'`` is searched as a name; anything else is
            passed to ``pcp.get_cids`` unchanged.
        domain: Unused in this function.

    Returns:
        A dict with the keys ``source``, ``Pubchem_CID``, ``pKa``,
        ``reference`` and ``Substance_CASRN`` plus the structure properties
        returned by PubChem (``CanonicalSMILES``, ``IsomericSMILES`` and
        ``IUPACName`` renamed to ``Canonical_SMILES``, ``Isomeric_SMILES``
        and ``IUPAC_Name``), or ``None`` when anything goes wrong (compound
        or pKa not found, not an exact match, network error, ...).
    """
    global debug

    if len(sys.argv) == 2 and sys.argv[1] in ['--debug=True', '--debug=true', '--debug', '-d']:
        debug = True

    # Read the flag defensively: if the module never defined ``debug`` the
    # error path below must still return None rather than raise NameError.
    debug_enabled = bool(globals().get('debug', False))

    # Identify lookup source (Pubchem in this case)
    lookup_source = 'Pubchem'

    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; CentOS; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}

        # Resolve the identifier to CIDs.  pcp.get_cids matches exactly by
        # default and returns a list.
        cids = []
        identifier_type = ''

        if not namespace:
            identifier_type = classify(identifier)

            # If the input is inchi, inchikey or smiles (this could be a false smiles):
            if identifier_type in ['smiles', 'inchi', 'inchikey']:
                lookup = pcp.get_cids(identifier, namespace=identifier_type)
                if lookup:
                    cids.append(lookup[0])
            else:
                lookup = pcp.get_cids(identifier, namespace='name')
                if lookup:
                    cids.append(lookup[0])
        elif namespace == 'cas':
            cids = pcp.get_cids(identifier, namespace='name')
        else:
            cids = pcp.get_cids(identifier, namespace=namespace)

        if not cids:
            # Last resort: retry as a plain name search.
            lookup = pcp.get_cids(identifier, namespace='name')
            if lookup:
                cids.append(lookup[0])

            # NOTE(review): identifier_type is only set from ``namespace``
            # on this retry path, so e.g. namespace='cas' with a direct hit
            # leaves it '' and skips the CAS exact-match check below -
            # confirm whether that is intended.
            identifier_type = namespace

        # An empty list means PubChem does not know this chemical.
        if len(cids) > 0:
            # if Pubchem found the result, get the first result of the list
            cid = cids[0]

            exact_match = True

            synonyms = pcp.get_synonyms(cid)[0]['Synonym'] or []

            # Extract CAS number from the list of synonyms
            returned_cas = ''
            for synonym in synonyms:
                cas_nr = re.search(r'^\d{2,7}-\d{2}-\d$', synonym)
                if cas_nr:
                    returned_cas = cas_nr.group()
                    break

            lookup_result = pcp.get_properties(['inchi', 'inchikey',
                                        'canonical_smiles', 'isomeric_smiles',
                                        'iupac_name'],
                                cid)

            if identifier_type == 'cas':
                # Double check the CAS number: it must be one of the
                # compound's synonyms.
                exact_match = identifier in synonyms

            elif identifier_type in ['inchi', 'inchikey']:

                if identifier_type == 'inchi':
                    exact_match = (identifier == lookup_result[0].get('InChI', False))

                elif identifier_type == 'inchikey':
                    exact_match = (identifier == lookup_result[0].get('InChIKey', False))

            if not exact_match:
                if debug_enabled:
                    print(f'Exact match between input and Pubchem return value? {identifier in synonyms}')
                raise ValueError('This is not an exact match on Pubchem!')

            # URL of the pKa section of the PUG View record.  'XML' can be
            # replaced with 'JSON' but that is harder to parse later on.
            # Output types: https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865558
            pka_lookup_result_xml = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{}/XML?heading=Dissociation+Constants'.format(cid)

            # Get the html request info using CID number from pubchem
            r = requests.get(pka_lookup_result_xml, headers=headers, timeout=15)
            # Check to see if give OK status (200) and not redirect
            if r.status_code == 200 and len(r.history) == 0:
                # Use python XML to parse the return result
                tree = ET.fromstring(r.text)

                # Get the XML tree of <Information> only
                info_node = tree.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}Information')

                # Get the pKa reference:
                original_source = info_node.find('{http://pubchem.ncbi.nlm.nih.gov/pug_view}Reference').text
                # Get the pKa result:
                pka_result = info_node.find('.//*{http://pubchem.ncbi.nlm.nih.gov/pug_view}String').text
                pka_result = re.sub(r'^pKa = ', '', pka_result)    # remove 'pka = ' part out of the string answer

                core_result = {
                    'source': lookup_source,
                    'Pubchem_CID': str(cid),
                    'pKa': pka_result,
                    'reference': original_source,
                    'Substance_CASRN': returned_cas,
                }
                extra_info = lookup_result[0]
                extra_info.pop('CID', None)    # Remove 'CID': ... from lookup_result[0]

                # Merge 2 dict: https://treyhunner.com/2016/02/how-to-merge-dictionaries-in-python/
                result = {**core_result, **extra_info}
                # Rename some keys in the dict
                s = pd.Series(result)
                s = s.rename({
                    'CanonicalSMILES': 'Canonical_SMILES',
                    'IsomericSMILES': 'Isomeric_SMILES',
                    'IUPACName': 'IUPAC_Name'
                })
                result = s.to_dict()
                return result

            else:
                raise RuntimeError('pKa not found in Pubchem.')

        else:
            raise RuntimeError('Compound not found in Pubchem.')

    except Exception as error:
        if debug_enabled:
            # Positional arguments: the ``etype`` keyword was removed from
            # traceback.format_exception in Python 3.10.
            traceback_str = ''.join(traceback.format_exception(type(error), error, error.__traceback__))
            print(traceback_str)

        return None

Example #17

0

Show file

import pandas as pd
import pubchempy as pcp

# Load the metabolite list (not used further in this fragment).
metabs = pd.read_csv('metabolite_list.csv')

# Get the IUPAC name and monoisotopic mass of every compound with the given
# molecular formula from PubChem, as a pandas DataFrame.
df1 = pcp.get_properties(['IUPACName', 'MonoisotopicMass'],
                         'C7H7NO2',
                         'formula',
                         as_dataframe=True)

# Get information for a whole set of compounds:
# fetch all compounds with the formula C20H41Br and assign them to cs.
cs = pcp.get_compounds('C20H41Br', 'formula')

# Build a frame of the selected properties for these compounds.
# NOTE: the result is not assigned, so it is discarded when run as a script.
pcp.compounds_to_frame(
    cs, properties=['isomeric_smiles', 'xlogp', 'rotatable_bond_count'])

Example #18

0

Show file

File: getStrFromPubchem.py Project: dipanghosh/FHProj

from dipan_utilities import utilities
import pubchempy as pcp, pickle
import pandas as pd

# Load the two CID lists.  Pickle data must be read in binary mode, and the
# context manager closes the file afterwards.
# NOTE: pickle.load executes arbitrary code from the file - only use it on a
# dump you created yourself.
with open("publicdataDump", "rb") as dump_file:
    inactiveIDList, freqActiveIDList = pickle.load(dump_file)

# Isomeric SMILES for the frequently active compounds, labelled 'yes'.
freqActiveDF = (pcp.get_properties('IsomericSMILES',
                                   freqActiveIDList,
                                   as_dataframe=True))
freqActiveDF = freqActiveDF.rename(columns={
    'CID': 'externalid',
    'IsomericSMILES': 'molecule'
})
freqActiveDF['AlphascreenFH'] = 'yes'

# Isomeric SMILES for the inactive compounds, labelled 'no'.
inactiveDF = (pcp.get_properties('IsomericSMILES',
                                 inactiveIDList,
                                 as_dataframe=True))
inactiveDF = inactiveDF.rename(columns={
    'CID': 'externalid',
    'IsomericSMILES': 'molecule'
})
inactiveDF['AlphascreenFH'] = 'no'

# Stack both sets, show them and export to a spreadsheet.
mergedDF = pd.concat([freqActiveDF, inactiveDF])
utilities.viewTable(mergedDF)
mergedDF.to_excel("mergedMore.xls")

Example #19

0

Show file

File: make_compounf_info.py Project: Haru38/dream-challenge

# Build a table of the training set and compute a Morgan fingerprint for
# each distinct compound in it.
# NOTE(review): `train_set`, `pcp`, `Chem`, `AllChem` and `np` are defined
# outside this fragment.

# The first entry of train_set holds the column names, the rest the rows.
my_columns = train_set[0]

import pandas as pd

df = pd.DataFrame(train_set[1:], columns=my_columns)
# Distinct compound identifiers, in order of first appearance.
id_list = df['Compound Identifier'].unique().tolist()

properties = [
    'IUPACName', 'MolecularFormula', 'MolecularWeight', 'XLogP', 'TPSA',
    'CanonicalSMILES'
]

# Get information from PubChem: one request per identifier; each result is
# stored as returned by get_properties.
chem_infos = []
for cid in id_list:
    chem_info = pcp.get_properties(properties, cid)
    chem_infos.append(chem_info)

# ECFP4: Morgan fingerprint with radius 2, folded to nBits bits.
radius = 2
nBits = 4096
morgan_fp = []
for info_list in chem_infos:
    # Only the first record of each lookup is used.
    info = info_list[0]
    mol = Chem.MolFromSmiles(info['CanonicalSMILES'])
    # Expand the bit vector into a plain list of its elements.
    fp = [i for i in AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)]
    morgan_fp.append(fp)
# One row per compound, one column per fingerprint bit.
morgan_fp = np.array(morgan_fp)

compound_dic = {}
i = 0

Example #20

0

Show file

File: main.py Project: Steinerr/pubchem_example

    Threshold=70)
# NOTE(review): `cids_95`, `cids_80`, `cids_70`, `plt`, `math` and `pcp` are
# defined before this fragment.
print(f'len cids for 70: {len(cids_70)}')

# Plot of the number of search results versus the similarity threshold
threshholds = [95, 80, 70]
threshholds_lengths = [len(cids_95), len(cids_80), len(cids_70)]

plt.plot(threshholds, threshholds_lengths)
plt.xlabel('Threshhold similarity')
plt.ylabel('Count of results')
plt.savefig('threshholds_length.png')

# pandas DataFrames with four property columns, one frame per threshold
cids_95_df = pcp.get_properties(
    ('XlogP', 'TPSA', 'MolecularWeight', 'Volume3D'),
    cids_95,
    namespace='cid',
    as_dataframe=True)

cids_80_df = pcp.get_properties(
    ('XlogP', 'TPSA', 'MolecularWeight', 'Volume3D'),
    cids_80,
    namespace='cid',
    as_dataframe=True)

# The threshold-70 result list is processed in pages of listkey_count CIDs.
listkey_count = 5000
pages = math.ceil(len(cids_70) / listkey_count)
print(f'Found {pages} pages')
cids_70_dfs = []
# Pages are numbered from 1.
for page_number in range(1, pages + 1):
    print(f'Downloading page #{page_number} from {pages}')

Example #21

0

Show file

                #cid = aline.replace('\n')
                CID_dict[tableID].append(
                    aline.replace('\n', '').replace('CID: ', ''))

# NOTE(review): `CID_dict`, `tableID` and `cidslist` are defined before this
# fragment.
print(len(CID_dict[tableID]))
CID_dict.keys()
print(CID_dict[tableID][0])

# Get SMILES for the CIDs of every table,
# keeping only entries whose SMILES is shorter than 400 characters.

smiles_dict = {}

for key in CID_dict.keys():
    smiles_dict[key] = []
    # One request per table: properties for all of its CIDs at once.
    prop_dict = pcp.get_properties('IsomericSMILES', CID_dict[key])
    for i in range(0, len(prop_dict)):
        if len(prop_dict[i]
               ['IsomericSMILES']) < 400:  # only those under 400 characters
            smiles_dict[key].append(prop_dict[i]['IsomericSMILES'])
            # NOTE(review): this stores the position within prop_dict, not
            # the CID itself - confirm that is what cidslist should hold.
            cidslist.append(i)

# These run after the loops, so they report the last record and last table.
print(prop_dict[i]["IsomericSMILES"])
print(len(smiles_dict[key]))

# Superstructure search: properties of compounds containing the 'CC' query.
p = pcp.get_properties('IsomericSMILES',
                       'CC',
                       'smiles',
                       searchtype='superstructure')
p = pcp.get_properties('C[N+](C)(C)CC(=O)[O-]',
                       'CC',

Example #22

0

Show file

def name2smilesinchi(name):
    """Resolve a chemical name to ``(smiles, inchi)``.

    Lookup order:

    1. A table of hand-curated names.  Entries marked
       ``'cannot get SMILES, case N'`` are returned as ``(marker, None)``;
       the others get their InChI from ``smiles2inchi``.
    2. OPSIN (name-to-structure web service).
    3. NCI CACTUS chemical identifier resolver.
    4. PubChem name search, accepted only when exactly one record matches.
    5. ChemSpider name search (needs ``CHEMSPIDER_TOKEN`` in the
       environment); the first result is used.

    Each remote resolver is best effort: any failure moves on to the next.

    Returns:
        ``(smiles, inchi)``, or ``(None, None)`` when every resolver fails.
    """
    # Local import keeps the function self-contained.
    from urllib.parse import quote

    special_dict = {
        'hydrogen': '[H][H]',
        'oxygen': 'O=O',
        'nitrogen': 'N#N',
        'n-butan(ol-d)': 'CCCCO[2H]',
        '3-(10,11-dihydro-5H-dibenzo[a,d]cyclohepten-5-ylidine)-N,N-dimethyl-1-propanamine hydrochloride': '[H+].[Cl-].CN(C)CCC=C1c2ccccc2CCc3ccccc13',
        'acetylferrocene': '[Fe+2].[C-]1C(C(=O)C)=CC=C1.[C-]1C=CC=C1',
        'ferrocenylsulfonyl(trifluoromethylsulfonyl)imide': '[Fe+2].[C-]1C(S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F)=CC=C1.[C-]1C=CC=C1',
        '2,2\'-[1,2-phenylenebis(nitrilomethylidyne)]bis-phenol': 'Oc1ccccc1CNc1ccccc1NCc1ccccc1O',
        'rhodium(1+), [(1,2,5,6-.nu.)-1,5-cyclooctadiene][(2R,2\'R,5R,5\'R)-1,1\'-(1,2-phenylene)bis[2,5-dimethylphospholane-kP]]-, tetrafluoroborate(1-) (1:1)': 'cannot get SMILES, case 1',
        'N,N\'-ethylenebis(salicylideneiminato)diaquachromium(III) chloride': 'cannot get SMILES, case 2',
        'diaquabis(4-methylpyridine)iron(3+)  tris[tetrafluoroborate(1-)]': 'cannot get SMILES, case 3',
        '(N,N-diethylethanamine)(dihydrido)(1-methyl-1H-imidazole-.kappa.N3)boron(1+) bis(trifluoromethylsulfonyl)amide': 'cannot get SMILES, case 4',
        'micoflavin': 'cannot get SMILES, case 5',
        '(2-methyloyoxyethyl)dimethylpentyloxyammonium acesulfamate': 'cannot get SMILES, case 6',
        'rel-(1R,2S)-N-methylephedrine': 'CN(C)[C@@H](C)[C@H](O)c1ccccc1',
        '(S)-(2-methoxycarbonyl)pyrrolidinium': 'COC(=O)[C@H]1[NH2+]CCC1',
        'salnaph': 'Oc1ccccc1/C=N/c1cccc2cccc(/N=C/c3ccccc3O)c12',
        '[bis(salicylidene)ethylenediaminato]oxovanadium': '[V-2](=O)235[N+](=CC1=C(C=CC=C1)O2)CC[N+]3=CC4=CC=CC=C4O5',
        '2,2-(4\',4\'\'-dihydroxy)diphenylpropane': 'c1(O)ccc(C(C)(C)c2ccc(O)cc2)cc1',
        '2,2\'-(dodecylimino)bis-ethanol N-oxide': 'C(CCCCCCCCCCC)N(=O)(CCO)CCO',
        '(+-)-carvedilol': 'COc1ccccc1OCCNCC(O)COc2cccc3[nH]c4ccccc4c23',
        '5-hydroxy-3-methyl-1,2,3-oxadiazolium inner salt': 'OC1=CN([NH2+]O1)C'
    }
    if name in special_dict:
        smiles = special_dict[name]
        print(name, smiles)
        if re.match(r'cannot get SMILES, case \d$', smiles):
            return smiles, None
        inchi = smiles2inchi(smiles)
        return smiles, inchi

    # Names that must be rewritten before any resolver understands them.
    aliases = {
        '(.+-.)-.alpha.-aminobutyric acid': 'alpha-aminobutyric acid',
    }
    name = aliases.get(name, name)
    # Resolve hydrates as the anhydrous compound.
    if re.match(r'(|[0-9a-zA-Z()+\[\]\',\- ]+)monohydrate', name):
        name = re.split(r' monohydrate', name)[0]

    # Percent-encode the name for use as a URL path segment: spaces,
    # brackets, quotes etc. are not valid in a URL as-is.
    encoded_name = quote(name, safe='')

    # --- OPSIN ---------------------------------------------------------
    try:
        url = 'https://opsin.ch.cam.ac.uk/opsin/' + encoded_name
        info = json.loads(urlopen(url).read().decode('utf8'))
        return info['smiles'], info['stdinchi']
    except Exception:
        pass  # unknown name or network problem: try the next resolver

    # --- NCI CACTUS ----------------------------------------------------
    try:
        base = 'http://cactus.nci.nih.gov/chemical/structure/' + encoded_name
        smiles = urlopen(base + '/smiles').read().decode('utf8')
        inchi = urlopen(base + '/inchi').read().decode('utf8')
        return smiles, inchi
    except Exception:
        pass

    # --- PubChem -------------------------------------------------------
    # Tried independently of ChemSpider, so a missing ChemSpider token no
    # longer prevents the PubChem lookup.
    try:
        pc_d_list = get_properties('IUPACName,IsomericSMILES,CanonicalSMILES,InChI', name, 'name')
        if len(pc_d_list) == 1:
            d = pc_d_list[0]
            smiles = d.get('CanonicalSMILES') or d.get('IsomericSMILES')
            return smiles, d.get('InChI')
    except Exception:
        pass

    # --- ChemSpider ----------------------------------------------------
    try:
        cs = ChemSpider(os.environ['CHEMSPIDER_TOKEN'])
        cs_results = cs.search(name)
        cs_results.wait()
        print('ChemSpider: ', name, cs_results)
        # The first result is used however many there are; no result at all
        # raises IndexError and ends in the (None, None) fallback.
        result = cs_results[0]
        return result.smiles, result.inchi
    except Exception:
        return None, None