def __init__(self, drug_decode):
    """Build the drug tables and the web-service clients used for lookups.

    :param drug_decode: passed unchanged to ``DrugDecode``. Two separate
        instances are created from it: ``self.dd`` (read from) and
        ``self.dd_filled`` (written to by the search methods).

    The PubChem client is optional: if it cannot be imported or created the
    attribute ``puchem`` is simply not set.
    """
    print("ChemSpiderSearch is still in progress, please do not use")
    self.dd = DrugDecode(drug_decode)
    self.dd_filled = DrugDecode(drug_decode)

    from bioservices.chemspider import ChemSpider
    from bioservices import ChEMBL
    from bioservices import UniChem

    try:
        print('Loading PubChem')
        from bioservices.pubchem import PubChem
        # NOTE: the attribute name (``puchem``) is kept as-is so that
        # existing readers of the attribute keep working.
        self.puchem = PubChem()
    except Exception:
        # Pubchem was introduced only in dec 2015
        # Best effort only; ``Exception`` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    print('Loading ChEMBL service')
    self.chembl = ChEMBL(cache=True)

    print('Loading ChemSpider service')
    self.chemspider = ChemSpider(cache=True)

    print('Loading UniChem service')
    # in unichem db number is 22 and chembl is 1
    self.unichem = UniChem()

    print('Settings some data aliases')
    self._cs_find = self.chemspider.find
    self._cs_get = self.chemspider.GetExtendedCompoundInfo

    # Sorted snapshots of the drug identifiers (index) and drug names.
    self.drug_ids = sorted(self.dd.df.index.values)
    self.drug_names = sorted(self.dd.df.DRUG_NAME.values)
def main(self):
    """Store the ChEMBL target id of every UniProt entry of the current crawl.

    Reads the current crawl number from ``models.CrawlData``, looks up the
    ChEMBL target for each UniProt accession recorded for that crawl, adds
    one ``models.ChemblTarget`` row per accession, stamps the ChEMBL
    datestamp of the crawl and commits the session.
    """
    # get current crawl number
    crawldata_row = models.CrawlData.query.first()
    current_crawl_number = crawldata_row.current_crawl_number
    print('Current crawl number: %d' % current_crawl_number)

    # Get list of uniprot accession numbers that are in the rows of
    # current crawl number
    ac_rows = models.UniProt.query.filter_by(
        crawl_number=current_crawl_number
    ).values(models.UniProt.ac)
    uniprot_acs = [str(row[0]) for row in ac_rows]

    # One client is enough for the whole run; it used to be rebuilt for
    # every accession.
    chembl = ChEMBL(verbose=False)

    # Iterate through Uniprot ACs
    for uniprot_ac in uniprot_acs:
        # get ChEMBL target id using bioservices
        target_info = chembl.get_target_by_uniprotId(uniprot_ac)
        target_chembl_id = target_info['chemblId']
        # Single-argument print(): same text on Python 2 and Python 3.
        print('ChEMBL ID of target protein: %s' % (target_chembl_id,))

        target = models.ChemblTarget(
            crawl_number=current_crawl_number,
            target_chembl_id=target_chembl_id,
        )
        db.session.add(target)

    # Update datestamp for Chembl
    now = datetime.datetime.utcnow()
    current_crawl_datestamp_row = models.DateStamps.query.filter_by(
        crawl_number=current_crawl_number
    ).first()
    current_crawl_datestamp_row.chembl_datestamp = now

    db.session.commit()
    print('Done.')
def chembl_fetch(input_id):
    """Return the canonical SMILES of the first ChEMBL hit for ``input_id``.

    The identifier is sent to ``ChEMBL.search_molecule``; the value returned
    is ``["molecules"][0]["molecule_structures"]["canonical_smiles"]`` of
    the response.
    """
    from bioservices import ChEMBL

    client = ChEMBL()
    response = client.search_molecule(input_id)
    first_hit = response["molecules"][0]
    structures = first_hit["molecule_structures"]
    return structures["canonical_smiles"]
def chembl_assay(chemblid):
    """Fetch the ChEMBL assay record for ``chemblid`` with renamed keys.

    The module-level ``CHEMBL`` client is created on first use. Only keys
    present in ``FIELDS`` are kept, and each is renamed to ``FIELDS[key]``.
    """
    global CHEMBL
    if CHEMBL is None:
        # Lazy creation: the client is built once and then reused.
        from bioservices import ChEMBL as ChEMBLdb
        CHEMBL = ChEMBLdb()

    assay = CHEMBL.get_assays_by_chemblId(str(chemblid))['assay']

    renamed = {}
    for key, value in assay.items():
        if key in FIELDS:
            renamed[FIELDS[key]] = value
    return renamed
import os
import sys

from bioservices import ChEMBL, QuickGO, Reactome, KEGG
from py2neo import Graph

from model.core import *
from ncbi import fetch_publication_list
from quickgo import fetch_quick_go_data
from uniprot import *

# Graph database connection; host and password come from the environment
# (``DB`` and ``NEO4J_PASSWORD``), falling back to "localhost" and "".
graph = Graph(
    host=os.environ.get("DB", "localhost"),
    bolt=True,
    password=os.environ.get("NEO4J_PASSWORD", ""),
)

# Web-service clients, created once at import time.
chembl = ChEMBL(verbose=False)
quick_go = QuickGO(verbose=False)
reactome = Reactome(verbose=False)
kegg = KEGG(verbose=False)

# watch("neo4j.bolt")

# Module-level lookup tables, all initially empty.
gene_dict = {}
transcript_dict = {}
pseudogene_dict = {}
cds_dict = {}
exon_dict = {}
rrna_dict = {}
trna_dict = {}
ncrna_dict = {}
location_dict = {}
from bioservices import ChEMBL

# Demo: resolve a UniProt accession to its ChEMBL target, list the compounds
# with recorded bioactivities for it, then fetch one example compound.
# Every print() below takes a single argument so the text printed is the
# same on Python 2 and Python 3.
chembl = ChEMBL(verbose=False)

acc = 'P00519'
target_info = chembl.get_target_by_uniprotId(acc)
print(target_info)

target_chembl_id = target_info['chemblId']
bioactivities = chembl.get_target_bioactivities(str(target_chembl_id))
compound_chemblids = [
    entry['ingredient_cmpd_chemblid'] for entry in bioactivities
]
print("# of compound chemblids: %d" % len(compound_chemblids))

# NOTE(review): ``_chemblId_example`` is a private attribute of the client.
resjson = chembl.get_compounds_by_chemblId(chembl._chemblId_example)
print("Example compound retrieved by compound chemblid: \n%s" % (resjson,))
class ChemSpiderSearch(object):
    """This class uses ChemSpider and ChEMBL to identify drug name

    .. warning:: this is a draft version in dev mode

    ::

        c = ChemSpiderSearch(drug_decode)
        c.search_in_chemspider()
        c.search_from_smile_inchembl()
        df = c.find_chembl_ids()

    It happens that most of public names can be found and
    almost none of non-public are found. As expected...

    If chemspider, chembl and pubchem are empty, search for the drug name
    in chemspider.

    CHEMSPIDER search:

    - if no identifier found, the search is DROPPED
    - if 1 identifier found, we keep going using the SMILE identifier
    - if more than 1 identifier found, this is AMBIGUOUS.

    If chembl and pubchem, check with unichem
    If chembl, check smiles
    If chembl and chemspider, check smiles ? SMILES are not unique
    """

    def __init__(self, drug_decode):
        """Build the drug tables and the web-service clients.

        :param drug_decode: passed unchanged to ``DrugDecode``; two separate
            instances are created (``dd`` is read, ``dd_filled`` is written).
        """
        print("ChemSpiderSearch is still in progress, please do not use")
        self.dd = DrugDecode(drug_decode)
        self.dd_filled = DrugDecode(drug_decode)

        from bioservices.chemspider import ChemSpider
        from bioservices import ChEMBL
        from bioservices import UniChem

        try:
            print('Loading PubChem')
            from bioservices.pubchem import PubChem
            # Attribute name (``puchem``) kept as-is for compatibility.
            self.puchem = PubChem()
        except Exception:
            # Pubchem was introduced only in dec 2015
            # Best effort; KeyboardInterrupt / SystemExit still propagate.
            pass

        print('Loading ChEMBL service')
        self.chembl = ChEMBL(cache=True)

        print('Loading ChemSpider service')
        self.chemspider = ChemSpider(cache=True)

        print('Loading UniChem service')
        # in unichem db number is 22 and chembl is 1
        self.unichem = UniChem()

        print('Settings some data aliases')
        self._cs_find = self.chemspider.find
        self._cs_get = self.chemspider.GetExtendedCompoundInfo

        self.drug_ids = sorted(self.dd.df.index.values)
        self.drug_names = sorted(self.dd.df.DRUG_NAME.values)

    def filling_chembl_pubchem_using_unichem(self):
        """Fill the CHEMSPIDER column of ``dd_filled`` for unannotated drugs.

        For every drug whose CHEMSPIDER, CHEMBL and PUBCHEM cells are all
        empty, the drug name is searched in ChemSpider. A single hit is
        stored as-is; several hits are stored as the list of hits.
        """
        columns = ['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']
        pb = Progress(len(self.drug_ids))
        filled = self.dd_filled.df

        for i, drug_id in enumerate(self.drug_ids):
            entry = self.dd.df.loc[drug_id]
            # if no information is provided, we will need to get it
            # from chemspider
            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            if entry[columns].count() == 0:
                results = self._cs_find(entry.DRUG_NAME)
                # A single (row, column) write is used because chained
                # ``.loc[row].loc[col] = value`` may only modify a
                # temporary copy of the row.
                if len(results) == 1:
                    filled.at[drug_id, 'CHEMSPIDER'] = results[0]
                elif len(results) > 1:
                    # non unique: keep every candidate. The cell must be
                    # able to hold a list, hence the object dtype.
                    if filled['CHEMSPIDER'].dtype != object:
                        filled['CHEMSPIDER'] = \
                            filled['CHEMSPIDER'].astype(object)
                    filled.at[drug_id, 'CHEMSPIDER'] = results
            pb.animate(i + 1)

        # Search in chemspider systematically
        for i, drug_id in enumerate(self.drug_ids):
            entry = self.dd.df.loc[drug_id]
            # The selection is recomputed for each drug (it used to be the
            # stale value left over from the loop above) and the search
            # uses this drug's name (it used an undefined variable).
            if entry[columns].count() == 1:
                # TODO: the search result is not used yet (draft code).
                self._cs_find(entry.DRUG_NAME)
            pb.animate(i + 1)

    def find_chembl_ids(self):
        """Summarise the search results as one row per drug.

        Requires ``search_in_chemspider`` and ``search_from_smile_inchembl``
        to have been called; drugs without usable results get empty strings.

        :return: DataFrame with columns DRUG_NAME, CHEMBL_ID, CHEMSPIDER_ID,
            SMILE_CHEMBL and SMILE_CHEMSPIDER. Multiple identifiers are
            comma-separated.
        """
        # don't know how to search for a chembl id given the drug name...
        # so we use chemspider
        # self.search_in_chemspider()

        # but chemspider returns molecular information (not chembl id)
        # so given the smile string, we look back in chembl for valid entries
        # self.search_from_smile_inchembl()

        # finally, get the chembl identifiers
        lookup_errors = (AttributeError, KeyError, TypeError)
        drugs = []
        chembl_ids = []
        chemspider_ids = []
        smiles_c = []
        smiles_cs = []

        for drug in self.drug_ids:
            # Every drug contributes exactly one value to every column so
            # that the rows stay aligned even when a drug is skipped.
            try:
                entry = self.results_chembl[drug]
                chembl = ",".join([x['chemblId'] for x in entry])
                chemspider = ",".join([str(x) for x in self.results[drug]])
            except lookup_errors:
                # str() because drug identifiers are not always strings
                print('skipping' + str(drug))
                chembl = ''
                chemspider = ''
            drugs.append(drug)
            chembl_ids.append(chembl)
            chemspider_ids.append(chemspider)

            try:
                smiles_c.append(",".join(
                    [x['smiles'] for x in self.results_chembl[drug]]))
            except lookup_errors:
                smiles_c.append('')

            try:
                smiles_cs.append(self.results_chemspider[drug]['smiles'])
            except lookup_errors:
                smiles_cs.append('')

        df = pd.DataFrame(
            [drugs, chembl_ids, chemspider_ids, smiles_c, smiles_cs],
            index=['DRUG_NAME', 'CHEMBL_ID', 'CHEMSPIDER_ID',
                   'SMILE_CHEMBL', 'SMILE_CHEMSPIDER'])
        return df.T

    def get_chemspider_ids(self, drug_name):
        """Return what the ChemSpider search gives for ``drug_name``."""
        return self._cs_find(drug_name)

    def search_in_chemspider(self):
        """Search every drug name in ChemSpider.

        Fills ``self.results`` (drug id -> list of ChemSpider identifiers)
        and the ``CHEMSPIDER_SEARCHED`` column of ``dd_filled``. A failed
        search is reported and recorded as an empty list.
        """
        # Fill results attribute as a dictionary. Keys being the drug id
        # and values are list of chemspider identifiers
        #
        # SB52334 --> SB-52334
        pb = Progress(len(self.dd))
        self.results = {}
        searched = []

        for i, drug in enumerate(self.dd.df.index):
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                print("This drug index (%s) / drug name (%s) was not found"
                      % (drug, drug_name))
                res = []
            self.results[drug] = res
            searched.append(res)
            pb.animate(i + 1)

        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = searched

    def search_from_smile_inchembl(self):
        """Look up in ChEMBL the SMILES of every ChemSpider hit.

        Requires ``search_in_chemspider`` to have filled ``self.results``.
        Fills ``self.results_chembl`` (drug id -> list of ChEMBL compounds)
        and ``self.results_chemspider`` (drug id -> ChemSpider entry of the
        last identifier processed for that drug).
        """
        pb = Progress(len(self.drug_ids))
        self.results_chembl = {}
        self.results_chemspider = {}

        for i, drug in enumerate(self.drug_ids):
            self.results_chembl[drug] = []
            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(
                            res_chembl['compounds'])
                    except (KeyError, TypeError):
                        # No usable 'compounds' entry in the answer:
                        # nothing to record for this identifier.
                        pass
            pb.animate(i + 1)
from bioservices import UniProt from bioservices import PDB from bioservices.apps.fasta import FASTA from bioservices import ChEMBL import os # Leos from uniprot id to the protein name f = FASTA() u = UniProt(cache=True, verbose=False) s = PDB() c = ChEMBL(verbose=False) """ pdb chain to uniprot id: ftp.ebi.ac.uk/pub/databases/msd/sifts/csv/pdb_chain_uniprot.csv import requests from xml.etree.ElementTree import fromstring pdb_id = '4hhb.A' pdb_mapping_url = 'http://www.rcsb.org/pdb/rest/das/pdb_uniprot_mapping/alignment' uniprot_url = 'http://www.uniprot.org/uniprot/{}.xml' def get_uniprot_accession_id(response_xml): root = fromstring(response_xml) return next( el for el in root.getchildren()[0].getchildren() if el.attrib['dbSource'] == 'UniProt' ).attrib['dbAccessionId'] def get_uniprot_protein_name(uniport_id): uinprot_response = requests.get( uniprot_url.format(uniport_id) ).text
def run(days=180):
    """Refresh ChEMBL bioactivities, then standardize, normalize and flag them.

    Run as::

        $ python manage.py runscript update_activities

    By default, updates bioactivities of compounds that were never updated
    or updates 180 days ago. To force updating bioactivities for all
    compounds, give 0 days argument as follows::

        $ python manage.py runscript update_activities --script-args=0

    Stages, in order:

    1. For every compound that is due, fetch its bioactivities from ChEMBL
       and create the missing parent compound, target, assay and
       bioactivity rows.
    2. Recompute the standardized name / units / value columns in SQL.
    3. Store a normalized value (value divided by the largest absolute
       value of the same bioactivity type and target).
    4. Flag entries far above the median ('R') and entries that are exactly
       1000 times another entry of the same group ('T').

    :param days: minimum age, in days, of ``last_update`` for a compound to
        be refreshed; a value ``int()`` rejects with ValueError falls back
        to 180.
    """
    try:
        days = int(days)
    except ValueError:
        days = 180

    chembl = ChEMBLdb()
    max_age = datetime.timedelta(days)

    #for table, func in [
    #        (Compound, chembl.get_compounds_activities),
    #        (Target, chembl.get_target_bioactivities),
    #        (Assay, chembl.get_assay_bioactivities), ]:

    count = skip = error = 0

    # will iterate over all compounds one-by-one
    for compound in Compound.objects.all():

        # if no updates were made, last_update is None
        if (compound.last_update is not None and
                (datetime.date.today() - compound.last_update) < max_age):
            continue

        try:
            acts = chembl.get_compounds_activities(str(compound.chemblid))
        except BioServicesError:
            # Skipped without touching last_update.
            continue

        try:
            for act in acts['bioactivities']:
                # Keep only the known fields, renamed through FIELDS.
                act = {FIELDS[key]: value
                       for key, value in act.items() if key in FIELDS}

                tid, aid, cid, pid = (
                    act['target'],
                    act['assay'],
                    act['compound'],
                    act['parent_compound']
                )

                try:
                    parent = Compound.objects.get(chemblid=pid)
                except Compound.DoesNotExist:
                    try:
                        # Uses implemented methods in lieu of Bioservices
                        parent_compound_data = get_chembl_compound_data(pid)
                        parent_compound_data.update(
                            get_drugbank_data_from_chembl_id(pid))
                        # pop() removes the key when present and tolerates
                        # its absence; get() followed by del raised
                        # KeyError when there was no 'targets' entry.
                        parent_compound_targets = parent_compound_data.pop(
                            'targets', [])

                        parent = Compound.objects.create(
                            locked=True, **parent_compound_data)

                        for target in parent_compound_targets:
                            CompoundTarget.objects.create(
                                compound=parent, **target)

                        print(("Added Compound:", parent.name))
                    except ValueError:
                        error += 1
                        continue

                try:
                    target = Target.objects.get(chemblid=tid)
                except Target.DoesNotExist:
                    try:
                        target = Target.objects.create(
                            locked=True, **chembl_target(tid))
                    except ValueError:
                        error += 1
                        continue

                try:
                    assay = Assay.objects.get(chemblid=aid)
                except Assay.DoesNotExist:
                    try:
                        assay = Assay.objects.create(
                            locked=True, **chembl_assay(aid))
                    except ValueError:
                        error += 1
                        continue

                try:
                    Bioactivity.objects.get(
                        target=target, assay=assay, compound=compound)
                except Bioactivity.DoesNotExist:
                    # Replace the ChEMBL ids by the model instances.
                    (act['target'], act['assay'], act['compound'],
                     act['parent_compound']) = (
                        target, assay, compound, parent
                    )
                    try:
                        Bioactivity.objects.create(locked=True, **act)
                    except ValueError:
                        error += 1
                    except Exception:
                        # Dump the string fields with their lengths to help
                        # diagnose the failure, then re-raise unchanged.
                        for key, val in act.items():
                            if isinstance(val, str):
                                print(('{}: ({}) {}'.format(
                                    key, len(val), val)))
                        raise
                    else:
                        count += 1
                else:
                    skip += 1
        except Exception:
            # Best effort per compound; KeyboardInterrupt / SystemExit are
            # no longer swallowed.
            print(("An error occured:", compound.name, acts))

        compound.last_update = datetime.date.today()
        compound.save()

    print(('{} bioactivities were added, {} were found in the database, and '
           '{} failed due to value errors.'.format(count, skip, error)))

    print('Updating bioactivity units...')

    # The context manager closes the cursor once the statements have run.
    with connection.cursor() as cursor:
        cursor.execute(
            '''
            UPDATE bioactivities_bioactivity
            SET standardized_units = '',
                standard_name = '',
                standardized_value = NULL;

            --no mass is needed in data conversion
            UPDATE public.bioactivities_bioactivity as v
            SET standard_name = s.standard_name,
                standardized_units=s.standard_unit,
                standardized_value=value*scale_factor
            FROM public.bioactivities_bioactivitytype as s
            WHERE v.bioactivity_type = s.chembl_bioactivity
              and v.units=s.chembl_unit
              and s.mass_flag='N';

            --mass is needed in data conversion
            UPDATE public.bioactivities_bioactivity as v
            SET standard_name = s.standard_name,
                standardized_units=s.standard_unit,
                standardized_value=value*scale_factor/c.molecular_weight
            FROM public.bioactivities_bioactivitytype as s,
                 public.compounds_compound as c
            WHERE v.bioactivity_type = s.chembl_bioactivity
              and v.units=s.chembl_unit
              and v.compound_id =c.id
              and s.mass_flag='Y';
            '''
        )

    print('Units updated')

    print('Normalizing values')

    # Dicts are used as ordered sets of distinct keys.
    bio_types = {bio.standard_name: True for bio in Bioactivity.objects.all()}
    for bio_type in bio_types:
        targets = {
            bio.target: True for bio in Bioactivity.objects.filter(
                standard_name=bio_type
            ).prefetch_related('target')
        }
        for target in targets:
            current_bio = Bioactivity.objects.filter(
                standard_name=bio_type,
                target_id=target.id,
                standardized_value__isnull=False
            ).prefetch_related('target')

            bio_pk = [bio.id for bio in current_bio]
            bio_value = np.array(
                [bio.standardized_value for bio in current_bio])
            if len(bio_pk) > 0 and len(bio_value) > 0:
                # NOTE(review): an all-zero group divides by zero here.
                bio_value /= np.max(np.abs(bio_value), axis=0)
                for index, pk in enumerate(bio_pk):
                    try:
                        Bioactivity.objects.filter(pk=pk).update(
                            normalized_value=bio_value[index]
                        )
                    except Exception:
                        print(('Update of bioactivity {} failed'.format(pk)))

    # Flag questionable entries
    print('Flagging questionable entries...')

    # Remove old flags in case they have become outdated
    # (medians change and so on)
    Bioactivity.objects.all().update(data_validity='')

    total = 0

    all_chembl = Bioactivity.objects.all().prefetch_related(
        'compound', 'target'
    ).filter(
        standardized_value__isnull=False
    )

    bio_types = {bio.standard_name: True for bio in all_chembl}
    bio_compounds = {bio.compound: True for bio in all_chembl}
    bio_targets = {bio.target: True for bio in all_chembl}

    # Group the entries by (standard name, compound id, target id).
    chembl_entries = {}

    for entry in all_chembl:
        if entry.target:
            key = '|'.join([entry.standard_name, str(entry.compound.id),
                            str(entry.target.id)])
        else:
            key = '|'.join([entry.standard_name, str(entry.compound.id),
                            'None'])
        chembl_entries.setdefault(key, []).append(entry)

    # ChEMBL contains negative values!
    # TODO Needs revision
    for bio_type in bio_types:
        for target in bio_targets:
            for compound in bio_compounds:
                if not (bio_type and target and compound):
                    continue

                # ``target`` is known to be set here, so the key always
                # carries its id (the former 'None' branch was unreachable).
                current_bio = chembl_entries.get(
                    '|'.join([bio_type, str(compound.id), str(target.id)]),
                    [])
                if not current_bio:
                    continue

                bio_pk = [bio.id for bio in current_bio]
                bio_value = np.array(
                    [bio.standardized_value for bio in current_bio])

                # Shift values by the minimum to avoid problems with
                # negative values
                bio_value = np.array(bio_value) + np.abs(np.min(bio_value)) + 1

                bio_median = np.median(bio_value)
                flag_threshold = bio_median * 100

                for index, pk in enumerate(bio_pk):
                    if bio_value[index] > flag_threshold:
                        this_bio = Bioactivity.objects.get(pk=bio_pk[index])
                        #this_bio.notes = 'Flagged'
                        # Flag data validity for "Out of Range"
                        this_bio.data_validity = 'R'
                        this_bio.save()
                        print((bio_pk[index], bio_value[index], 'vs',
                               bio_median))
                        total += 1

                # Check for possible transcription errors
                # (1000-fold error mistaking uM for nM)
                for index, pk in enumerate(bio_pk):
                    thousand_fold = np.where(
                        bio_value == bio_value[index] * 1000)[0]
                    for error_index in thousand_fold:
                        this_bio = Bioactivity.objects.get(
                            pk=bio_pk[error_index])
                        # Entries already flagged 'R' keep that flag.
                        if not this_bio.data_validity:
                            total += 1
                            this_bio.data_validity = 'T'
                            this_bio.save()
                            print((bio_pk[error_index],
                                   bio_value[error_index], 'thousand fold'))

    print(total)
class ChemSpiderSearch(object):
    """This class uses ChemSpider and ChEMBL to identify drug name

    .. warning:: this is a draft version in dev mode

    ::

        c = ChemSpiderSearch(drug_decode)
        c.search_in_chemspider()
        c.search_from_smile_inchembl()
        df = c.find_chembl_ids()

    It happens that most of public names can be found and
    almost none of non-public are found. As expected...

    If chemspider, chembl and pubchem are empty, search for the drug name
    in chemspider.

    CHEMSPIDER search:

    - if no identifier found, the search is DROPPED
    - if 1 identifier found, we keep going using the SMILE identifier
    - if more than 1 identifier found, this is AMBIGUOUS.

    If chembl and pubchem, check with unichem
    If chembl, check smiles
    If chembl and chemspider, check smiles ? SMILES are not unique
    """

    def __init__(self, drug_decode):
        """Build the drug tables and the web-service clients.

        :param drug_decode: passed unchanged to ``DrugDecode``; two separate
            instances are created (``dd`` is read, ``dd_filled`` is written).
        """
        print("ChemSpiderSearch is still in progress, please do not use")
        self.dd = DrugDecode(drug_decode)
        self.dd_filled = DrugDecode(drug_decode)

        from bioservices.chemspider import ChemSpider
        from bioservices import ChEMBL
        from bioservices import UniChem

        try:
            print('Loading PubChem')
            from bioservices.pubchem import PubChem
            # Attribute name (``puchem``) kept as-is for compatibility.
            self.puchem = PubChem()
        except Exception:
            # Pubchem was introduced only in dec 2015
            # Best effort; KeyboardInterrupt / SystemExit still propagate.
            pass

        print('Loading ChEMBL service')
        self.chembl = ChEMBL(cache=True)

        print('Loading ChemSpider service')
        self.chemspider = ChemSpider(cache=True)

        print('Loading UniChem service')
        # in unichem db number is 22 and chembl is 1
        self.unichem = UniChem()

        print('Settings some data aliases')
        self._cs_find = self.chemspider.find
        self._cs_get = self.chemspider.GetExtendedCompoundInfo

        self.drug_ids = sorted(self.dd.df.index.values)
        self.drug_names = sorted(self.dd.df.DRUG_NAME.values)

    def filling_chembl_pubchem_using_unichem(self):
        """Fill the CHEMSPIDER column of ``dd_filled`` for unannotated drugs.

        For every drug whose CHEMSPIDER, CHEMBL and PUBCHEM cells are all
        empty, the drug name is searched in ChemSpider. A single hit is
        stored as-is; several hits are stored as the list of hits.
        """
        columns = ['CHEMSPIDER', 'CHEMBL', 'PUBCHEM']
        pb = Progress(len(self.drug_ids))
        filled = self.dd_filled.df

        for i, drug_id in enumerate(self.drug_ids):
            # Label-based ``.loc`` replaces ``.ix``, which no longer exists
            # in current pandas.
            entry = self.dd.df.loc[drug_id]
            # if no information is provided, we will need to get it
            # from chemspider
            # From the database, when chembl is provided, it is unique
            # same for chemspider and pubchem and CAS
            if entry[columns].count() == 0:
                results = self._cs_find(entry.DRUG_NAME)
                # A single (row, column) write is used because a chained
                # row-then-column assignment may only modify a temporary
                # copy of the row.
                if len(results) == 1:
                    filled.at[drug_id, 'CHEMSPIDER'] = results[0]
                elif len(results) > 1:
                    # non unique: keep every candidate. The cell must be
                    # able to hold a list, hence the object dtype.
                    if filled['CHEMSPIDER'].dtype != object:
                        filled['CHEMSPIDER'] = \
                            filled['CHEMSPIDER'].astype(object)
                    filled.at[drug_id, 'CHEMSPIDER'] = results
            pb.animate(i + 1)

        # Search in chemspider systematically
        for i, drug_id in enumerate(self.drug_ids):
            entry = self.dd.df.loc[drug_id]
            # The selection is recomputed for each drug (it used to be the
            # stale value left over from the loop above) and the search
            # uses this drug's name (it used an undefined variable).
            if entry[columns].count() == 1:
                # TODO: the search result is not used yet (draft code).
                self._cs_find(entry.DRUG_NAME)
            pb.animate(i + 1)

    def find_chembl_ids(self):
        """Summarise the search results as one row per drug.

        Requires ``search_in_chemspider`` and ``search_from_smile_inchembl``
        to have been called; drugs without usable results get empty strings.

        :return: DataFrame with columns DRUG_NAME, CHEMBL_ID, CHEMSPIDER_ID,
            SMILE_CHEMBL and SMILE_CHEMSPIDER. Multiple identifiers are
            comma-separated.
        """
        # don't know how to search for a chembl id given the drug name...
        # so we use chemspider
        # self.search_in_chemspider()

        # but chemspider returns molecular information (not chembl id)
        # so given the smile string, we look back in chembl for valid entries
        # self.search_from_smile_inchembl()

        # finally, get the chembl identifiers
        lookup_errors = (AttributeError, KeyError, TypeError)
        drugs = []
        chembl_ids = []
        chemspider_ids = []
        smiles_c = []
        smiles_cs = []

        for drug in self.drug_ids:
            # Every drug contributes exactly one value to every column so
            # that the rows stay aligned even when a drug is skipped.
            try:
                entry = self.results_chembl[drug]
                chembl = ",".join([x['chemblId'] for x in entry])
                chemspider = ",".join([str(x) for x in self.results[drug]])
            except lookup_errors:
                # str() because drug identifiers are not always strings
                print('skipping' + str(drug))
                chembl = ''
                chemspider = ''
            drugs.append(drug)
            chembl_ids.append(chembl)
            chemspider_ids.append(chemspider)

            try:
                smiles_c.append(",".join(
                    [x['smiles'] for x in self.results_chembl[drug]]))
            except lookup_errors:
                smiles_c.append('')

            try:
                smiles_cs.append(self.results_chemspider[drug]['smiles'])
            except lookup_errors:
                smiles_cs.append('')

        df = pd.DataFrame(
            [drugs, chembl_ids, chemspider_ids, smiles_c, smiles_cs],
            index=['DRUG_NAME', 'CHEMBL_ID', 'CHEMSPIDER_ID',
                   'SMILE_CHEMBL', 'SMILE_CHEMSPIDER'])
        return df.T

    def get_chemspider_ids(self, drug_name):
        """Return what the ChemSpider search gives for ``drug_name``."""
        return self._cs_find(drug_name)

    def search_in_chemspider(self):
        """Search every drug name in ChemSpider.

        Fills ``self.results`` (drug id -> list of ChemSpider identifiers)
        and the ``CHEMSPIDER_SEARCHED`` column of ``dd_filled``. A failed
        search is reported and recorded as an empty list.
        """
        # Fill results attribute as a dictionary. Keys being the drug id
        # and values are list of chemspider identifiers
        #
        # SB52334 --> SB-52334
        pb = Progress(len(self.dd))
        self.results = {}
        searched = []

        for i, drug in enumerate(self.dd.df.index):
            drug_name = self.dd.df.loc[drug].DRUG_NAME
            try:
                res = self._cs_find(drug_name)
            except Exception:
                print("This drug index (%s) / drug name (%s) was not found"
                      % (drug, drug_name))
                res = []
            self.results[drug] = res
            searched.append(res)
            pb.animate(i + 1)

        self.dd_filled.df['CHEMSPIDER_SEARCHED'] = searched

    def search_from_smile_inchembl(self):
        """Look up in ChEMBL the SMILES of every ChemSpider hit.

        Requires ``search_in_chemspider`` to have filled ``self.results``.
        Fills ``self.results_chembl`` (drug id -> list of ChEMBL compounds)
        and ``self.results_chemspider`` (drug id -> ChemSpider entry of the
        last identifier processed for that drug).
        """
        pb = Progress(len(self.drug_ids))
        self.results_chembl = {}
        self.results_chemspider = {}

        for i, drug in enumerate(self.drug_ids):
            self.results_chembl[drug] = []
            if self.results[drug]:
                for chemspider_id in self.results[drug]:
                    chemspider_entry = self._cs_get(chemspider_id)
                    self.results_chemspider[drug] = chemspider_entry
                    smile = chemspider_entry['smiles']
                    # now search in chembl
                    res_chembl = self.chembl.get_compounds_by_SMILES(smile)
                    try:
                        self.results_chembl[drug].extend(
                            res_chembl['compounds'])
                    except (KeyError, TypeError):
                        # No usable 'compounds' entry in the answer:
                        # nothing to record for this identifier.
                        pass
            pb.animate(i + 1)