def mapping(self, tax_id):
    """Build the Ensembl <-> Entrez GeneID map for one organism.

    The Ensembl core database for ``tax_id`` is queried three times
    (gene/transcript/protein isoforms, gene cross-references, transcript
    cross-references) and the results are left-joined onto the Entrez rows.

    Args:
        tax_id: taxonomy id; resolved to an Ensembl database name through
            ``EnsemblDownload.get_ens_dbname_by_taxid``.

    Returns:
        A DataFrame relating ENSG, ENST, ENSP, UCSC and GeneID, or ``None``
        when the organism is unsupported or any query/merge step fails.
    """
    import traceback

    ens_db = EnsemblDownload.get_ens_dbname_by_taxid(tax_id)
    if ens_db is None:
        print("Unsupported organism: %s" % tax_id)
        return None
    print("Creating ensembl map for %s" % tax_id)
    con = self.get_ensembl_connection(ens_db)
    try:
        # Get all the isoforms available in Ensembl (ENSG,ENST,ENSP map.)
        t_gt = db.from_sql(
            con,
            "select t1.ENSG, t1.ENST, t2.stable_id ENSP from "
            "(SELECT g.stable_id ENSG,t.stable_id ENST, t.transcript_id tid "
            "from gene g,transcript t where g.gene_id=t.gene_id) t1 "
            "left join translation t2 on t1.tid = t2.transcript_id")

        # ENSG to EntrezGeneID map.  The query returns synonym ('SYN'),
        # display-label ('DIS') and primary-accession ('PRI') rows; only the
        # EntrezGene primary accessions are kept below.
        t_g = db.from_sql(
            con, """
            SELECT g.stable_id ENSG,syn.synonym SYMBOL,xdb.db_name DB,'SYN' TYPE
            FROM object_xref oxr, xref x, external_db xdb,external_synonym syn,gene g
            WHERE oxr.xref_id=x.xref_id
            AND x.xref_id=syn.xref_id
            AND x.external_db_id=xdb.external_db_id
            AND oxr.ensembl_id=g.gene_id
            AND oxr.ensembl_object_type='Gene'
            and syn.synonym is not null
            UNION
            SELECT g.stable_id ENSG,x.display_label SYMBOL,xdb.db_name DB,'DIS' TYPE
            FROM object_xref oxr, xref x,external_db xdb, gene g
            WHERE oxr.xref_id=x.xref_id
            AND x.external_db_id=xdb.external_db_id
            AND oxr.ensembl_id=g.gene_id
            AND oxr.ensembl_object_type='Gene'
            and x.display_label is not null
            UNION
            SELECT g.stable_id ENSG,x.dbprimary_acc SYMBOL,xdb.db_name DB,'PRI' Type
            FROM object_xref oxr, xref x,external_db xdb, gene g
            WHERE oxr.xref_id=x.xref_id
            AND x.external_db_id=xdb.external_db_id
            AND oxr.ensembl_id=g.gene_id
            AND oxr.ensembl_object_type='Gene'
            and x.dbprimary_acc is not null
            AND (xdb.db_name='EntrezGene')""")
        t_g = t_g[(t_g.DB == 'EntrezGene') & (t_g.TYPE == 'PRI')].copy()
        t_g.drop(['DB', 'TYPE'], axis=1, inplace=True)
        # rename2 is a project extension; presumably renames columns in place.
        t_g.rename2({'SYMBOL': 'GeneID'})
        t_g['GeneID'] = t_g['GeneID'].astype(str)

        # UCSC to ENST map.
        t_t = db.from_sql(
            con,
            "SELECT transcript.stable_id ENST, xref.display_label ACCESSION,"
            "external_db.db_name DB "
            "FROM transcript, object_xref, xref,external_db "
            "WHERE transcript.transcript_id = object_xref.ensembl_id "
            "AND object_xref.ensembl_object_type = 'Transcript' "
            "AND object_xref.xref_id = xref.xref_id "
            "AND xref.external_db_id = external_db.external_db_id")
        t_ucsc = t_t[t_t.DB == 'UCSC'].copy()
        t_ucsc.drop(['DB'], axis=1, inplace=True)
        t_ucsc.rename2({'ACCESSION': 'UCSC'})

        t_g = t_g.merge(t_gt, on='ENSG', how='left')
        t_g = t_g.merge(t_ucsc, on='ENST', how='left')
        # t_g is a complete map between ENSG,ENST,ENSP,UCSC and EntrezGeneID
        return t_g
    except Exception:
        # Best effort: report the failure (with its cause) and let the caller
        # carry on.  KeyboardInterrupt/SystemExit are no longer swallowed.
        print("Error in creating ensembl map for %s" % tax_id)
        traceback.print_exc()
        return None
def stat_js(rpath):
    """Write per-type record counts of the latest build as a JavaScript file.

    Reads every row of ``<SyncDB.DATABASE>.statistics``, keeps the rows of
    the most recent ``history`` value and sums ``total`` per ``type_id`` for
    the annotation, gid2terms, term2gids and interaction tables.  The result
    is written to ``<rpath>gp_description_stats.js`` and copied to
    ``<rpath>gp_description_stats_<latest build>.js``.

    Args:
        rpath: output directory prefix; file names are appended to it
            verbatim, so it has to end with a path separator.

    Raises:
        ValueError: if the statistics table is empty (``max`` of nothing).
    """
    import shutil

    con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
    # get last update date
    dt = db.from_sql(
        con, "Select * FROM {0}.statistics order by history desc".format(
            SyncDB.DATABASE))
    dt['history'] = dt['history'].apply(str)
    latest_build_date = max(dt['history'])
    dt = dt[dt['history'] == latest_build_date]

    # (key prefix used in the JS object, table_name in the statistics table)
    tables = [('i_ann_', 'annotation'), ('i_gene_', 'gid2terms'),
              ('i_term_', 'term2gids'), ('i_interaction_', 'interaction')]
    totals = []
    for prefix, table_name in tables:
        dt_type = dt[dt['table_name'] == table_name]
        # Group by a scalar key: a list key makes recent pandas yield tuple
        # keys, which would end up as "(1,)" in the generated JS.
        for type_id, group in dt_type.groupby('type_id'):
            totals.append("'%s%s':%s" % (prefix, type_id, group['total'].sum()))

    jquery_str = ("window.descriptionStatistics ={'latest_build':'" +
                  latest_build_date + "','ds_counts':{\n" +
                  ',\n'.join(totals) + "}}")

    js_path = rpath + "gp_description_stats.js"
    with open(js_path, "w") as report_file:
        report_file.write(jquery_str)
    # Copy only after the file is closed (fully flushed); no shell involved,
    # so spaces in the path or in the build date are harmless.
    shutil.copyfile(
        js_path, rpath + 'gp_description_stats_' + latest_build_date + '.js')
def get_pubmed(self):
    """Collapse PubMed ids to one row per consecutive (LINK_ID, ORG) run.

    The query is ordered by link_id, org, so equal pairs arrive as
    consecutive rows.  The aggregation is done in Python because doing it in
    the database (stragg) was judged probably too slow.

    Returns:
        A DataFrame with columns LINK_ID, ORG and PUBMED, where PUBMED holds
        the distinct PubMed ids of the run joined by "|" in sorted order
        (the ordering used to be arbitrary set order).
    """
    from itertools import groupby

    s_sql = "select link_id,rpo.pubmed,org from regulation_rels_org rro join regulation_pubmed_org rpo on rro.koid=rpo.koid order by link_id,org"
    t = db.from_sql(self.con, s_sql)

    # Pull the columns out once; per-cell DataFrame access in a Python loop
    # is orders of magnitude slower and relied on the removed `.ix` indexer.
    rows = zip(t['LINK_ID'].tolist(), t['ORG'].tolist(),
               t['PUBMED'].astype(str).tolist())

    link_ids = []
    orgs = []
    pubmeds = []
    # itertools.groupby only merges *adjacent* equal keys, which matches the
    # original "start a new record whenever LINK_ID or ORG changes" scan.
    for (link_id, org), run in groupby(rows, key=lambda r: r[:2]):
        link_ids.append(link_id)
        orgs.append(org)
        pubmeds.append("|".join(sorted(set(r[2] for r in run))))

    return pd.DataFrame(data={
        'LINK_ID': link_ids,
        'ORG': orgs,
        'PUBMED': pubmeds
    })
def get_mechanism(self, l_filter=True):
    """Load the regulation mechanisms lookup table.

    Args:
        l_filter: when true, keep only rows with DIRECT > 0 and drop the
            mechanism ids 10, 14, 31 and 11.

    Returns:
        A DataFrame of mechanisms (id, label, name, direct), ordered by name.
    """
    excluded_ids = [10, 14, 31, 11]
    mechanisms = db.from_sql(
        self.con,
        "select id, abbr as mechanism_label, tmp as mechanism_name, direct from regulation_mechanisms order by tmp"
    )
    if not l_filter:
        return mechanisms
    direct_only = mechanisms[mechanisms.DIRECT > 0]
    wanted = ~direct_only.ID.isin(excluded_ids)
    return direct_only[wanted].copy()
def get_variations(self):
    """Build VARIATIONS_ENSEMBL annotation records for human genes.

    Variation/phenotype rows are read from the cached CSV
    ``<download dir>/ensembl_files/ensembl_variations.csv`` when it exists;
    otherwise they are queried from the Ensembl human variation database and
    the CSV is written for the next run.  Rows are grouped by gene symbol and
    only symbols present in ``GPUtils.get_sym2gid_map()["sym2gid"]`` are kept.

    Returns:
        A list of dicts with keys gid, content, annotation_field1, type_name
        and tax_id; ``content`` joins one
        "[variation] description{significance}(source)" entry per row
        with ";".
    """
    ensembl_file = SyncDB.DOWNLOAD_DIR(
    ) + "/ensembl_files/ensembl_variations.csv"
    print("Processing variations")
    if os.path.exists(ensembl_file):
        t = util.read_csv(ensembl_file)
    else:
        con = self.get_ensembl_connection(
            EnsemblDownload.get_ensembl_latest_version(
                'homo_sapiens_variation_{0}_'.format(
                    EnsemblDownload.ENSEMBL_VERSION)))
        query = (
            "select distinct pf.object_id as variation_name,p.description,"
            "v.clinical_significance,vg.gene_name, s.name as source_name "
            "from source s, phenotype_feature pf, phenotype p, variation v, "
            "variation_genename vg "
            "where pf.type ='Variation' and pf.phenotype_id = p.phenotype_id "
            "and v.name=pf.object_id and v.variation_id=vg.variation_id "
            "and v.source_id=s.source_id "
            "and v.clinical_significance in ('likely pathogenic','pathogenic',"
            "'risk factor','association','drug response')")
        t = db.from_sql(con, query, params=[])
        t.to_csv(ensembl_file, index=False)

    # Named sym2gid rather than `map` so the builtin is not shadowed.
    sym2gid = GPUtils.get_sym2gid_map()["sym2gid"]
    data = []
    # Scalar group key: a list key makes recent pandas yield 1-tuples, and
    # the symbol lookup below would then silently never match.
    for gene, rows in t.groupby('gene_name'):
        if gene not in sym2gid:
            continue
        content = [
            '[' + r['variation_name'] + '] ' + r['description'] + '{' +
            r['clinical_significance'] + '}(' + r['source_name'] + ')'
            for _, r in rows.iterrows()
        ]
        data.append({
            'gid': sym2gid[gene],
            'content': ';'.join(content),
            'annotation_field1': gene,
            'type_name': 'VARIATIONS_ENSEMBL',
            'tax_id': '9606'
        })
    return data
def get_annotation_martdb(self, tax_id, a_type, is_boolean=False):
    """Build annotation records of one type for one organism from BioMart.

    The term table is read from the cached CSV
    ``<download dir>/ensembl_files/biomart_<a_type>_<tax_id>.csv`` when it
    exists; otherwise the mart query is run, de-duplicated and cached.
    Rows are grouped by ``gid``; each group becomes at most one record.

    Args:
        tax_id: taxonomy id, resolved through ``self.get_dbname_by_taxid``.
        a_type: annotation type; selects the query and is stored as
            ``type_name`` in every record.
        is_boolean: when true, the content is just "Yes" for any gene with
            at least one non-null term instead of the joined term list.

    Returns:
        A list of dicts (gid, content, annotation_field1, type_name, tax_id),
        or ``None`` when the organism is unknown or loading the data fails.
    """
    import traceback

    db_name = self.get_dbname_by_taxid(tax_id)
    if db_name is None:
        return None
    cache_file = SyncDB.DOWNLOAD_DIR(
    ) + "/ensembl_files/biomart_%s_%s.csv" % (a_type, tax_id)
    print("Running query to get %s for %s from martdb" % (a_type, tax_id))
    query = self.get_annotation_mart_query(a_type, db_name)
    con = self.get_biomart_connection()
    try:
        if os.path.exists(cache_file):
            df = util.read_csv(cache_file)
        else:
            df = db.from_sql(con, query).drop_duplicates()
            df.to_csv(cache_file, index=False)
    except Exception:
        # Best effort: report the cause and let the caller continue.
        print("error in getting %s data for %s" % (a_type, tax_id))
        traceback.print_exc()
        return None

    data = []
    # Scalar group key so `gid` is a plain value on every pandas version.
    for gid, grow in df.groupby('gid'):
        entries = []
        seen = set()
        for term, description in zip(grow['term'], grow['description']):
            if pd.isnull(term):
                # A row without a term contributes nothing.
                continue
            if pd.isnull(description):
                entry = '%s' % term
            else:
                entry = '[%s] %s' % (term, description)
            # Order-preserving de-duplication.
            if entry not in seen:
                seen.add(entry)
                entries.append(entry)

        if is_boolean:
            content = "Yes" if entries else ''
        else:
            content = ';'.join(entries)
        if content != '':
            data.append({
                'gid': gid,
                'content': content,
                # Gene label of the group's last row (what the original
                # picked up through its leaked loop index).
                'annotation_field1': grow['gene'].iloc[-1],
                'type_name': a_type,
                'tax_id': tax_id
            })
    return data
def get_type_col_value(self):
    """Return the type column value for ``self.type_name`` as a string.

    The value is looked up once with ``self.get_type_col_value_sql()``
    (parameterised by ``self.type_name``) and cached on
    ``self.type_col_value``; later calls return the cached value.

    Raises:
        IndexError: if the lookup query returns no rows.
    """
    if self.type_col_value is None:
        con = self.get_connection()
        t = db.from_sql(con,
                        self.get_type_col_value_sql(),
                        params=[self.type_name])
        # First cell of the result.  str() works for numpy and plain Python
        # scalars alike; `.ix` no longer exists in current pandas.
        self.type_col_value = str(t.iloc[0, 0])
    return self.type_col_value
def history():
    """Record today's row counts in ``<SyncDB.DATABASE>.statistics``.

    Any rows already stored for the current date are deleted first, then one
    row per (table, type, tax_id) count is inserted for gid2source_id,
    annotation, gid2terms, homologene, term, term2gids, term2term and
    interaction, stamped with CURDATE() as ``history``.
    """
    database = SyncDB.DATABASE
    con = db.get_con(SyncDB.CONNECTION_ID, db=database)

    # Make the snapshot re-runnable within the same day.
    delete_sql = "DELETE FROM {0}.statistics where history = CURDATE()"
    db.from_sql(con, delete_sql.format(database))

    insert_sql = """
        INSERT INTO {0}.statistics
        SELECT a.*, CURDATE() AS history FROM(
            SELECT 'gid2source_id' AS table_name, it.display_name AS type_name, it.id_type_id as type_id, it.ds_name as ds, gs.tax_id, COUNT(*) AS total
            FROM {0}.gid2source_id gs , {0}.id_type it
            WHERE gs.id_type_id = it.id_type_id
            GROUP BY gs.id_type_id, gs.tax_id
            UNION ALL
            SELECT 'annotation' AS table_name, a_t.display_name AS type_name, a_t.annotation_type_id as type_id, a_t.ds_name as ds, a.tax_id, COUNT(*) AS total
            FROM {0}.annotation a, {0}.annotation_type a_t
            WHERE a.annotation_type_id = a_t.annotation_type_id
            GROUP BY a.annotation_type_id, a.tax_id
            UNION ALL
            SELECT 'gid2terms' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, gt.tax_id , COUNT(*)
            FROM {0}.gid2terms gt, {0}.term_category tc
            where tc.term_category_id = gt.term_category_id
            GROUP BY tc.term_category_id, gt.tax_id
            UNION ALL
            SELECT 'homologene' AS table_name, 'Homologene' AS type_name, 1 as type_id, 'NCBI' as ds, hg.tax_id, COUNT(*) AS total
            FROM {0}.homologene hg
            group by hg.tax_id
            UNION ALL
            SELECT 'term' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, NULL as tax_id, COUNT(*) AS total
            FROM {0}.term t, {0}.term_category tc
            WHERE t.term_category_id = tc.term_category_id
            GROUP BY t.term_category_id
            UNION ALL
            SELECT 'term2gids' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, gt.tax_id, COUNT(*) as total
            FROM {0}.term2gids gt, {0}.term_category tc
            where tc.term_category_id = gt.term_category_id
            GROUP BY tc.term_category_id, gt.tax_id
            UNION ALL
            SELECT 'term2term' AS table_name, 'Term relations' AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, NULL as tax_id, COUNT(*) AS total
            FROM {0}.term2term tt, {0}.term_category tc
            WHERE tt.term_category_id = tc.term_category_id
            GROUP BY tt.term_category_id
            UNION ALL
            SELECT 'interaction' AS table_name, i_t.interaction_type_name AS type_name, i_t.interaction_type_id as type_id, i_t.ds_name as ds, i.tax_id_A as tax_id, COUNT(*) AS total
            FROM {0}.interaction i, {0}.interaction_type i_t
            WHERE i.interaction_type_id = i_t.interaction_type_id
            GROUP BY i.interaction_type_id, i.tax_id_A
        )a
        order by a.table_name, a.ds, a.type_name, a.tax_id
        """
    db.from_sql(con, insert_sql.format(database))
def report_js(rpath):
    """Dump the whole statistics table as a JavaScript assignment.

    Writes ``window.buildLogStatistics=<records JSON>`` to
    ``<rpath>gp_stats.js`` and copies it to
    ``<rpath>gp_stats_<latest history value>.js``.

    Args:
        rpath: output directory prefix; file names are appended to it
            verbatim, so it has to end with a path separator.

    Raises:
        ValueError: if the statistics table is empty (``max`` of nothing).
    """
    import shutil

    con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
    # get last update date
    dt = db.from_sql(
        con, "Select * FROM {0}.statistics order by history desc".format(
            SyncDB.DATABASE))
    dt['history'] = dt['history'].apply(str)
    cur_date = max(dt['history'])

    js_path = rpath + "gp_stats.js"
    with open(js_path, "w") as report_file:
        report_file.write("window.buildLogStatistics=")
        report_file.write(dt.to_json(orient='records'))
    # Copy only after the file is closed (fully flushed); no shell involved.
    shutil.copyfile(js_path, rpath + 'gp_stats_' + cur_date + '.js')
def _report_table_html(header_html, frame, columns):
    """Render ``frame`` as ``header_html`` + one <tr> per row + '</table>'.

    Returns '' for an empty frame so the caller emits an empty <div>.
    """
    if len(frame) == 0:
        return ''
    parts = [header_html]
    for _, row in frame.iterrows():
        cells = ''.join('<td>%s</td>' % row[c] for c in columns)
        parts.append('<tr>' + cells + '</tr>')
    parts.append('</table>')
    return ''.join(parts)


def report_html(rpath):
    """Write an HTML report comparing the two most recent builds.

    The two latest ``history`` dates in ``<SyncDB.DATABASE>.statistics`` are
    compared per (table_name, type_name).  The report lists missing, new,
    expanded, reduced and unchanged data sets, is written to
    ``<rpath>Report.html`` and copied to ``<rpath>Report_<date>.html``.  The
    newest earlier ``Report_*`` file in ``rpath`` is linked as the previous
    report and gets its "next report" link rewritten to the new one.

    Nothing is written when fewer than two build dates exist.

    Args:
        rpath: report directory prefix; file names are appended to it
            verbatim, so it has to end with a path separator.
    """
    import re
    import shutil

    con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
    # get last update date
    dt = db.from_sql(
        con,
        "Select distinct history FROM {0}.statistics order by history desc"
        .format(SyncDB.DATABASE))
    if len(dt['history']) < 2:
        print("No previous build statistics found.")
        return
    cur_date = str(dt['history'].iloc[0])
    last_date = str(dt['history'].iloc[1])

    # {0} = previous build date, {1} = current build date, {2} = database.
    # Branch 1: present in both builds; branch 2: only in the new build;
    # branch 3: only in the old build.  Branch 3 must test the *new* side of
    # the left join for NULL -- testing old.table_name (the preserved side)
    # can never match, so missing data was never reported.
    # NOTE(review): rows are matched on table_name and type_name only; if
    # several statistics rows share those (e.g. one per tax_id), branch 1
    # pairs every old row with every new row -- confirm this is intended.
    query = """
        select t.* from (
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, (new.total - old.total) as growth, (new.total - old.total)/old.total *100 AS delta, 0 as new_missing
            FROM {2}.statistics old, {2}.statistics new
            WHERE old.history = '{0}' AND new.history = '{1}' AND old.table_name = new.table_name AND old.type_name = new.type_name
            UNION
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, new.total as growth, 101 AS delta, 1 as new_missing
            FROM {2}.statistics new
            left Join (select * from {2}.statistics where history = '{0}') old
            on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE new.history = '{1}' AND old.table_name is NULL
            UNION
            SELECT old.ds as data_source, old.table_name, old.type_name, old.total as size, -old.total as growth, -101 AS delta, -1 as new_missing
            FROM {2}.statistics old
            left Join (select * from {2}.statistics where history = '{1}') new
            on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE old.history = '{0}' AND new.table_name is NULL
        ) t
        ORDER BY data_source, table_name, delta DESC;
        """
    query = query.format(last_date, cur_date, SyncDB.DATABASE)
    dt = db.from_sql(con, query)

    missing_data = dt.query('new_missing < 0')
    new_data = dt.query('new_missing > 0')
    expanded_data = dt.query('growth > 0')
    reduced_data = dt.query('growth < 0')
    unchanged_data = dt.query('growth == 0 and new_missing==0')

    # Earlier dated reports, oldest first; the last one is the predecessor.
    reports = [
        r for r in sorted(os.listdir(rpath))
        if "Report_" in r and r != "Report_" + cur_date + ".html"
    ]
    last_report = reports[-1] if reports else None

    html = ('<!DOCTYPE html><html><head><title> GP Build Report ' + cur_date +
            '</title></head><body><h1> GP Build Report ' + cur_date + '</h1>')
    if last_report:
        html += '<span id="lastreportlink"><a href="' + last_report + '"><<</a></span>\n'
    # Placeholder; filled in by the *next* build (see the re.sub below).
    html += '<span id="nextreportlink"></span>\n'

    name_columns = ['data_source', 'table_name', 'type_name']
    sections = [
        _report_table_html(
            '<h4> Missing Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>',
            missing_data, name_columns + ['growth']),
        _report_table_html(
            '<h4> New Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>',
            new_data, name_columns + ['growth']),
        _report_table_html(
            '<h4> Expanded Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>',
            expanded_data, name_columns + ['growth', 'delta']),
        _report_table_html(
            '<h4> Reduced Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>',
            reduced_data, name_columns + ['growth', 'delta']),
        _report_table_html(
            '<h4> Unchanged Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th> </tr>',
            unchanged_data, name_columns + ['size']),
    ]
    html += '<div>' + '</div><div>'.join(sections) + '</div>'
    html += '</body></html>'

    report_path = rpath + "Report.html"
    with open(report_path, "w") as report_file:
        report_file.write(html)
    # Copy only after the file is closed (fully flushed); no shell involved.
    shutil.copyfile(report_path, rpath + 'Report_' + cur_date + '.html')

    if last_report:
        # Point the previous report's "next" link at the report just written.
        with open(rpath + last_report, "r") as last_report_file:
            last_html = last_report_file.read()
        last_html = re.sub(
            r"<span id=\"nextreportlink\">.*</span>",
            '<span id="nextreportlink"><a href="' + 'Report_' + cur_date +
            '.html">>></a></span>', last_html)
        with open(rpath + last_report, "w") as last_report_file:
            last_report_file.write(last_html)
#!/usr/bin/env python
# Connectivity check: make ../mylib importable, open the GENEGO connection
# and run a trivial query against it.
import sys
from os import path

p1 = path.join(path.dirname(path.abspath(__file__)), '../mylib')
print(p1)
# The project modules below live in ../mylib, so the path has to be extended
# before they are imported.
sys.path.insert(0, p1)
import pandas as pd
import util
import db
import urllib

con = db.get_con('GENEGO')
# from_sql takes the connection first, as in every other call in this code
# base; the original call omitted it and left `con` unused.
t = db.from_sql(con, 'select 1 from dual')
print(t)
def get_network_object(self):
    """Fetch the network-object -> gene reference table.

    Returns:
        The query result as a DataFrame with the network object id, the
        referenced gene id and the organism, restricted in SQL to genes of
        type 1 and gene references with db = 17.
    """
    network_object_sql = (
        "select g.id netw_obj_id,d.ref gene_id,o.org "
        "from gene_netw g,genedbs d,geneorgs o,genes gs "
        "where g.gene=d.gene and o.gene=g.gene and g.gene=gs.geneid "
        "and gs.type=1 and d.db=17")
    return db.from_sql(self.con, network_object_sql)
def get_interaction(self, l_filter=True, l_physical=True):
    """Dump the gene-gene interactome for the organisms in ``self.taxidList``.

    Edges come from ``regulation_rels`` (curated links, ORG_LINK = 0) and
    from ``reg_r`` (calculated group/complex relations, mechanisms 10 and
    14, ORG_LINK = org).  Edges are joined to mechanism and effect names,
    both end points are mapped to Entrez gene ids, and only edges whose two
    genes differ and belong to the same organism are kept.  Curated edges
    are then joined to their PubMed ids and kept when no PubMed record
    exists or its organism matches; calculated edges are kept when their
    own organism matches.

    Args:
        l_filter: when true, drop edges with TRUST 1 (NLP) or -1 (no link)
            and pass the flag on to ``get_mechanism``.
        l_physical: currently unused by this method.

    Returns:
        A DataFrame with columns GENE_A, GENE_B, tax_id_A, tax_id_B,
        EFFECT_NAME, MECHANISM_NAME, TRUST and PUBMED.
    """
    ### dump interactome
    sw = util.StopWatch()
    # organism column here is obsolete, should not be used, see Reference 3.1
    # Species information and interactions, consider they are generic
    # network objects
    s_sql_regulation_rels = (
        'select distinct id1, id2, type as effect, mechanism, trust, '
        'link_id,0 org_link from regulation_rels '
        'where nvl(trust, -2) <> -1')  # -1, not exist
    # calculated interactions, group and complex relationships
    s_sql_reg_r = (
        'select distinct id1, id2, 0 as type, mechanism, null as trust, '
        'null as link_id,org as org_link from reg_r '
        'where mechanism in (10,14)')  # 10: Group relation, 14: complex subunit
    s_sql_edge = "select * from (%s union all %s) r where id1!=id2" % (
        s_sql_regulation_rels, s_sql_reg_r)
    t = db.from_sql(self.con, s_sql_edge)
    sw.check('Interaction data loaded')
    if l_filter:
        # 1: NLP, -1: No Link
        t = t[~t.TRUST.isin([-1, 1])].copy()
    # ID  Value              Meaning                                              Level
    #  0  Present            Interaction is proven by trusted methods on this
    #                        organism                                             High
    #  8  Approved           Interaction is proven for all protein group members
    #                        (with Present trust)                                 High
    #  9  Conflicting data   Proven interaction, but different effects in
    #                        different papers                                     High
    #  3  Animal model       Proven on animal model                               High
    #  7  Possible common    Proven for some protein group members, but not all   Medium
    #  6  Mix                Proven for the protein group as a whole, but not
    #                        for individual members                               Medium
    #  2  Domain interaction Interaction derived using unreliable methods
    #                        (yeast2hybrid), only binding site for trans.
    #                        Factors                                              Low
    # 10  Signaling pathway  Interaction is made specially for signaling
    #                        pathway map, may be indirect                         Low
    #  1  NLP                Result of data mining, or paper with
    #                        high-throughput screen (chip on chip, prediction)    Low
    # -1  No link            Means that this interaction is absent for the
    #                        particular species                                   No link
    sw.check('Weak link filtered')

    # organism id -> NCBI taxonomy id
    t_tax = db.from_sql(
        self.con,
        'select orgid,taxonomyid from orgs where taxonomyid is not NULL')
    c_tax = dict(zip(t_tax['ORGID'], t_tax['TAXONOMYID']))

    # filter out undesirable mechanisms
    t_m = self.get_mechanism(l_filter=l_filter)
    t = t.merge(t_m, left_on='MECHANISM', right_on='ID')
    sw.check('Extract Mechanism')
    t_e = self.get_effect()
    t = t.merge(t_e, left_on='EFFECT', right_on='ID')
    sw.check('Extract Effect')

    # Map both end points (network objects) to gene id + organism.
    t_no = self.get_network_object()
    print(">> t %s" % (len(t),))
    print(">> t_no %s" % (util.unique_count(t_no.ORG),))
    t = t.merge(t_no, left_on='ID1', right_on='NETW_OBJ_ID')
    # rename2 is a project extension; presumably renames columns in place.
    t.rename2({'GENE_ID': 'GENE_A', 'ORG': 'ORG_A'})
    t = t.merge(t_no, left_on='ID2', right_on='NETW_OBJ_ID')
    t.rename2({'GENE_ID': 'GENE_B', 'ORG': 'ORG_B'})
    print(">> A %s" % (len(t),))
    t = t[(t.GENE_A != t.GENE_B) & (t.ORG_A == t.ORG_B)]
    print(">> B %s" % (len(t),))
    t1 = t[t.ORG_LINK == 0].copy()  # LINK_ID is not NULL
    print(">> t1 %s" % (len(t1),))
    t2 = t[t.ORG_LINK != 0].copy()  # LINK_ID is NULL
    print(">> t2 %s" % (len(t2),))
    sw.check('Add Entrez Gene ID')

    t_p = self.get_pubmed()
    t_p.rename2({'ORG': 'ORG_PUBMED'})
    sw.check('Extract PubMed, merging ...')
    t1 = t1.merge(t_p, on='LINK_ID', how='left')
    # Curated edges: keep those without any PubMed record, plus those whose
    # PubMed record is for the same organism as the genes.
    t1_1 = t1[t1.ORG_PUBMED.isnull()].copy()
    t1_2 = t1[~t1.ORG_PUBMED.isnull()].copy()
    t1_2 = t1_2[t1_2.ORG_A == t1_2.ORG_PUBMED].copy()
    print(">> t1+pubmed %s %s" % (len(t1_1), len(t1_2)))
    # Calculated edges: keep those defined for the genes' organism.
    t2 = t2[t2.ORG_A == t2.ORG_LINK].copy()
    print(">> t2, ORG_LINK %s" % (len(t2),))

    t = pd.concat([t1_1, t1_2, t2], ignore_index=True)
    # Calculated edges carry no trust value; -2 marks "unknown".
    t['TRUST'] = t['TRUST'].fillna(-2)
    t['TRUST'] = t.TRUST.astype(int)
    t = t[[
        'GENE_A', 'GENE_B', 'EFFECT_NAME', 'MECHANISM_NAME', 'TRUST',
        'PUBMED', 'ORG_A'
    ]]
    t.rename2({'ORG_A': 'ORG'})
    # Organisms without a taxonomy id map to 0.
    t['tax_id_A'] = t.ORG.apply(lambda x: c_tax.get(x, 0))
    t['tax_id_B'] = t['tax_id_A']
    t = t[[
        'GENE_A', 'GENE_B', 'tax_id_A', 'tax_id_B', 'EFFECT_NAME',
        'MECHANISM_NAME', 'TRUST', 'PUBMED'
    ]]
    # taxidList entries are joined as text, so they have to be strings.
    t = t.query('tax_id_A in [%s]' % ','.join(self.taxidList))
    print("DONE %s %s" % (len(t), util.unique_count(t.tax_id_A)))
    return t
def get_effect(self):
    """Fetch the regulation type lookup (id and effect name).

    Returns:
        The rows of ``regulation_types`` as returned by ``db.from_sql``,
        with ``desc_`` exposed as ``effect_name``.
    """
    effect_sql = "select id, desc_ as effect_name from regulation_types"
    return db.from_sql(self.con, effect_sql)