Ejemplo n.º 1
0
    def mapping(self, tax_id):
        """Build an ENSG/ENST/ENSP/UCSC/Entrez GeneID map for one organism.

        Looks up the Ensembl database name for ``tax_id``, runs three queries
        against it and merges the results.

        Args:
            tax_id: taxonomy id passed to
                ``EnsemblDownload.get_ens_dbname_by_taxid``.

        Returns:
            The merged table (ENSG and GeneID, left-joined with ENST/ENSP and
            then with UCSC), or ``None`` when the organism is unsupported or
            any step fails.
        """
        ens_db = EnsemblDownload.get_ens_dbname_by_taxid(tax_id)
        if ens_db is None:
            print("Unsupported organism: %s" % tax_id)
            return None

        print("Creating ensembl map for %s" % tax_id)
        con = self.get_ensembl_connection(ens_db)
        try:
            #Get all the isoforms available in Ensembl (ENSG,ENST,ENSP map.)
            t_gt = db.from_sql(
                con,
                "select t1.ENSG, t1.ENST, t2.stable_id ENSP  from (SELECT g.stable_id ENSG,t.stable_id ENST, t.transcript_id tid from gene g,transcript t where g.gene_id=t.gene_id) t1 left join  translation t2 on t1.tid = t2.transcript_id"
            )

            #ENSG to EntrezGeneID map.
            # NOTE(review): the SYN and DIS branches of this UNION are thrown
            # away by the TYPE == 'PRI' filter right below.
            t_g = db.from_sql(
                con, """
        SELECT g.stable_id ENSG,syn.synonym SYMBOL,xdb.db_name DB,'SYN' TYPE FROM object_xref oxr, xref x, external_db xdb,external_synonym syn,gene g WHERE oxr.xref_id=x.xref_id AND x.xref_id=syn.xref_id AND x.external_db_id=xdb.external_db_id AND oxr.ensembl_id=g.gene_id AND oxr.ensembl_object_type='Gene' and syn.synonym is not null
        UNION
        SELECT g.stable_id ENSG,x.display_label SYMBOL,xdb.db_name DB,'DIS' TYPE FROM object_xref oxr, xref x,external_db xdb, gene g WHERE oxr.xref_id=x.xref_id AND x.external_db_id=xdb.external_db_id AND oxr.ensembl_id=g.gene_id AND oxr.ensembl_object_type='Gene' and x.display_label is not null
        UNION
        SELECT g.stable_id ENSG,x.dbprimary_acc SYMBOL,xdb.db_name DB,'PRI' Type FROM object_xref oxr, xref x,external_db xdb, gene g WHERE oxr.xref_id=x.xref_id AND x.external_db_id=xdb.external_db_id AND oxr.ensembl_id=g.gene_id AND oxr.ensembl_object_type='Gene' and x.dbprimary_acc is not null AND (xdb.db_name='EntrezGene')"""
            )
            t_g = t_g[(t_g.DB == 'EntrezGene') & (t_g.TYPE == 'PRI')].copy()
            t_g.drop(['DB', 'TYPE'], axis=1, inplace=True)
            # rename2 is a project helper; its return value is ignored, so it
            # is assumed to rename the columns in place.
            t_g.rename2({'SYMBOL': 'GeneID'})
            t_g['GeneID'] = t_g['GeneID'].astype(str)

            #UCSC to ENST map.
            t_t = db.from_sql(
                con,
                "SELECT transcript.stable_id ENST, xref.display_label ACCESSION,external_db.db_name DB FROM transcript, object_xref, xref,external_db WHERE transcript.transcript_id = object_xref.ensembl_id AND object_xref.ensembl_object_type = 'Transcript' AND object_xref.xref_id = xref.xref_id AND xref.external_db_id = external_db.external_db_id"
            )
            t_ucsc = t_t[t_t.DB == 'UCSC'].copy()
            t_ucsc.drop(['DB'], axis=1, inplace=True)
            t_ucsc.rename2({'ACCESSION': 'UCSC'})
            t_g = t_g.merge(t_gt, left_on='ENSG', right_on='ENSG', how='left')
            t_g = t_g.merge(t_ucsc,
                            left_on='ENST',
                            right_on='ENST',
                            how='left')

            #t_g is a complete map between ENSG,ENST,ENSP,UCSC and EntrezGeneID
            return t_g

        except Exception as e:
            # Only ordinary errors are swallowed; Ctrl-C / SystemExit still
            # propagate, and the cause is reported instead of being hidden.
            print("Error in creating ensembl map for %s: %s" % (tax_id, e))
            return None
Ejemplo n.º 2
0
    def stat_js(rpath):
        """Write the per-type totals of the latest build as a JS snippet.

        Reads the ``statistics`` table, keeps the rows whose ``history`` is
        the largest value, and sums ``total`` per ``type_id`` for the
        annotation, gid2terms, term2gids and interaction tables. The result
        is written to ``<rpath>gp_description_stats.js`` and copied to
        ``<rpath>gp_description_stats_<latest history>.js``.

        Args:
            rpath: output directory prefix; it is concatenated directly with
                the file names, so it must already end with a separator.
        """
        import shutil

        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con, "Select * FROM {0}.statistics order by history desc".format(
                SyncDB.DATABASE))
        dt['history'] = dt['history'].apply(str)
        latest_build_date = max(dt['history'])
        dt = dt[dt['history'] == latest_build_date]
        jquery_str = "window.descriptionStatistics ={'latest_build':'" + latest_build_date + "','ds_counts':{\n"

        totals = []
        for prefix, table_name in [['i_ann_', 'annotation'],
                                   ['i_gene_', 'gid2terms'],
                                   ['i_term_', 'term2gids'],
                                   ['i_interaction_', 'interaction']]:
            dt_type = dt[dt['table_name'] == table_name]
            # Group by a scalar key (not a one-element list) so the group key
            # is the bare type_id on every pandas version, never a tuple.
            for type_id, group in dt_type.groupby('type_id'):
                totals.append("'" + prefix + str(type_id) + "':" +
                              str(group['total'].sum()))

        jquery_str += ',\n'.join(totals) + "}}"

        latest_file = rpath + "gp_description_stats.js"
        with open(latest_file, "w") as report_file:
            report_file.write(jquery_str)

        # Keep a dated copy next to the "latest" file, without going through
        # a shell command string.
        shutil.copyfile(
            latest_file,
            rpath + 'gp_description_stats_' + latest_build_date + '.js')
Ejemplo n.º 3
0
 def get_pubmed(self):
     """Collapse PubMed ids into one '|'-joined string per (LINK_ID, ORG).

     The query returns one row per (link_id, pubmed, org) ordered by
     link_id, org; consecutive rows sharing LINK_ID and ORG are merged.

     Returns:
         DataFrame with columns LINK_ID, ORG and PUBMED, where PUBMED is
         the distinct pubmed ids of the run joined with '|' (the order of
         ids inside the string is not defined, since a set is joined).
     """
     # stragg probably is too slow
     #s_sql="select link_id,stragg(distinct rpo.pubmed) pubmeds from regulation_rels_org rro, regulation_pubmed_org rpo where rro.koid=rpo.koid and rro.org=1 group by link_id"
     s_sql = "select link_id,rpo.pubmed,org from regulation_rels_org rro join regulation_pubmed_org rpo on rro.koid=rpo.koid order by link_id,org"
     t = db.from_sql(self.con, s_sql)
     t['PUBMED'] = t.PUBMED.astype(str)

     # Work on plain lists: per-cell DataFrame indexing (.ix, removed in
     # pandas 1.0) is far slower than list access and depends on the index.
     link_ids = t['LINK_ID'].tolist()
     orgs = t['ORG'].tolist()
     pubmeds = t['PUBMED'].tolist()
     n = len(link_ids)

     start = 0  # first row of the run currently being accumulated
     I = []
     S = []
     O = []
     for i in xrange(1, n + 1):
         # A run ends at the end of the table or when either key changes.
         if (i == n or link_ids[i] != link_ids[i - 1]
                 or orgs[i] != orgs[i - 1]):
             I.append(link_ids[start])
             O.append(orgs[start])
             # For a single-row run this yields that row's id unchanged.
             S.append("|".join(set(pubmeds[start:i])))
             start = i
     t = pd.DataFrame(data={'LINK_ID': I, 'ORG': O, 'PUBMED': S})
     return t
Ejemplo n.º 4
0
 def get_mechanism(self, l_filter=True):
     """Load the regulation mechanisms table.

     Args:
         l_filter: when true, keep only rows with DIRECT > 0 whose ID is
             not one of 10, 14, 31, 11.

     Returns:
         DataFrame from ``regulation_mechanisms`` (id, mechanism_label,
         mechanism_name, direct), ordered by the ``tmp`` column.
     """
     query = "select id, abbr as mechanism_label, tmp as mechanism_name, direct from regulation_mechanisms order by tmp"
     mechanisms = db.from_sql(self.con, query)
     if not l_filter:
         return mechanisms

     excluded_ids = (10, 14, 31, 11)
     direct_only = mechanisms[mechanisms.DIRECT > 0]
     wanted = direct_only.ID.apply(
         lambda mechanism_id: mechanism_id not in excluded_ids)
     return direct_only[wanted].copy()
Ejemplo n.º 5
0
    def get_variations(self):
        """Build one VARIATIONS_ENSEMBL annotation record per known gene.

        The variation table is read from the cached CSV under
        ``SyncDB.DOWNLOAD_DIR()`` when it exists; otherwise it is queried
        from the Ensembl variation database and written to that CSV.

        Returns:
            list of dicts with keys gid, content, annotation_field1,
            type_name and tax_id. ``content`` joins, with ';', one
            "[variation] description{significance}(source)" entry per row of
            the gene. Genes missing from the symbol-to-gid map are skipped.
        """
        ensembl_file = SyncDB.DOWNLOAD_DIR(
        ) + "/ensembl_files/ensembl_variations.csv"
        print("Processing variations")

        if os.path.exists(ensembl_file):
            t = util.read_csv(ensembl_file)
        else:
            con = self.get_ensembl_connection(
                EnsemblDownload.get_ensembl_latest_version(
                    'homo_sapiens_variation_{0}_'.format(
                        EnsemblDownload.ENSEMBL_VERSION)))
            query = "select distinct pf.object_id as variation_name,p.description,v.clinical_significance,vg.gene_name, s.name as source_name from source s, phenotype_feature pf, phenotype p, variation v, variation_genename vg where pf.type ='Variation' and pf.phenotype_id = p.phenotype_id and v.name=pf.object_id and v.variation_id=vg.variation_id and v.source_id=s.source_id and v.clinical_significance in ('likely pathogenic','pathogenic','risk factor','association','drug response')"
            t = db.from_sql(con, query, params=[])
            t.to_csv(ensembl_file, index=False)

        sym2gid = GPUtils.get_sym2gid_map()["sym2gid"]
        data = []
        # Group by a scalar key (not a one-element list) so `gene` is the
        # bare symbol on every pandas version; a tuple key would never be
        # found in sym2gid.
        for gene, rows in t.groupby('gene_name'):
            if gene not in sym2gid:
                continue
            content = [
                '[' + r['variation_name'] + '] ' + r['description'] + '{' +
                r['clinical_significance'] + '}(' + r['source_name'] + ')'
                for _, r in rows.iterrows()
            ]
            data.append({
                'gid': sym2gid[gene],
                'content': ';'.join(content),
                'annotation_field1': gene,
                'type_name': 'VARIATIONS_ENSEMBL',
                'tax_id': '9606'
            })

        return data
Ejemplo n.º 6
0
    def get_annotation_martdb(self, tax_id, a_type, is_boolean=False):
        """Build annotation records of one type from the BioMart database.

        The term table is read from a cached CSV under
        ``SyncDB.DOWNLOAD_DIR()`` when present; otherwise it is queried,
        de-duplicated and written to that CSV.

        Args:
            tax_id: taxonomy id; also stored in every returned record.
            a_type: annotation type; selects the query and becomes
                ``type_name`` in every returned record.
            is_boolean: when true, ``content`` is "Yes" for every gene that
                has at least one term instead of the joined term list.

        Returns:
            list of dicts (gid, content, annotation_field1, type_name,
            tax_id), one per gid with non-empty content; ``None`` when the
            organism has no database name or loading the table fails.
        """
        db_name = self.get_dbname_by_taxid(tax_id)
        if db_name is None:
            return None
        cache_file = SyncDB.DOWNLOAD_DIR(
        ) + "/ensembl_files/biomart_%s_%s.csv" % (a_type, tax_id)
        print("Running query to get %s for %s from martdb" % (a_type, tax_id))

        query = self.get_annotation_mart_query(a_type, db_name)

        con = self.get_biomart_connection()
        try:
            if os.path.exists(cache_file):
                df = util.read_csv(cache_file)
            else:
                df = db.from_sql(con, query).drop_duplicates()
                df.to_csv(cache_file, index=False)
        except Exception as e:
            print("error in getting %s data for %s: %s" % (a_type, tax_id, e))
            return None

        data = []
        # Scalar group key: `k` stays the bare gid on every pandas version.
        for k, grow in df.groupby('gid'):
            cnt = []
            for i in grow.index:
                v1 = grow.at[i, "term"]
                v2 = grow.at[i, "description"]
                # Formatting stays best-effort: a value that cannot be
                # rendered degrades to the bare term, or the row is skipped.
                try:
                    # pd.isnull treats every kind of string as present;
                    # the old `type(v) is str` test dropped unicode values.
                    if pd.isnull(v1):
                        continue
                    try:
                        if pd.isnull(v2):
                            cnt.append(str(v1))
                        else:
                            cnt.append('[%s] %s' % (str(v1), str(v2)))
                    except Exception:
                        cnt.append(str(v1))
                except Exception:
                    pass

            # Order-preserving de-duplication in plain Python.
            seen = set()
            unique_cnt = []
            for entry in cnt:
                if entry not in seen:
                    seen.add(entry)
                    unique_cnt.append(entry)

            content = ''
            if is_boolean:
                if unique_cnt:
                    content = "Yes"
            else:
                content = ';'.join(unique_cnt)

            if content != '':
                data.append({
                    'gid': k,
                    'content': content,
                    # Gene of the group's last row (previously read through
                    # the leaked loop variable).
                    'annotation_field1': grow['gene'].iloc[-1],
                    'type_name': a_type,
                    'tax_id': tax_id
                })

        return data
Ejemplo n.º 7
0
 def get_type_col_value(self):
     """Return the type column value for ``self.type_name`` as a string.

     The value is the first cell of the result of
     ``self.get_type_col_value_sql()`` run with ``self.type_name`` as the
     only parameter. It is fetched once and cached on
     ``self.type_col_value``.
     """
     if self.type_col_value is None:
         con = self.get_connection()
         t = db.from_sql(con,
                         self.get_type_col_value_sql(),
                         params=[self.type_name])
         # Positional access instead of the removed `.ix`; str() also works
         # when the cell is a plain Python object with no `.astype`.
         self.type_col_value = str(t.iloc[0, 0])
     return self.type_col_value
Ejemplo n.º 8
0
    def history():
        """Replace today's snapshot of row counts in the statistics table.

        First deletes the rows whose ``history`` equals CURDATE(), then
        inserts one row per (table, type, tax_id) count for gid2source_id,
        annotation, gid2terms, homologene, term, term2gids, term2term and
        interaction, stamped with CURDATE().
        """
        database = SyncDB.DATABASE
        con = db.get_con(SyncDB.CONNECTION_ID, db=database)

        # Drop any snapshot already taken today.
        purge_sql = "DELETE FROM {0}.statistics where history = CURDATE()".format(
            database)
        db.from_sql(con, purge_sql)

        snapshot_sql = """
            INSERT INTO {0}.statistics
            SELECT a.*, CURDATE() AS history
            FROM(
            SELECT 'gid2source_id' AS table_name,  it.display_name  AS type_name, it.id_type_id as type_id, it.ds_name as ds, gs.tax_id, COUNT(*) AS total FROM {0}.gid2source_id gs , {0}.id_type it
            WHERE gs.id_type_id = it.id_type_id
            GROUP BY gs.id_type_id, gs.tax_id
            UNION ALL
            SELECT 'annotation' AS table_name, a_t.display_name AS type_name, a_t.annotation_type_id as type_id, a_t.ds_name as ds,  a.tax_id, COUNT(*) AS total
            FROM {0}.annotation a, {0}.annotation_type a_t
            WHERE a.annotation_type_id = a_t.annotation_type_id
            GROUP BY a.annotation_type_id, a.tax_id
            UNION ALL
            SELECT 'gid2terms' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, gt.tax_id , COUNT(*)
            FROM {0}.gid2terms gt, {0}.term_category tc
            where tc.term_category_id = gt.term_category_id 
            GROUP BY tc.term_category_id, gt.tax_id
            UNION ALL
            SELECT 'homologene' AS table_name, 'Homologene' AS type_name, 1 as type_id, 'NCBI' as ds, hg.tax_id,  COUNT(*) AS total FROM {0}.homologene hg group by hg.tax_id
            UNION ALL
            SELECT 'term' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, NULL as tax_id, COUNT(*) AS total FROM {0}.term t, {0}.term_category tc
            WHERE t.term_category_id = tc.term_category_id
            GROUP BY t.term_category_id
            UNION ALL
            SELECT 'term2gids' AS table_name, tc.category_name AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, gt.tax_id, COUNT(*) as total FROM {0}.term2gids gt, {0}.term_category tc
            where tc.term_category_id = gt.term_category_id 
            GROUP BY tc.term_category_id, gt.tax_id
            UNION ALL
            SELECT 'term2term' AS table_name, 'Term relations' AS type_name, tc.term_category_id as type_id, tc.ds_name as ds, NULL as tax_id, COUNT(*) AS total FROM {0}.term2term tt, {0}.term_category tc
            WHERE tt.term_category_id = tc.term_category_id
            GROUP BY tt.term_category_id
            UNION ALL
            SELECT 'interaction' AS table_name, i_t.interaction_type_name AS type_name, i_t.interaction_type_id as type_id, i_t.ds_name as ds, i.tax_id_A as tax_id, COUNT(*) AS total FROM {0}.interaction i, {0}.interaction_type i_t
            WHERE i.interaction_type_id = i_t.interaction_type_id
            GROUP BY i.interaction_type_id, i.tax_id_A
            )a order by a.table_name, a.ds, a.type_name, a.tax_id
        """.format(database)
        db.from_sql(con, snapshot_sql)
Ejemplo n.º 9
0
    def report_js(rpath):
        """Dump the whole statistics table as a JS assignment.

        Writes ``window.buildLogStatistics=<records JSON>`` to
        ``<rpath>gp_stats.js`` and copies it to
        ``<rpath>gp_stats_<latest history>.js``.

        Args:
            rpath: output directory prefix; it is concatenated directly with
                the file names, so it must already end with a separator.
        """
        import shutil

        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con, "Select * FROM {0}.statistics order by history desc".format(
                SyncDB.DATABASE))
        dt['history'] = dt['history'].apply(str)
        cur_date = max(dt['history'])

        latest_file = rpath + "gp_stats.js"
        with open(latest_file, "w") as report_file:
            report_file.write("window.buildLogStatistics=")
            report_file.write(dt.to_json(orient='records'))

        # Keep a dated copy next to the "latest" file, without going through
        # a shell command string.
        shutil.copyfile(latest_file, rpath + 'gp_stats_' + cur_date + '.js')
Ejemplo n.º 10
0
    def report_html(rpath):
        """Write an HTML report comparing the two most recent snapshots.

        Compares the newest ``history`` value in the statistics table with
        the one before it and writes ``<rpath>Report.html`` plus a dated copy
        ``<rpath>Report_<newest date>.html``. The report has five sections:
        missing, new, expanded, reduced and unchanged data. If an older
        ``Report_*`` file exists in ``rpath``, the new report links back to
        the latest one and that file's "nextreportlink" span is rewritten to
        point forward to the new report.

        Does nothing (after printing a message) when fewer than two distinct
        history dates exist.

        Args:
            rpath: report directory prefix; it is concatenated directly with
                the file names, so it must already end with a separator.
        """
        import re
        import shutil

        def render_section(frame, header_html, value_columns):
            # One report section: heading + table with the three identifying
            # columns followed by `value_columns`; '' when `frame` is empty.
            if len(frame) == 0:
                return ''
            parts = [header_html]
            for i in range(len(frame)):
                row = frame.iloc[i]
                values = '</td><td>'.join(str(row[c]) for c in value_columns)
                parts.append('<tr><td>' + row['data_source'] + '</td><td>' +
                             row['table_name'] + '</td><td>' +
                             row['type_name'] + '</td><td>' + values +
                             '</td></tr>')
            parts.append('</table>')
            return ''.join(parts)

        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con,
            "Select distinct history FROM {0}.statistics order by history desc"
            .format(SyncDB.DATABASE))
        if (len(dt['history']) < 2):
            print("No previous build statistics found.")
            return

        cur_date = str(dt['history'].iloc[0])
        last_date = str(dt['history'].iloc[1])
        # Third branch fixed: rows that vanished are those with no match on
        # the *new* side of the left join (it used to test old.table_name,
        # which can never be NULL there, so "Missing Data" was always empty).
        # NOTE(review): old/new rows are matched on table_name and type_name
        # only, not tax_id - confirm that this is intended.
        query = """
            select t.* from (
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, (new.total - old.total) as growth, (new.total - old.total)/old.total *100 AS delta, 0 as new_missing
            FROM {2}.statistics old, {2}.statistics new
            WHERE old.history = '{0}'
            AND new.history = '{1}'
            AND old.table_name = new.table_name
            AND old.type_name = new.type_name
            UNION
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, new.total as growth, 101 AS delta, 1 as new_missing
            FROM {2}.statistics new left Join (select * from {2}.statistics where history = '{0}') old on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE new.history = '{1}' 
            AND old.table_name is NULL
            UNION
            SELECT old.ds as data_source, old.table_name, old.type_name, old.total as size, -old.total as growth, -101 AS delta, -1 as new_missing
            FROM {2}.statistics old left Join (select * from {2}.statistics where history = '{1}') new on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE old.history = '{0}'
            AND new.table_name is NULL
            ) t
            ORDER BY data_source, table_name, delta DESC;
        """
        query = query.format(last_date, cur_date, SyncDB.DATABASE)
        dt = db.from_sql(con, query)
        missing_data = dt.query('new_missing < 0')
        new_data = dt.query('new_missing > 0')
        expanded_data = dt.query('growth > 0')
        reduced_data = dt.query('growth < 0')
        unchanged_data = dt.query('growth == 0 and new_missing==0')

        # Latest earlier report, by sorted file name, excluding today's.
        reports = [
            r for r in sorted(os.listdir(rpath))
            if "Report_" in r and r != "Report_" + cur_date + ".html"
        ]
        last_report = reports[-1] if reports else None

        html = '<!DOCTYPE html><html><head><title> GP Build Report ' + cur_date + '</title></head><body><h1> GP Build Report ' + cur_date + '</h1>'
        if last_report:
            html += '<span id="lastreportlink"><a href="' + last_report + '"><<</a></span>\n'
        # Placeholder that the next run fills in with a forward link.
        html += '<span id="nextreportlink"></span>\n'
        html += '<div>'
        html += render_section(
            missing_data,
            '<h4> Missing Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>',
            ['growth'])
        html += '</div><div>'
        html += render_section(
            new_data,
            '<h4> New Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>',
            ['growth'])
        html += '</div><div>'
        html += render_section(
            expanded_data,
            '<h4> Expanded Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>',
            ['growth', 'delta'])
        html += '</div><div>'
        html += render_section(
            reduced_data,
            '<h4> Reduced Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>',
            ['growth', 'delta'])
        html += '</div><div>'
        html += render_section(
            unchanged_data,
            '<h4> Unchanged Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th> </tr>',
            ['size'])
        html += '</div>'
        html += '</body></html>'

        with open(rpath + "Report.html", "w") as report_file:
            report_file.write(html)

        # Dated copy, without going through a shell command string.
        shutil.copyfile(rpath + 'Report.html',
                        rpath + 'Report_' + cur_date + '.html')

        if last_report:
            with open(rpath + last_report, "r") as last_report_file:
                last_html = last_report_file.read()
                last_html = re.sub(
                    r"<span id=\"nextreportlink\">.*</span>",
                    '<span id="nextreportlink"><a href="' + 'Report_' +
                    cur_date + '.html">>></a></span>', last_html)

            with open(rpath + last_report, "w") as last_report_file:
                last_report_file.write(last_html)
Ejemplo n.º 11
0
#!/usr/bin/env python
# Smoke test: connect to the GENEGO database and run a trivial query.
import sys
from os import path
# Make the sibling ../mylib directory importable before loading util/db.
p1 = path.join(path.dirname(path.abspath(__file__)), '../mylib')
print(p1)
sys.path.insert(0, p1)
import pandas as pd
import util
import db
import urllib

con = db.get_con('GENEGO')
# from_sql takes the connection first, then the SQL text.
t = db.from_sql(con, 'select 1 from dual')
print(t)
Ejemplo n.º 12
0
 def get_network_object(self):
     """Load the network-object to gene reference table.

     Returns:
         DataFrame with the columns selected as netw_obj_id, gene_id and
         org, joined from gene_netw, genedbs, geneorgs and genes and
         restricted to genes.type = 1 and genedbs.db = 17.
         NOTE(review): db = 17 presumably denotes Entrez Gene - confirm.
     """
     query = "select g.id netw_obj_id,d.ref gene_id,o.org from gene_netw g,genedbs d,geneorgs o,genes gs where g.gene=d.gene and o.gene=g.gene and g.gene=gs.geneid and gs.type=1 and d.db=17"
     return db.from_sql(self.con, query)
Ejemplo n.º 13
0
    def get_interaction(self, l_filter=True, l_physical=True):
        """Dump the gene-gene interactome as a table.

        Loads the edges from ``regulation_rels`` plus the group/complex
        relations from ``reg_r``, attaches mechanism and effect names, maps
        both network objects to gene ids, keeps pairs of different genes in
        the same organism, attaches PubMed ids and maps the organism to a
        taxonomy id.

        Args:
            l_filter: when true, drop edges whose TRUST is -1 or 1 and apply
                the mechanism filter of ``get_mechanism``.
            l_physical: accepted but not used by this method.

        Returns:
            DataFrame with columns GENE_A, GENE_B, tax_id_A, tax_id_B,
            EFFECT_NAME, MECHANISM_NAME, TRUST and PUBMED, restricted to the
            taxonomy ids in ``self.taxidList``.
        """
        ### dump interactome
        sw = util.StopWatch()
        # organism column here is obsolete, should not be used, see Reference 3.1 Species information and interactions, consider they are generic network objects
        s_sql_regulation_rels = 'select distinct id1, id2, type as effect, mechanism, trust, link_id,0 org_link from regulation_rels where nvl(trust, -2) <> -1'  # -1, not exist
        # calculated interactions, group and complex relationships
        s_sql_reg_r = 'select distinct id1, id2, 0 as type, mechanism, null as trust, null as link_id,org as org_link from reg_r where mechanism in (10,14)'  # 10: Group relation, 14: complex subunit
        s_sql_edge = "select * from (%s union all %s) r where id1!=id2" % (
            s_sql_regulation_rels, s_sql_reg_r)

        t = db.from_sql(self.con, s_sql_edge)
        sw.check('Interaction data loaded')
        if l_filter:
            # 1: NLP, -1: No Link
            t = t[t.TRUST.apply(lambda x: x not in (-1, 1))].copy()
            #ID Value Meaning Level
            #0 Present Interaction is proven by trusted methods on this organism High
            #8 Approved Interaction is proven for all protein group members (with Present trust) High
            #9 Conflicting data Proven interaction, but different effects in different papers High
            #3 Animal model Proven on animal model High
            #7 Possible common Proven for some protein group members, but not all Medium
            #6 Mix Proven for the protein group as a whole, but not for individual members Medium
            #2 Domain interaction Interaction derived using unreliable methods (yeast2hybrid), only binding site for trans. Factors Low
            #10 Signaling pathway Interaction is made specially for signaling pathway map, may be indirect Low
            #1 NLP Result of data mining, or paper with high-throughput screen (chip on chip, prediction) Low
            #-1 No link Means that this interaction is absent for the particular species No link
            sw.check('Weak link filtered')

        t_tax = db.from_sql(
            self.con,
            'select orgid,taxonomyid from orgs where taxonomyid is not NULL')
        # ORGID -> TAXONOMYID lookup, built from the two columns directly
        # instead of per-cell `.ix` access (removed in pandas 1.0).
        c_tax = dict(
            zip(t_tax['ORGID'].tolist(), t_tax['TAXONOMYID'].tolist()))

        # filter out undesirable mechanisms
        t_m = self.get_mechanism(l_filter=l_filter)
        t = t.merge(t_m, left_on='MECHANISM', right_on='ID')
        sw.check('Extract Mechanism')
        t_e = self.get_effect()
        t = t.merge(t_e, left_on='EFFECT', right_on='ID')
        sw.check('Extract Effect')
        t_no = self.get_network_object()
        print(">> t %s" % len(t))
        print(">> t_no %s" % (util.unique_count(t_no.ORG), ))
        # The network-object table is merged twice, once per edge endpoint;
        # the rename after each merge must happen before the next merge so
        # the GENE_ID/ORG columns of the two endpoints do not collide.
        t = t.merge(t_no, left_on='ID1', right_on='NETW_OBJ_ID')
        t.rename2({'GENE_ID': 'GENE_A', 'ORG': 'ORG_A'})
        t = t.merge(t_no, left_on='ID2', right_on='NETW_OBJ_ID')
        t.rename2({'GENE_ID': 'GENE_B', 'ORG': 'ORG_B'})
        print(">> A %s" % len(t))
        t = t[(t.GENE_A != t.GENE_B) & (t.ORG_A == t.ORG_B)]
        print(">> B %s" % len(t))
        t1 = t[t.ORG_LINK == 0].copy()  #LINK_ID is not NULL
        print(">> t1 %s" % len(t1))
        t2 = t[t.ORG_LINK != 0].copy()  #LINK_ID is NULL
        print(">> t2 %s" % len(t2))
        sw.check('Add Entrez Gene ID')
        t_p = self.get_pubmed()
        t_p.rename2({'ORG': 'ORG_PUBMED'})
        sw.check('Extract PubMed, merging ...')
        t1 = t1.merge(t_p,
                      left_on=['LINK_ID'],
                      right_on=['LINK_ID'],
                      how='left')
        # Edges without any PubMed row are kept as they are; edges with
        # PubMed rows keep only those recorded for the edge's own organism.
        t1_1 = t1[t1.ORG_PUBMED.isnull()].copy()
        t1_2 = t1[~t1.ORG_PUBMED.isnull()].copy()
        t1_2 = t1_2[t1_2.ORG_A == t1_2.ORG_PUBMED].copy()
        print(">> t1+pubmed %s %s" % (len(t1_1), len(t1_2)))
        t2 = t2[t2.ORG_A == t2.ORG_LINK].copy()
        print(">> t2, ORG_LINK %s" % len(t2))
        t = pd.concat([t1_1, t1_2, t2], ignore_index=True)
        t['TRUST'] = t['TRUST'].fillna(-2)
        t['TRUST'] = t.TRUST.astype(int)
        # Explicit copy: the frame is renamed and extended in place below,
        # which must not act on a view of the concatenated table.
        t = t[[
            'GENE_A', 'GENE_B', 'EFFECT_NAME', 'MECHANISM_NAME', 'TRUST',
            'PUBMED', 'ORG_A'
        ]].copy()  #,'ORG_B','ORG_PUBMED','ORG_LINK']]
        t.rename2({'ORG_A': 'ORG'})
        # Organisms without a taxonomy id map to 0.
        t['tax_id_A'] = t.ORG.apply(lambda x: c_tax.get(x, 0))
        t['tax_id_B'] = t['tax_id_A']
        t = t[[
            'GENE_A', 'GENE_B', 'tax_id_A', 'tax_id_B', 'EFFECT_NAME',
            'MECHANISM_NAME', 'TRUST', 'PUBMED'
        ]]
        t = t.query('tax_id_A in [%s]' % ','.join(self.taxidList))
        print("DONE %s %s" % (len(t), util.unique_count(t.tax_id_A)))
        return t
Ejemplo n.º 14
0
 def get_effect(self):
     """Load the regulation effect types.

     Returns:
         DataFrame from ``regulation_types`` with the columns selected as
         id and effect_name (the latter taken from ``desc_``).
     """
     query = "select id, desc_ as effect_name from regulation_types"
     return db.from_sql(self.con, query)