Code Example #1
File: ucscdownload.py Project: fossilqq/Metascape
    def get_ucsc2gid_df(self, tax_id):
        #mmusculus_gene_ensembl   10090
        #rnorvegicus_gene_ensembl 10116
        db_name = self.get_dsname_by_taxid(tax_id)
        if db_name is None:
            return pd.DataFrame()

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ucsc_files')
        file = SyncDB.DOWNLOAD_DIR() + "/ucsc_files/ucscid2gid_%s" % tax_id

        if not os.path.exists(file):
            con = db.get_con('UCSC')
            try:
                df = db.from_sql(
                    con,
                    "select name as source_id, value as gid from %s.knownToLocusLink"
                    % db_name)
            except Exception as exp:
                return pd.DataFrame()
            df.to_csv(file, index=False)

        data = util.read_csv(file)
        data = data[data['gid'].notnull()]
        data = data[data['source_id'].notnull()]
        data[['gid']] = data[['gid']].astype(int)
        data['tax_id'] = tax_id
        return data
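
A short usage sketch for the method above (the class name and its construction are assumptions for illustration, not taken from the source):

    dl = UCSCDownload()              # hypothetical instantiation; ucscdownload.py defines the class
    df = dl.get_ucsc2gid_df(10090)   # mouse; returns columns source_id, gid, tax_id
    print(df.head())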
Code Example #2
    def get_summary_expression_data (self, id_map, id_type):
        #Tracer()()
        if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml")):
            urllib.urlretrieve("http://www.proteinatlas.org/download/proteinatlas.xml.gz", os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml.gz"))
            util.unix("gunzip " + os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml.gz"))
            
        context = etree.iterparse(os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml"), tag='entry')
        out = [];
        i = 0;
        for action, elem in context:
            i = i + 1;
            if elem.tag=='entry':
                row = self.process_gene_entry (elem);
                if row[id_type] in id_map:
                    content = str(unicode(row["protein_expression_tissue"]).encode('utf-8'))
                    content = content if  content!='nan' else '';
                    for gid in id_map[row[id_type]]:
                        out.append({'gid':gid, 'content':content, 'type_name': "Protein_Atlas_PROTEIN_EXPR_TISSUE" , 'annotation_field1':row['Gene']});      
                    content = str(unicode(row["protein_expression_cancer"]).encode('utf-8'))
                    content = content if  content!='nan' else '';  
                    for gid in id_map[row[id_type]]:
                        out.append({'gid':gid, 'content':content, 'type_name': "Protein_Atlas_PROTEIN_EXPR_CANCER" , 'annotation_field1':row['Gene']});               
                # processing goes here
                pass

            elem.clear()

            while elem.getprevious() is not None:
                del elem.getparent()[0]
            if i%1000 == 0: print "processed " + str(i) + " genes";
        return out;
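
The elem.clear() / delete-previous-siblings pair above is the standard lxml idiom for keeping memory flat while streaming a large XML file. A minimal, self-contained sketch of the same pattern as a reusable helper (the tag name and file path are placeholders):

    from lxml import etree

    def fast_iter(path, tag, handler):
        # stream <tag> elements, freeing each element after it is handled
        for event, elem in etree.iterparse(path, tag=tag):
            handler(elem)
            elem.clear()
            # drop references to already-processed preceding siblings
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    # fast_iter("proteinatlas.xml", "entry", lambda e: None)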
Code Example #3
    def make_gid2taxid(self):
        taxidList = SyncDB.SUPPORTED_SPECIES
        for child in self.children:
            if type(child).__name__ == "Species":
                taxidList = child.supported_species
                break
        if not os.path.exists(
                os.path.join(SyncDB.DOWNLOAD_DIR(), "geneid2taxid.csv")):
            taxid_filter = ""
            if len(taxidList) != 0:
                taxid_filter = ['$1==\"' + t + '\"' for t in taxidList]
                taxid_filter = "if (" + "||".join(taxid_filter) + ")"
            # gene_id,tax_id
            cmd = "time zcat " + SyncDB.DOWNLOAD_DIR(
            ) + "/gene_info.gz | cut -f1,2 | sed 1d | awk 'BEGIN{FS=\"\\t\"; OFS=\"\\t\"}{" + taxid_filter + " print $2,$1;}' | sort -k1,1 -t $'\\t' >" + SyncDB.DOWNLOAD_DIR(
            ) + "/geneid2taxid.csv"
            print cmd
            util.unix(cmd)

        df = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/geneid2taxid.csv",
                           names=["gid", "tax_id"],
                           sep=r'\t')
        self.gid2taxid = {
            str(df.ix[i, 'gid']): str(df.ix[i, 'tax_id'])
            for i in df.index
        }
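
For illustration, with taxidList = ['9606', '10090'] the command assembled above expands to (derived directly from the string building, with the download directory shown as a placeholder):

    time zcat <DOWNLOAD_DIR>/gene_info.gz | cut -f1,2 | sed 1d | \
        awk 'BEGIN{FS="\t"; OFS="\t"}{if ($1=="9606"||$1=="10090") print $2,$1;}' | \
        sort -k1,1 -t $'\t' > <DOWNLOAD_DIR>/geneid2taxid.csv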
Code Example #4
File: db_backup.py Project: fossilqq/Metascape
    def dump_all(self):
        print 'backup...'
        conn_info = db.get_con_info(self.CONNECTION_ID)
        if conn_info is None:
            print "cannot find connection for " + self.CONNECTION_ID
            exit()

        tables = [
            'annotation',
            'annotation_type',
            'gid2source_id',
            'gid2terms',
            'homologene',
            'id_type',
            'interaction',
            'interaction_type',
            'taxid2name',
            'term',
            'term_category',
            'term2gids',
            'term2term',
            'statistics',
        ]
        cmd = [
            'mysqldump', '--host ' + conn_info["HOST"],
            '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], self.DATABASE,
            ' '.join(tables), '>' + self.OUTPUT
        ]
        print ' '.join(cmd)
        util.unix(cmd)
Code Example #5
File: isoformsidmap.py Project: fossilqq/Metascape
    def ensembl_trans2gene_map_by_taxid(self, tax_id):     
        if tax_id == 9606:
            db_name="hsapiens_gene_ensembl";
            fname="ensembl_genes_human_trans2gid_map.csv";
        elif tax_id == 10090:
            db_name="mmusculus_gene_ensembl";
            fname="ensembl_genes_mouse_trans2gid_map.csv";
        elif tax_id == 10116:
            db_name="rnorvegicus_gene_ensembl";
            fname="ensembl_genes_rat_trans2gid_map.csv";
        else:
            # without this guard an unsupported tax_id leaves db_name/fname
            # undefined and the code below raises a NameError
            print "species not supported";
            return None;

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files');       
        ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s"%fname; 
        
        if not os.path.exists(ensembl_file ):
            cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\
            <Attribute name = "ensembl_gene_id"/><Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/>\
            </Dataset></Query>\'';
            util.unix(cmd);
            
        
        df=util.read_csv(ensembl_file, sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot']);
        df=df.iloc[np.where(df["Ensembl_Trans"].notnull())[0]]
        df=df.drop_duplicates();
        
        df["Tax_ID"]=tax_id;
        
        return df;
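
The BioMart call above packs an XML query into a single wget URL. A sketch of the same query with the XML kept readable (dataset and attribute names are the ones already used in the method; the output file name is a placeholder):

    BIOMART = 'http://www.ensembl.org/biomart/martservice?query='
    query = (
        '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query>'
        '<Query virtualSchemaName = "default" formatter = "TSV" header = "0" '
        'uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >'
        '<Dataset name = "hsapiens_gene_ensembl" interface = "default">'
        '<Attribute name = "ensembl_gene_id"/>'
        '<Attribute name = "ensembl_transcript_id"/>'
        '<Attribute name = "ensembl_peptide_id"/>'
        '</Dataset></Query>')
    cmd = "wget -O trans2gid.tsv '" + BIOMART + query + "'"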
Code Example #6
    def get_ensembl2gid_map_ensembl():
        if hasattr(GPUtils, 'ensembl2gid_map_ensembl'):
            return GPUtils.ensembl2gid_map_ensembl

        ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(),
                                 "ensembl_genes_info.csv")

        if not os.path.exists(ensembl_file):
            cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default"\
            ><Attribute name = "ensembl_gene_id"/><Attribute name = "entrezgene"/></Dataset></Query>\''

            util.unix(cmd)

        ensembl_data = util.read_csv(ensembl_file,
                                     sep="\t",
                                     header=None,
                                     names=['ensembl_gene_id', 'gene_id'])
        ensembl_data = ensembl_data[ensembl_data['gene_id'].notnull()]
        ensembl_data[['gene_id']] = ensembl_data[['gene_id']].astype(int)
        GPUtils.ensembl2gid_map_ensembl = {}
        for i in ensembl_data.index:
            # collect every Entrez gene id mapped to each Ensembl gene id
            key = ensembl_data.at[i, 'ensembl_gene_id']
            GPUtils.ensembl2gid_map_ensembl.setdefault(key, []).append(
                ensembl_data.at[i, 'gene_id'])
        #Tracer()()
        return GPUtils.ensembl2gid_map_ensembl
Code Example #7
File: db_backup.py Project: fossilqq/Metascape
    def restore(self):
        print 'restoring...'
        if len(self.RESTORE_DB) == 0:
            print "You need to specify a database name to be restored using -r option."
            exit()

        if len(self.INPUT) == 0:
            print "You need to specify a .sql file name -i option."
            exit()

        conn_info = db.get_con_info(self.CONNECTION_ID)
        if conn_info is None:
            print "cannot find connection for " + self.CONNECTION_ID
            exit()
        cmd = [
            'mysql',
            '--host ' + conn_info["HOST"],
            '-u ' + conn_info["USR"],
            '-p' + conn_info["PWD"],
            '-e',
            '"create database {0}"'.format(self.RESTORE_DB),
        ]
        print ' '.join(cmd)
        util.unix(cmd)

        cmd = [
            'mysql', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"],
            '-p' + conn_info["PWD"], self.RESTORE_DB, '<' + self.INPUT
        ]
        print ' '.join(cmd)
        util.unix(cmd)
Code Example #8
File: isoformsidmap.py Project: fossilqq/Metascape
    def __init__(self, xe=None):
        #return #DEBUG
        XmlClass.__init__(self,xe=xe)
        self.fn_dest =os.path.join(SyncDB.DOWNLOAD_DIR(),"isoform_idmap.csv")
        self.taxidList = SyncDB.SUPPORTED_SPECIES;
        for child in self.children:
            if type(child).__name__ == "Species":
                self.taxidList = child.supported_species
                break;

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files');
Code Example #9
File: db_backup.py Project: fossilqq/Metascape
    def sync_category_tables(self, dest_db):
        print 'synchronizing the category tables of gp_devel with the production gp database.'
        con_prod = db.get_con_info("MYSQL01_RW")
        cmd = [
            'mysqldump', '--host ' + con_prod["HOST"], '-u ' + con_prod["USR"],
            '-p' + con_prod["PWD"], 'gp',
            ' '.join(['annotation_type', 'id_type',
                      'term_category']), '>category_tables.sql'
        ]

        print ' '.join(cmd)
        util.unix(cmd)
Code Example #10
    def process(self):
        if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),
                                           "names.dmp")):
            urllib.urlretrieve(
                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip",
                os.path.join(SyncDB.DOWNLOAD_DIR(), "taxdmp.zip"))
            util.unix("unzip %s " %
                      os.path.join(SyncDB.DOWNLOAD_DIR(), "taxdmp.zip"))

        df = util.read_csv(
            os.path.join(SyncDB.DOWNLOAD_DIR(), "names.dmp"),
            names=['tax_id', 'name', 'uname', 'type_name'],
            sep="\s*\|\s*",
            index_col=False).query(
                'type_name in ["scientific name", "genbank common name"]')
        df = df.query('tax_id in [%s]' %
                      ','.join(['"' + x + '"' for x in self.taxidList]))
        df = df.sort('tax_id')
        rows = []
        last_taxid = None
        row = None
        for e in df.iterrows():

            if e[1]['tax_id'] != last_taxid:
                if row is not None and 'scientific_name' in row:
                    row['tax_id'] = last_taxid
                    if 'common_name' not in row:
                        row['common_name'] = row['scientific_name']
                    rows.append(row)
                row = {}
                last_taxid = e[1]['tax_id']

            if 'scientific name' in e[1]['type_name'].strip():
                row['scientific_name'] = e[1]['name'].strip()
            if 'genbank common name' in e[1]['type_name'].strip():
                row['common_name'] = e[1]['name'].strip()

        if row is not None and 'scientific_name' in row:
            # flush the final tax_id, applying the same defaults as the loop
            row['tax_id'] = last_taxid
            if 'common_name' not in row:
                row['common_name'] = row['scientific_name']
            rows.append(row)

        if not os.path.exists(os.path.join(SyncDB.UPLOAD_DIR(), self.dest)):
            util.unix('mkdir -p %s' %
                      os.path.join(SyncDB.UPLOAD_DIR(), self.dest))
        pd.DataFrame(rows).to_csv(
            os.path.join(SyncDB.UPLOAD_DIR(), self.dest) + "/taxid2name.csv",
            index=False,
            header=True,
            columns=['tax_id', 'scientific_name', 'common_name'])
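
For reference, names.dmp is a "\t|\t"-delimited NCBI taxonomy dump, which is why the sep=r"\s*\|\s*" regex above splits it correctly; a typical pair of lines for one tax_id looks like this (tabs shown as \t, values illustrative):

    9606\t|\tHomo sapiens\t|\t\t|\tscientific name\t|
    9606\t|\thuman\t|\t\t|\tgenbank common name\t|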
Code Example #11
    def do_update(self):
        self.prepare()
        if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"protein_atlas")):
            util.unix("mkdir " + os.path.join(SyncDB.DOWNLOAD_DIR(),"protein_atlas"))
        data = self.get_summary_expression_data(GPUtils.get_ensembl2gid_map(), "Ensembl") \
               + self.get_protein_class_data(GPUtils.get_ensembl2gid_map(), "Ensembl") \
               + self.get_subcellular_location() \
               + self.get_rna_tissue_category(GPUtils.get_ensembl2gid_map(), "Ensembl")

        dff=pd.DataFrame(data);
        reload(sys)
        sys.setdefaultencoding('utf8')
        dff['content']=dff['content'].map(lambda x: unicode(x).encode('utf-8'))
        dff['content']=dff['content'].map(lambda x: x.replace('\r','').replace('\n','') if x!='nan' else '')
        dff['tax_id']='9606'
        dff.to_csv(self.fn_dest, index=False, sep=',');
Code Example #12
    def stat_js(rpath):
        import re
        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con, "Select * FROM {0}.statistics order by history desc".format(
                SyncDB.DATABASE))
        dt['history'] = dt['history'].apply(str)
        latest_build_date = max(dt['history'])
        #Tracer()()
        dt = dt.query("history=='" + latest_build_date + "'")
        jquery_str = "window.descriptionStatistics ={'latest_build':'" + latest_build_date + "','ds_counts':{\n"

        totals = []
        for table in [['i_ann_', 'annotation'], ['i_gene_', 'gid2terms'],
                      ['i_term_', 'term2gids'],
                      ['i_interaction_', 'interaction']]:
            dt_type = dt.query("table_name=='" + table[1] + "'")
            for k, g in dt_type.groupby(by=['type_id']):
                total_in_one_type = g['total'].sum()
                totals.append("'" + table[0] + str(k) + "':" +
                              str(total_in_one_type))

        jquery_str += ',\n'.join(totals) + "}}"

        with open(rpath + "gp_description_stats.js", "w") as report_file:
            report_file.write(jquery_str)

        files = util.unix('cp ' + rpath + 'gp_description_stats.js ' + rpath +
                          'gp_description_stats_' + latest_build_date + '.js')
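
For illustration, the file written above has this shape (keys and counts are placeholders, assembled per the string building in the method):

    window.descriptionStatistics ={'latest_build':'2018-01-01','ds_counts':{
    'i_ann_1':12345,
    'i_gene_2':67890}}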
Code Example #13
File: ppi.py Project: fossilqq/Metascape
    def biogrid(self, l_human_only=False):
        fn_source = os.path.join(SyncDB.DOWNLOAD_DIR(),
                                 "BIOGRID-ALL-3.4.134.tab2.txt")
        if not os.path.exists(fn_source):
            urllib.urlretrieve(
                "http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.134/BIOGRID-ALL-3.4.134.tab2.zip",
                os.path.join(SyncDB.DOWNLOAD_DIR(),
                             "BIOGRID-ALL-3.4.134.tab2.zip"))
            cmd = "unzip " + os.path.join(SyncDB.DOWNLOAD_DIR(),
                                          "BIOGRID-ALL-3.4.134.tab2.zip"
                                          ) + " -d " + SyncDB.DOWNLOAD_DIR()
            print cmd
            util.unix(cmd)

        t = pd.read_table(fn_source, dtype=str)
        #print t.header()
        #['#BioGRID Interaction ID', 'Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Systematic Name Interactor A', 'Systematic Name Interactor B', 'Official Symbol Interactor A', 'Official Symbol Interactor B', 'Synonyms Interactor A', 'Synonyms Interactor B', 'Experimental System', 'Experimental System Type', 'Author', 'Pubmed ID', 'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score', 'Modification', 'Phenotypes', 'Qualifications', 'Tags', 'Source Database']
        print util.unique(t['Experimental System Type'])
        #t=t[(t['Organism Interactor A']=='9606') & (t['Organism Interactor B']=='9606')]
        t.rename2({
            'Entrez Gene Interactor A': 'gid_A',
            'Entrez Gene Interactor B': 'gid_B',
            'Experimental System Type': 'interaction_category',
            'Experimental System': 'interaction_type',
            'Pubmed ID': 'support',
            'Source Database': 'ds',
            'Organism Interactor A': 'tax_id_A',
            'Organism Interactor B': 'tax_id_B',
            'Score': 'score'
        })
        #print t.header()
        t['interaction_type_id'] = 2
        t = t[[
            'gid_A', 'gid_B', 'tax_id_A', 'tax_id_B', 'interaction_type_id',
            'interaction_category', 'interaction_type', 'score', 'support',
            'ds'
        ]]
        t = t[(t.gid_A != '-') & (t.gid_B != '-')]
        t['gid_A'] = t.gid_A.astype(int)
        t['gid_B'] = t.gid_B.astype(int)
        t['tax_id_A'] = t.tax_id_A.astype(int)
        t['tax_id_B'] = t.tax_id_B.astype(int)
        t['ds'] = 'BioGrid'
        t = t[(t.gid_A != t.gid_B) & (t.tax_id_A == t.tax_id_B) & (t.gid_A > 0)
              & (t.gid_B > 0)].copy()
        self.bio = t
        return self.bio
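
A short usage sketch (the class name and its construction are assumptions; ppi.py defines the actual class):

    ppi = PPI()          # hypothetical instantiation
    t = ppi.biogrid()    # intra-species, non-self BioGrid pairs as a DataFrame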
Code Example #14
File: genegodownload.py Project: fossilqq/Metascape
    def __init__(self, xe=None):
        XmlClass.__init__(self, xe=xe)
        self.fn_dest_genego_annotations = os.path.join(
            SyncDB.DOWNLOAD_DIR(), "genego_annotations.csv")
        self.fn_dest_genego_terms = os.path.join(SyncDB.DOWNLOAD_DIR(),
                                                 "genego_terms.csv")
        self.fn_dest_genego_term_gene_pair = os.path.join(
            SyncDB.DOWNLOAD_DIR(), "genego_term_gene_pair.csv")
        self.inputs = ['ds:genego']

        self.taxidList = SyncDB.SUPPORTED_SPECIES
        for child in self.children:
            if type(child).__name__ == "Species":
                self.taxidList = child.supported_species
                break
        self.dir = SyncDB.DOWNLOAD_DIR() + '/genego_files'
        util.unix('mkdir -p ' + self.dir)
Code Example #15
    def optimize(s_file):
        if re.search(r'\.cdt$', s_file) is not None:
            s_file=re.sub(r'\.cdt$', '', s_file)
        S_cmd=[Clustering.BIN_OPT, "-m O -d P "+s_file, s_file+"Opt"]
        print(util.unix(" ".join(S_cmd)))
        #print S_cmd
        #Clustering._fix_missing(s_file+"Opt.cdt")
        Clustering._remove_temp_files(s_file)
Code Example #16
File: isoformsidmap.py Project: fossilqq/Metascape
    def ensembl_idmapping_by_taxid(self, tax_id):     
        if tax_id == 9606:
            db_name="hsapiens_gene_ensembl";
            fname="ensembl_genes_human_idmap";
            ucsc_attr='<Attribute name = "ucsc"/>'
        elif tax_id == 10090:
            db_name="mmusculus_gene_ensembl";
            fname="ensembl_genes_mouse_idmap";
            ucsc_attr='<Attribute name = "ucsc"/>'
        elif tax_id == 10116:
            db_name="rnorvegicus_gene_ensembl";
            fname="ensembl_genes_rat_idmap";
            ucsc_attr='';
        else:
            # an unsupported tax_id would otherwise leave db_name/fname/ucsc_attr
            # undefined and raise a NameError below
            print "species not supported";
            return None;


        ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s"%fname; 
        
        if not os.path.exists(ensembl_file + '_p1.csv'):
            cmd = 'wget -O ' + ensembl_file + '_p1.csv \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\
            <Attribute name = "ensembl_gene_id"/><Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/><Attribute name = "refseq_mrna"/><Attribute name = "refseq_peptide"/>' +  ucsc_attr + '</Dataset></Query>\'';
            util.unix(cmd);
            
        if not os.path.exists(ensembl_file + '_p2.csv'):        
            cmd = 'wget -O ' + ensembl_file + '_p2.csv \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
            encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
            formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\
            <Attribute name = "ensembl_transcript_id"/><Attribute name = "uniprot_swissprot"/><Attribute name = "entrezgene"/>\
            </Dataset></Query>\'';
            util.unix(cmd);        
       
        if ucsc_attr == '':
            df1=util.read_csv(ensembl_file + '_p1.csv', sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot','Refseq_RNA','Refseq_Prot']);
            # the UCSC attribute was not requested for this species, so fill
            # the column with a placeholder instead of overwriting real data
            df1["UCSC_ID"] = '-';
        else:
            df1=util.read_csv(ensembl_file + '_p1.csv', sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot','Refseq_RNA','Refseq_Prot','UCSC_ID']);
        df2=util.read_csv(ensembl_file + '_p2.csv', sep='\t', names=['Ensembl_Trans','Uniprot_ID','GeneID']).drop_duplicates(subset='Ensembl_Trans', take_last=True);
        df = pd.merge(df1, df2, on='Ensembl_Trans', how='left');
        df=df.iloc[np.where(df["GeneID"].notnull())[0]]
        df["Tax_ID"]=tax_id;
        df["Source"] = "Ensembl";

        return df.drop_duplicates();
Code Example #17
File: godownload.py Project: fossilqq/Metascape
    def get_gene2go(self):
        terms = util.read_csv(self.fn_dest_go_term, sep=r',', skiprows = 1).values        
        #Tracer()()
        term_type_index = {terms[i][0]:terms[i][2] for i in range(len(terms))}
        term_description_index = {terms[i][0]:terms[i][1] for i in range(len(terms))}
                
        fn_source =os.path.join(SyncDB.DOWNLOAD_DIR(),"gene2go.gz") 
        urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz",fn_source)

        taxidList = SyncDB.SUPPORTED_SPECIES;
        for child in self.children:
            if type(child).__name__ == "Species":
                taxidList = child.supported_species
                break;
        taxid_filter = "";
        if len(taxidList) !=0:
            taxid_filter = ['$1=="'+ t + '"' for t in taxidList]
            taxid_filter = "if ("+"||".join(taxid_filter) + ") "
        
        #cmd = "zegrep '^9606' %s |cut -f2,3,1,8,6 > %s" % ( fn_source, self.fn_gene2go_org)
        cmd = ('zcat %s | awk \'BEGIN {FS="\t"}{' + taxid_filter + ' print $0;}\' |cut -f2,3,1,8,6 > %s') % ( fn_source, self.fn_gene2go_org)
        print cmd
        util.unix(cmd)
        gene2go = util.read_csv(self.fn_gene2go_org, sep=r'\t', 
                names=['tax_id','gene_id','term_id','type', 'description'])
        extended_terms = [];
        
        for k,g in gene2go.groupby('gene_id', as_index=False):
            extended_term_ids = [];
            # map gene_id to all the ancestors of t; self.extend_go_term(t)
            # returns the list of ancestors of t, which includes t itself.
            for t in g['term_id']:
                extended_term_ids += self.extend_go_term(t);
                    
            extended_term_ids = np.unique(extended_term_ids);

            taxId = g['tax_id'].values[0];
            extended_terms +=[[taxId,k, t, term_type_index[t] if t in term_type_index else '', term_description_index[t] if t in term_description_index else ''] for t in extended_term_ids];

        extended_terms_df = pd.DataFrame(extended_terms);
        extended_terms_df.columns = gene2go.columns;
        extended_terms_df.to_csv(self.fn_source_by_tax, index=False, sep='\t', header=False)
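
self.extend_go_term is project code not shown here; a minimal standalone sketch of the ancestor expansion it performs, assuming a dict mapping each GO term to its direct parents (all names hypothetical):

    def extend_go_term(term, parents, cache={}):
        # returns term plus all of its ancestors, memoized across calls
        # (the mutable default argument is intentional, as the memo)
        if term not in cache:
            result = set([term])
            for p in parents.get(term, []):
                result.update(extend_go_term(p, parents, cache))
            cache[term] = sorted(result)
        return cache[term]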
Code Example #18
File: db_backup.py Project: fossilqq/Metascape
    def dump_data_tables(self):
        conn_info = db.get_con_info(self.CONNECTION_ID)
        if conn_info is None:
            print "cannot find connection for " + self.CONNECTION_ID
            exit()

        tables = [
            'annotation', 'gid2source_id', 'gid2terms', 'homologene', 'term',
            'term2gids', 'term2term'
        ]

        cmd = [
            'mysqldump', '--host ' + conn_info["HOST"],
            '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], self.DATABASE,
            ' '.join(tables), '>' + self.OUTPUT
        ]

        util.unix(cmd)
        print " ".join(cmd)
Code Example #19
File: ensembldownload.py Project: fossilqq/Metascape
    def __init__(self, xe=None):
        XmlClass.__init__(self, xe=xe)
        EnsemblDownload.set_ENSEMBL_VERSION()
        self.tag = "EnsemblDownload"
        self.fn_annotations = os.path.join(SyncDB.DOWNLOAD_DIR(),
                                           "ensembl_annotations.csv")
        #self.fn_phenotype=os.path.join(SyncDB.DOWNLOAD_DIR(),"ensembl_phenotype_annotations.csv")
        self.fn_ensembl_id_map = os.path.join(SyncDB.DOWNLOAD_DIR(),
                                              "ensembl_id_map.csv")
        self.taxidList = SyncDB.SUPPORTED_SPECIES
        for child in self.children:
            if type(child).__name__ == "Species":
                self.taxidList = child.supported_species
                break

        self.chrList = [str(x + 1) for x in range(22)]
        self.chrList += ['X', 'Y', 'MT']
        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files')
        self.mart_ftp = "ftp://ftp.ensembl.org/pub/release-84/mysql/ensembl_mart_84/"
Code Example #20
File: isoformsidmap.py Project: fossilqq/Metascape
    def uniprot_idmapping_data(self):
        if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz")):
            urllib.urlretrieve("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz", os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz"))
            print "idmapping_selected.tab.gz file downloaded...";
        else:
            print "idmapping_selected.tab.gz exists. start processing...";
           
        #keep the lines that contain tax_id and gene_id from the file.
        if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat")):
            print "extracting rows of desired species with awk command...."
            unix_cmd = "zcat " + os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz") + " | awk -F'\t' '{if ($13 == \"9606\" || $13==\"10090\" || $13==\"10116\") print $1,$3,$4,$13,$19,$20,$21}' > " + os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat");
            print unix_cmd
            util.unix(unix_cmd)
            print "extracting rows of desired species with grep command is done."
            
        df = util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat"), sep="\t", header=None, names=["Uniprot_ID","GeneID","Refseq_Prot","Tax_ID","Ensembl_Gene","Ensembl_Trans","Ensembl_Prot"]);
        df["Source"] = "Uniprot";
        df=df.iloc[np.where(df["GeneID"].notnull())[0]]
        print "Uniprot ID mapping data processed..."

        return df;
Code Example #21
File: isoformsidmap.py Project: fossilqq/Metascape
    def create_ucsc_idmapping(self):
        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_refseq_to_id_map.csv")):
            cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=hg38 -e "SELECT refLink.mrnaAcc as Refseq_RNA, kgXref.kgID as UCSC_ID from refLink, kgXref WHERE kgXref.refseq=refLink.mrnaAcc" > ' + path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_refseq_to_id_map.csv");
            util.unix(cmd);
        
        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_id_to_ensemblid_map.csv")):
            cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=hg38 -e "SELECT knownToEnsembl.name as UCSC_ID, knownToEnsembl.value as  Ensembl_Trans from knownToEnsembl" > ' + path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_id_to_ensemblid_map.csv");
            util.unix(cmd);
                   
        
        df=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/ucsc_id_to_ensemblid_map.csv', sep='\t');
        global ensemblTrans2ucscID;
        ensemblTrans2ucscID = {}
        for index, el in df.iterrows():
            ensemblTrans2ucscID[el["Ensembl_Trans"]] = el["UCSC_ID"]

        df=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/ucsc_refseq_to_id_map.csv', sep='\t');
        global refseq2ucscID;
        refseq2ucscID = {}
        for index, el in df.iterrows():
            refseq2ucscID[el["Refseq_RNA"]] = el["UCSC_ID"]
Code Example #22
File: isoformsidmap.py Project: fossilqq/Metascape
    def ucsc_idmapping_data_by_taxid(self, tax_id):
        if tax_id == 9606:
            db_name="hg38";
            fname="human_ucsc_isoform_map.csv";
        elif tax_id == 10090:
            db_name="mm10";
            fname="mouse_ucsc_isoform_map.csv";
        else:
            print "species not supported";
            return None;
    
        if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), fname)):
            cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=' + db_name + ' -e "select a.*, refLink.protAcc as Refseq_Prot, knownToEnsembl.value as Ensembl_Trans from (SELECT kgXref.refseq as Refseq_RNA, knownToLocusLink.value as GeneID, knownGene.name as UCSC_ID, knownGene.proteinID as Uniprot_ID from kgXref,knownGene,knownToLocusLink WHERE knownToLocusLink.name = knownGene.name and kgXref.kgID=knownGene.name) a left join refLink on refLink.mrnaAcc = a.Refseq_RNA left join knownToEnsembl on a.UCSC_ID=knownToEnsembl.name" > ' + path.join(SyncDB.DOWNLOAD_DIR(), fname);
            util.unix(cmd);
            print "UCSC Query is done..."

        df=util.read_csv(path.join(SyncDB.DOWNLOAD_DIR(), fname), sep='\t');
        df["Tax_ID"]=tax_id;
        df=df.iloc[np.where(df["GeneID"].notnull())[0]].drop_duplicates();
        print "UCSC ID mapping data processed..."
        return df;
Code Example #23
File: db_backup.py Project: fossilqq/Metascape
    def dump_schema(self):
        tables_with_data = [
            'annotation', 'annotation_type', 'gid2source_id', 'gid2terms',
            'homologene', 'id_type', 'interaction', 'interaction_type',
            'server_status', 'session', 'statistics', 'taxid2name', 'term',
            'term2gids', 'term2term', 'term_category'
        ]
        conn_info = db.get_con_info(self.CONNECTION_ID)
        if conn_info is None:
            print "cannot find connection for " + self.CONNECTION_ID
            exit()
        metascape_db_in_xxx = 'gp_metascape'
        #todo uncomment next line
        if len(tables_with_data):
            cmd = [
                'mysqldump', '--host ' + conn_info["HOST"],
                '-u ' + conn_info["USR"], '-p' + conn_info["PWD"],
                metascape_db_in_xxx, ' '.join(tables_with_data),
                '>' + self.OUTPUT + "_P2"
            ]

            print " ".join(cmd)
            util.unix(cmd)
            util.unix("cat " + self.OUTPUT + "_P2 >> " + self.OUTPUT)
            util.unix("rm " + self.OUTPUT + "_P2")
Code Example #24
File: ensembldownload.py Project: fossilqq/Metascape
    def get_ensembl2gid_df_web(self, tax_id, type):
        #mmusculus_gene_ensembl   10090
        #rnorvegicus_gene_ensembl 10116
        print "Get %s to gene id for %s" % (type, tax_id)
        attr = '<Attribute name = "' + type + '" />'
        db_name = self.get_dbname_by_taxid(tax_id)
        if db_name is None:
            return None

        fname = 'ensembl2gid_%s_%s' % (type, tax_id)

        util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files')
        valid_files = []
        for chr in self.chrList:
            ensembl_file = SyncDB.DOWNLOAD_DIR(
            ) + "/ensembl_files/%s_chr%s" % (fname, chr)
            print "downloading %s from %s for chr %s..." % (type, db_name, chr)
            if not os.path.exists(ensembl_file):
                cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\
                encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\
                formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default"><Filter name="chromosome_name" value="' + chr + '" filter_list=""/>' + attr + '<Attribute name = "entrezgene"\
                /></Dataset></Query>\''

                util.unix(cmd)
            try:
                tdf = util.read_csv(ensembl_file,
                                    sep="\t",
                                    header=None,
                                    names=['source_id', 'gid'],
                                    nrows=1)
            except Exception as exp:
                tdf = pd.DataFrame()

            if len(tdf) != 0:
                valid_files.append(ensembl_file)

        if len(valid_files) == 0:
            return None

        cmd = 'cat %s >> %s' % (' '.join(valid_files), SyncDB.DOWNLOAD_DIR() +
                                "/ensembl_files/" + fname)
        print cmd
        util.unix(cmd)
        print "downloading %s from %s done." % (type, db_name)
        ensembl_data = util.read_csv(SyncDB.DOWNLOAD_DIR() +
                                     "/ensembl_files/" + fname,
                                     sep="\t",
                                     header=None,
                                     names=['source_id', 'gid'])
        ensembl_data = ensembl_data[ensembl_data['gid'].notnull()]
        ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()]
        ensembl_data[['gid']] = ensembl_data[['gid']].astype(int)
        ensembl_data['tax_id'] = tax_id
        ensembl_data['type_name'] = type
        return ensembl_data
Code Example #25
    def report_js(rpath):
        import re
        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con, "Select * FROM {0}.statistics order by history desc".format(
                SyncDB.DATABASE))
        dt['history'] = dt['history'].apply(str)
        cur_date = max(dt['history'])
        json_arr = []
        for c in dt.columns:
            json_arr.append('"' + c + '":' + dt[c].to_json(orient='values'))

        with open(rpath + "gp_stats.js", "w") as report_file:
            report_file.write("window.buildLogStatistics=")
            report_file.write(dt.to_json(orient='records'))

        files = util.unix('cp ' + rpath + 'gp_stats.js ' + rpath +
                          'gp_stats_' + cur_date + '.js')
Code Example #26
File: godownload.py Project: fossilqq/Metascape
    def get_go_term_and_go_term2term(self):
        fn_source =os.path.join(SyncDB.DOWNLOAD_DIR(),"go_daily-termdb-tables.tar.gz") 

        dir = os.path.join(SyncDB.DOWNLOAD_DIR(),"go_daily-termdb-tables")
        urllib.urlretrieve("http://archive.geneontology.org/latest-termdb/go_daily-termdb-tables.tar.gz",fn_source)
        cmd = "tar -zxvf %s -C %s go_daily-termdb-tables/term.txt go_daily-termdb-tables/graph_path.txt go_daily-termdb-tables/term_definition.txt" % (fn_source,SyncDB.DOWNLOAD_DIR())
        util.unix(cmd)
        fn_term = os.path.join(dir,"term.txt")
        df_term = util.read_csv(fn_term, sep=r'\t', skiprows = 1, 
                names=['id','term_name','term_type','term_id','is_obsolete','is_root','is_relation'])
        df_term = df_term[(df_term.is_root == 0) & (df_term.is_relation == 0) ]
        df_term['term_type'] = df_term['term_type'].map({'biological_process':'BP',
            'molecular_function':'MF',
            'cellular_component':'CC'})
        fn_term_description = os.path.join(dir,"term_definition.txt")
        #remove bad lines.
        util.unix("awk 'BEGIN{FS=\"\\t\"} {if(NF==5) print $0}' " + fn_term_description + " > " + fn_term_description + ".tmp")
        util.unix("mv " + fn_term_description + ".tmp " + fn_term_description)
        df_term_description = util.read_csv(fn_term_description, sep=r'\t', skiprows = 1, 
                names=['id','description','dbxref_id','term_comment','reference'],
                usecols=['id','description'])
        df_term_final = pd.merge(df_term, df_term_description, on='id', how='left')
        df_term_final.drop(['id','is_obsolete','is_root','is_relation'],axis=1,inplace=True)
        df_term_final = df_term_final[df_term_final['term_id'].str.contains(r'^GO:')] 
        df_term_final = df_term_final.reindex(columns=['term_id','term_name','term_type','description'])

        fn_gp = os.path.join(dir,"graph_path.txt")
        df_gp = util.read_csv(fn_gp, sep=r'\t', skiprows = 1, 
                names=['tt_id','term1_id','term2_id','relationship_type_id','distance','relation_distance'])
        df_gp = pd.merge(df_gp,df_term,left_on='term1_id', right_on='id',how='left')
        util.rename2(df_gp, {'term_id':'parent_term_id'})
        df_gp = df_gp.loc[:,['term2_id','distance','parent_term_id']]

        df_gp = pd.merge(df_gp,df_term,left_on='term2_id', right_on='id',how='left')
        df_gp = df_gp.loc[:,['parent_term_id','distance','term_id']]

        df_gp = df_gp[df_gp['parent_term_id'].str.contains(r'^GO:') & df_gp['term_id'].str.contains(r'^GO:')] 
        df_gp.drop_duplicates(inplace=True)

        df_gp.to_csv(self.fn_dest_term2term,index=False)

        df_term_final.to_csv(self.fn_dest_go_term, index = False)
Code Example #27
    def report_html(rpath):
        import re
        con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE)
        #get last update date;
        dt = db.from_sql(
            con,
            "Select distinct history FROM {0}.statistics order by history desc"
            .format(SyncDB.DATABASE))
        if (len(dt['history']) < 2):
            print "No previous build statistics found."
            return

        cur_date = str(dt['history'][0])
        last_date = str(dt['history'][1])
        query = """
            select t.* from (
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, (new.total - old.total) as growth, (new.total - old.total)/old.total *100 AS delta, 0 as new_missing
            FROM {2}.statistics old, {2}.statistics new
            WHERE old.history = '{0}'
            AND new.history = '{1}'
            AND old.table_name = new.table_name
            AND old.type_name = new.type_name
            UNION
            SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, new.total as growth, 101 AS delta, 1 as new_missing
            FROM {2}.statistics new left Join (select * from {2}.statistics where history = '{0}') old on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE new.history = '{1}' 
            AND old.table_name is NULL
            UNION
            SELECT old.ds as data_source, old.table_name, old.type_name, old.total as size, -old.total as growth, -101 AS delta, -1 as new_missing
            FROM {2}.statistics old left Join (select * from {2}.statistics where history = '{1}') new on old.table_name = new.table_name and old.type_name = new.type_name
            WHERE old.history = '{0}'
            AND old.table_name is NULL
            ) t
            ORDER BY data_source, table_name, delta DESC;
        """
        query = query.format(last_date, cur_date, SyncDB.DATABASE)
        dt = db.from_sql(con, query)
        #Tracer()()
        missing_data = dt.query('new_missing < 0')
        new_data = dt.query('new_missing > 0')
        expanded_data = dt.query('growth > 0')
        reduced_data = dt.query('growth < 0')
        unchanged_data = dt.query('growth == 0 and new_missing==0')

        reports = [
            r for r in sorted(os.listdir(rpath))
            if "Report_" in r and r != "Report_" + cur_date + ".html"
        ]
        last_report = reports[len(reports) - 1] if len(reports) > 0 else None

        #import datetime
        #time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        html = '<!DOCTYPE html><html><head><title> GP Build Report ' + cur_date + '</title></head><body><h1> GP Build Report ' + cur_date + '</h1>'
        if last_report:
            html += '<span id="lastreportlink"><a href="' + last_report + '"><<</a></span>\n'
        html += '<span id="nextreportlink"></span>\n'
        html += '<div>'
        if (len(missing_data) > 0):
            html += '<h4> Missing Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>'
        for i in range(len(missing_data)):
            html += '<tr><td>' + missing_data.irow(
                i)['data_source'] + '</td><td>' + missing_data.irow(
                    i)['table_name'] + '</td><td>' + missing_data.irow(
                        i)['type_name'] + '</td><td>' + str(
                            missing_data.irow(i)['growth']) + '</td></tr>'
        if (len(missing_data) > 0):
            html += '</table>'
        html += '</div><div>'

        if (len(new_data) > 0):
            html += '<h4> New Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>'
        for i in range(len(new_data)):
            html += '<tr><td>' + new_data.irow(
                i)['data_source'] + '</td><td>' + new_data.irow(
                    i)['table_name'] + '</td><td>' + new_data.irow(
                        i)['type_name'] + '</td><td>' + str(
                            new_data.irow(i)['growth']) + '</td></tr>'
        if (len(new_data) > 0):
            html += '</table>'
        html += '</div><div>'
        if (len(expanded_data) > 0):
            html += '<h4> Expanded Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>'
        for i in range(len(expanded_data)):
            html += '<tr><td>' + expanded_data.irow(
                i)['data_source'] + '</td><td>' + expanded_data.irow(
                    i)['table_name'] + '</td><td>' + expanded_data.irow(
                        i)['type_name'] + '</td><td>' + str(
                            expanded_data.irow(i)
                            ['growth']) + '</td><td>' + str(
                                expanded_data.irow(i)['delta']) + '</td></tr>'
        if (len(expanded_data) > 0):
            html += '</table>'
        html += '</div><div>'
        if (len(reduced_data) > 0):
            html += '<h4> Reduced Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>'
        for i in range(len(reduced_data)):
            html += '<tr><td>' + reduced_data.irow(
                i)['data_source'] + '</td><td>' + reduced_data.irow(
                    i)['table_name'] + '</td><td>' + reduced_data.irow(
                        i)['type_name'] + '</td><td>' + str(
                            reduced_data.irow(i)
                            ['growth']) + '</td><td>' + str(
                                reduced_data.irow(i)['delta']) + '</td></tr>'
        if (len(reduced_data) > 0):
            html += '</table>'
        html += '</div><div>'
        if (len(unchanged_data) > 0):
            html += '<h4> Unchanged Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th> </tr>'
        for i in range(len(unchanged_data)):
            html += '<tr><td>' + unchanged_data.irow(
                i)['data_source'] + '</td><td>' + unchanged_data.irow(
                    i)['table_name'] + '</td><td>' + unchanged_data.irow(
                        i)['type_name'] + '</td><td>' + str(
                            unchanged_data.irow(i)['size']) + '</td></tr>'
        if (len(unchanged_data) > 0):
            html += '</table>'

        html += '</div>'
        html += '</body></html>'

        with open(rpath + "Report.html", "w") as report_file:
            report_file.write(html)

        files = util.unix('cp ' + rpath + 'Report.html ' + rpath + 'Report_' +
                          cur_date + '.html')

        if last_report:
            with open(rpath + last_report, "r") as last_report_file:
                last_html = last_report_file.read()
                last_html = re.sub(
                    r"<span id=\"nextreportlink\">.*</span>",
                    '<span id="nextreportlink"><a href="' + 'Report_' +
                    cur_date + '.html">>></a></span>', last_html)

            with open(rpath + last_report, "w") as last_report_file:
                last_report_file.write(last_html)
Code Example #28
    def do_work(self):
        #get input objects
        print '***enter do_work', self.id
        #Tracer()();
        if self.has_calculated is not None:
            print 'has calculated', self.id, 'return:', self.has_calculated
            return self.has_calculated
        print self.print_me()
        input_dirty = self.prepare_inputs()
        #check all the output is existing , len is not 0
        output_dirty = any([self.is_dirty(f) for f in self.outputs])
        print 'input, output dirty checking: ', input_dirty, output_dirty, 'in', self.id

        update = True
        if ((not input_dirty)\
           and  (not output_dirty)\
           and not SyncDB.IS_REBUILD):
            update = False
        else:
            update = True
        if len(SyncDB.DEBUG_NODE_IDS):
            if self.id in SyncDB.DEBUG_NODE_IDS:
                update = True
                print 'input, output dirty checking: turn on by debug nodes'
            elif SyncDB.DEBUG_NODE_IDS_ONLY:
                update = False
        if not update:
            print '***do nothing,return from:', self.id
            r = False
        else:
            print '***calling do_update in ', self.id
            r = True
            if SyncDB.DONT_RECOVER:
                r = self.do_update()
            else:
                try:
                    r = self.do_update()
                except Exception as exp:
                    not_recovered = True
                    import traceback
                    err_msg = '!!!!!!!!!!!Error occurred while running %s (%s) \n (%s).' % (
                        self.id, exp, traceback.format_exc())
                    print err_msg
                    SyncDB.ERROR_LOG.write(err_msg)
                    if hasattr(SyncDB, 'RECOVERY_FOLDER') and os.path.exists(
                            SyncDB.RECOVERY_FOLDER):
                        #recover files from this folder
                        for f in self.outputs:
                            file = os.path.basename(f)
                            if os.path.exists(SyncDB.RECOVERY_FOLDER + "/" +
                                              file):
                                SyncDB.ERROR_LOG.write(
                                    '\tRecovered %s from folder %s\n' %
                                    (file, SyncDB.RECOVERY_FOLDER))
                                print '\tRecovered %s from folder %s\n' % (
                                    file, SyncDB.RECOVERY_FOLDER)
                                util.unix('cp ' + SyncDB.RECOVERY_FOLDER +
                                          "/" + file + ' ' +
                                          SyncDB.DOWNLOAD_DIR())
                            else:
                                SyncDB.ERROR_LOG.write(
                                    '\tDid not recover %s from folder %s (no such file)\n'
                                    % (file, SyncDB.RECOVERY_FOLDER))
                                print '\tDid not recover %s from folder %s (no such file)\n' % (
                                    file, SyncDB.RECOVERY_FOLDER)
                    else:
                        print '\tRecovery folder does not exist for %s' % (str(
                            self.outputs))
                        SyncDB.ERROR_LOG.write(
                            '\tRecovery folder does not exist\n')
                        SyncDB.ERROR_LOG.write(str(self.outputs))

            if r is None:
                r = True
        self.has_calculated = r
        return r
Code Example #29
File: uniprotdownload.py Project: fossilqq/Metascape
    def create_idmapping(self):
        if not os.path.exists(
                os.path.join(SyncDB.DOWNLOAD_DIR(),
                             "uniprot_idmapping.dat.gz")):
            urllib.urlretrieve(
                "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz",
                os.path.join(SyncDB.DOWNLOAD_DIR(),
                             "uniprot_idmapping.dat.gz"))
            print "UniProt idmapping file downloaded..."
        else:
            print "uniprot_idmapping.dat.gz exists. start processing..."

        #keep the lines that contain tax_id and gene_id from the file.
        if not os.path.exists(
                os.path.join(SyncDB.DOWNLOAD_DIR(),
                             "uniprot_idmapping_cleaned.dat")):
            filter = '\|'.join(['NCBI_TaxID\\t' + x for x in self.taxidList])
            print "extracting rows of desired species with grep command...."
            unix_cmd = "LC_ALL=C zgrep 'NCBI_TaxID\|GeneID' " + os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_idmapping.dat.gz") \
                       + " | LC_ALL=C grep --no-group-separator -A 1 $'" + filter + "' > " \
                       + os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping_cleaned.dat")
            print unix_cmd
            util.unix(unix_cmd)
            print "extracting rows of desired species with grep command is done."

        df_itr = util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(),
                                            "uniprot_idmapping_cleaned.dat"),
                               sep="\t",
                               header=None,
                               names=["uniprot_id", "type", "value"],
                               iterator=True,
                               chunksize=50000)

        data = []
        chunks = 0
        cur_uid = None
        cur_tax_id = None
        cur_gene_id = None

        for df in df_itr:
            for index, el in df.iterrows():
                if cur_uid != el["uniprot_id"]:
                    cur_uid = el["uniprot_id"]
                    #discard previous tax_id and gene_id
                    cur_tax_id = None
                    cur_gene_id = None

                if el["type"] == "GeneID":
                    cur_gene_id = el["value"]
                elif el["type"] == "NCBI_TaxID":
                    cur_tax_id = el["value"]
                else:
                    print "error in uniprot_idmapping_cleaned.dat file. expecting 'GeneID' or 'NCBI_TaxID', seen ", el[
                        "type"]

                if cur_gene_id is not None and cur_tax_id is not None:
                    #gene_id and tax_id for the current uid is found
                    if cur_tax_id in [int(x) for x in self.taxidList]:
                        data.append({
                            "source_id": cur_uid,
                            "gid": cur_gene_id,
                            "tax_id": cur_tax_id
                        })
                        #Tracer()()
            chunks += 1
            print "processed ", chunks * 50000, " found ", len(data)
            del df
            #break;

        pd.DataFrame(data).to_csv(self.fn_dest_idmapping, index=False)
        print "UniProt idmapping file processed. ", len(data), " ids found"
Code Example #30
File: uniprotdownload.py Project: fossilqq/Metascape
    def process_uniprot_xml(self):
        if not os.path.exists(
                os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml")):
            urllib.urlretrieve(
                "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz",
                os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml.gz"))
            util.unix(
                "gunzip " +
                os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml.gz"))

        #http://stackoverflow.com/questions/7018326/lxml-iterparse-in-python-cant-handle-namespaces
        context = etree.iterparse(os.path.join(SyncDB.DOWNLOAD_DIR(),
                                               "uniprot_sprot.xml"),
                                  tag='{http://uniprot.org/uniprot}entry')
        out = []
        i = 0
        gid_not_found = []
        multiple_gid = []
        multiple_type = []
        missing_type = []
        missing_keyword = []

        for action, elem in context:
            i = i + 1
            if i % 10000 == 0: print "processed " + str(i) + " uniprot entries"
            if elem.tag == '{http://uniprot.org/uniprot}entry':
                row = self.process_uniprot_entry(elem)
                if row is None:  #it's not a desired species.
                    continue
                #Tracer()()

                if len(row['type']) == 0:
                    if len(row['keyword']) != 0:
                        #print "missing_type";
                        #Tracer()();
                        missing_type.append(row)
                    continue

                if row['gid'] is None:
                    gid_not_found.append(row)
                    #print "gid not found";
                    #Tracer()();
                    continue

                if len(list(set(row['type']))) > 1:
                    #print "multiple type";
                    #Tracer()();
                    multiple_type.append(row)

                if len(row['keyword']) == 0:
                    #Tracer()();
                    missing_keyword.append(row)

                for t in list(set(row['type'])):
                    content = "yes"
                    if (t == "uniprot_transmembrane"):
                        content = len(row['acc'])
                    if (t == "uniprot_kinase_class"):
                        content = ",".join(row['content'])
                    out.append({
                        "gid": row["gid"],
                        "type_name": t,
                        "content": content,
                        "annotation_field1": ",".join(row["acc"]),
                        "tax_id": row["tax_id"]
                    })
                # processing goes here

            elem.clear()
            # second, delete previous siblings (records)
            while elem.getprevious() is not None:
                del elem.getparent()[0]

        df = pd.DataFrame(out)

        df.to_csv(self.fn_dest, index=False)
        print "Multiple type:", len(multiple_type)
        print "No GID:", len(gid_not_found)
        print "Missing type:", len(missing_type)
        print "Missing keyword:", len(missing_keyword)

        print "No GID:", gid_not_found
        return out