def collect_pfam2go_info(pfam2go_file):  # Pfam2GO.txt
    """Group GO annotations by Pfam ID from a tab-separated Pfam2GO table.

    Each data row is read as: Pfam ID, GO ID, GO description, GO category.
    Blank lines and lines starting with "#" are ignored.

    Returns:
        dict: {pfam: {"<description>(<category>:<GO ID>)": ""}}. The category
        words "function", "process" and "component" are shortened to "MF",
        "BP" and "CC"; any other category text is used unchanged.
    """
    abbreviations = {"function": "MF", "process": "BP", "component": "CC"}
    pfam2go = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfam2go_file):
        row = raw.strip()
        if row == "" or row.startswith("#"):
            continue
        fields = row.split("\t")
        pfam, go, ann, category = fields[0], fields[1], fields[2], fields[3]
        category = abbreviations.get(category, category)
        label = ann + "(" + category + ":" + go + ")"
        pfam2go.setdefault(pfam, {})[label] = ""
    return pfam2go
def collect_taxonomy_info(map_file):
    """Index taxonomy rows by the name at the end of their lineage.

    The file is tab-separated. A line starting with "Taxon" is the header and
    must provide the columns "Taxon", "Scientific_name", "Rank" and "Lineage".

    Args:
        map_file: path handed to utils.gzip_bzip2_biom_open_readlines.

    Returns:
        dict: {key: "taxon\\tname\\trank\\tlineage"}. The key is the text after
        "__" in the last "|"-separated lineage element when that can be
        parsed, otherwise the scientific name.
    """
    taxa_map = {}
    titles = {}
    # The original also did `open(map_file, "r")` here and never used or
    # closed the handle; the reader below is the only access that is needed.
    for line in utils.gzip_bzip2_biom_open_readlines(map_file):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith("Taxon"):
            # Header line: remember the first position of every column name.
            first_position = {}
            for position, item in enumerate(info):
                first_position.setdefault(item, position)
            titles.update(first_position)
            continue
        taxa_id = info[titles["Taxon"]]
        taxa_name = info[titles["Scientific_name"]]
        taxa_rank = info[titles["Rank"]]
        taxa_lineage = info[titles["Lineage"]]
        leaf = taxa_lineage.split("|")[-1]
        # Fall back to the scientific name when the leaf has no usable
        # "<prefix>__<name>" part (previously "__" with nothing after it
        # raised AttributeError on the failed match).
        mym = re.search(r"__([\S]+)", leaf)
        myid = mym.group(1) if mym else taxa_name
        taxa_map[myid] = taxa_id + "\t" + taxa_name + "\t" + taxa_rank + "\t" + taxa_lineage
    return taxa_map
def collect_interaction_info(int_file, filter_flag, human_pfam):  # INTERACTION.txt
    """Build a symmetric lookup of interactions from a "|"-separated file.

    For each non-blank line the first two fields are the two identifiers and
    the second-to-last field is stored as the interaction level.
    Self-interactions are dropped. When filter_flag equals "yes", pairs in
    which neither identifier is contained in human_pfam are dropped too.

    Returns:
        dict: {id_a: {id_b: level}}, with every kept pair stored in both
        directions.
    """
    only_human_related = filter_flag == "yes"
    interact = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(int_file):
        record = raw.strip()
        if not record:
            continue
        parts = record.split("|")
        first, second, level = parts[0], parts[1], parts[-2]
        if first == second:
            # self-interaction
            continue
        if only_human_related and first not in human_pfam and second not in human_pfam:
            # neither partner is a human pfam
            continue
        interact.setdefault(first, {})[second] = level
        interact.setdefault(second, {})[first] = level
    return interact
def collect_pfam_info(map_file):  # uniprot_human_pfam.tsv
    """Return {Pfam ID: gene names} for rows whose NCBI_TaxID is "9606".

    The header is the line starting with "Pfam<TAB>"; it must name the
    columns "Pfam", "Gene_names" and "NCBI_TaxID". Blank lines, lines starting
    with "#", and rows whose field count differs from the number of header
    names are skipped.
    """
    pfams = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(map_file):
        row = raw.strip()
        if not row or row.startswith("#"):
            continue
        cells = row.split("\t")
        if row.startswith("Pfam\t"):
            for position, label in enumerate(cells):
                column_of[label] = position
            continue
        if len(cells) != len(column_of):
            continue
        pfam = cells[column_of["Pfam"]]
        gene = cells[column_of["Gene_names"]]
        if cells[column_of["NCBI_TaxID"]] == "9606":
            # keep human pfams only
            pfams[pfam] = gene
    return pfams
def collect_mapping_info(map_file):  # uniprot_human_pfam.tsv
    """Return the Pfam IDs seen with taxon "9606" and all taxa each maps to.

    The header is the line starting with "Pfam<TAB>"; data rows are read
    through the "Pfam", "Organism" and "NCBI_TaxID" columns. Blank lines and
    lines starting with "#" are skipped.

    Returns:
        dict: {pfam: "taxid1;taxid2;..."} with the taxon IDs sorted as
        strings, restricted to Pfam IDs for which "9606" was observed.
    """
    taxa_by_pfam = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(map_file):
        row = raw.strip()
        if not row or row.startswith("#"):
            continue
        cells = row.split("\t")
        if row.startswith("Pfam\t"):
            for position, label in enumerate(cells):
                column_of[label] = position
            continue
        pfam = cells[column_of["Pfam"]]
        # The organism value is not used, but the lookup is kept so that a
        # file without this column fails exactly as before.
        organism = cells[column_of["Organism"]]
        taxa_id = cells[column_of["NCBI_TaxID"]]
        taxa_by_pfam.setdefault(pfam, {})[taxa_id] = ""
    return {
        pfam: ";".join(sorted(taxa))
        for pfam, taxa in taxa_by_pfam.items()
        if "9606" in taxa
    }
def extract_annotation_info(datfile, output_path):  # Pfam-A.hmm.dat
    """Write a Pfam accession -> description table and gzip it.

    The input is scanned for "#=GF AC" lines (accession, truncated at the
    first ".") and "#=GF DE" lines (description). Each description is stored
    under the most recently seen accession; several descriptions for one
    accession are joined with ";".

    Args:
        datfile: path to the annotation file; the process exits via sys.exit
            when it is not an existing file.
        output_path: directory that receives "pfam_descriptions.txt", which is
            then compressed by the external `gzip` program.

    Returns:
        None.
    """
    import subprocess  # local import: only this step needs it

    if not os.path.isfile(datfile):
        sys.exit("Error: pfam file doesn't exit! " + datfile)
    outfile = os.path.join(output_path, "pfam_descriptions.txt")

    anns = {}
    myid = ""
    for line in utils.gzip_bzip2_biom_open_readlines(datfile):
        line = line.strip()
        if not line:
            continue
        if re.search(r"\#=GF\s+AC", line):
            myid = re.search(r"\#=GF\s+AC\s+([^\.]+)", line).group(1)
            continue
        if re.search(r"\#=GF\s+DE", line):
            myann = re.search(r"\#=GF\s+DE\s+([\S\s]+)", line).group(1)
            if myid in anns:
                anns[myid] = anns[myid] + ";" + myann
            else:
                anns[myid] = myann

    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as open_file:
        open_file.write("Pfam\tdescription\n")
        for mypfam in sorted(anns):
            open_file.write(mypfam + "\t" + anns[mypfam] + "\n")

    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to gzip verbatim. As before, the exit
    # status is not checked.
    subprocess.run(["gzip", outfile], check=False)
def collect_pfam2go_info(annfile):  # Pfam2GO.txt
    """Return {Pfam ID: {GO ID: "<description>\\t<category>"}}.

    Rows are tab-separated: Pfam ID, GO ID, description, category. Blank
    lines and lines starting with "#" are skipped. The categories "process",
    "function" and "component" are stored as "BP", "MF" and "CC"; any other
    category text is stored unchanged.
    """
    short_category = {"process": "BP", "function": "MF", "component": "CC"}
    pfam2go = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(annfile):
        row = raw.strip()
        if row == "" or row.startswith("#"):
            continue
        cells = row.split("\t")
        mypfam, mygo, mydec, mytype = cells[0], cells[1], cells[2], cells[3]
        mytype = short_category.get(mytype, mytype)
        pfam2go.setdefault(mypfam, {})[mygo] = mydec + "\t" + mytype
    return pfam2go
def collect_expression_info(exp_list):  # expression.list
    """Collect the second column of every data row of the listed files.

    Args:
        exp_list: iterable of file paths; blank entries and entries starting
            with "#" are ignored.

    Within each file, blank lines and lines starting with "#" or "Gene ID"
    are skipped; the second tab-separated field of every other line is
    recorded.

    Returns:
        dict: {second-column value: ""}.
    """
    expression = {}
    for entry in exp_list:
        path = entry.strip()
        if not path or path.startswith("#"):
            continue
        for raw in utils.gzip_bzip2_biom_open_readlines(path):
            row = raw.strip()
            if not row:
                continue
            if row.startswith(("#", "Gene ID")):
                continue
            expression[row.split("\t")[1]] = ""
    return expression
def collect_taxa_list(taxa_file):  # uniprot_taxaID_bac-arc-vir.tsv
    """Return {line: ""} for every non-blank, whitespace-stripped line."""
    stripped = (raw.strip() for raw in utils.gzip_bzip2_biom_open_readlines(taxa_file))
    return {entry: "" for entry in stripped if entry}
def collect_pfam_info(pfamfile):  # Pfam_ann.tsv
    """Return {first column: second column} of a tab-separated file.

    Blank lines and lines starting with "Pfam" (the header) are skipped. When
    a key repeats, the last row wins.
    """
    pfam = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfamfile):
        row = raw.strip()
        if not row or row.startswith("Pfam"):
            continue
        cells = row.split("\t")
        pfam[cells[0]] = cells[1]
    return pfam
def collect_taxonomy_info(taxa_file, taxa_hits):  # uniprot_taxonomy.map.tsv
    """Load taxonomy rows for the taxa contained in taxa_hits.

    The header is the line starting with "Taxon" and must provide the columns
    "Taxon", "Scientific_name", "Lineage" and "Rank". Rows whose taxon is not
    in taxa_hits, or whose first "|"-separated lineage element contains
    "unclassified_sequences", are skipped.

    When the last lineage element contains "__", it is parsed as
    "<code>__<name>": <name> becomes the key of the second mapping and <code>
    replaces the rank, with the one-letter codes k/p/c/o/f/g/s/t spelled out
    (Kingdom ... Terminal). Otherwise the scientific name is the key.

    Returns:
        tuple: (taxa, taxa_map) where
            taxa     = {taxon: "name\\trank\\tlineage"}
            taxa_map = {key: "taxon\\tname\\trank\\tlineage"}
    """
    rank_names = {
        "k": "Kingdom", "p": "Phylum", "c": "Class", "o": "Order",
        "f": "Family", "g": "Genus", "s": "Species", "t": "Terminal",
    }
    taxa = {}
    taxa_map = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(taxa_file):
        row = raw.strip()
        if not row:
            continue
        cells = row.split("\t")
        if row.startswith("Taxon"):
            # title line: first position of each column name
            column_of.update({label: cells.index(label) for label in cells})
            continue
        mytaxa = cells[column_of["Taxon"]]
        if mytaxa not in taxa_hits:
            continue
        myname = cells[column_of["Scientific_name"]]
        myline = cells[column_of["Lineage"]]
        myrank = cells[column_of["Rank"]]
        lineage = myline.split("|")
        if "unclassified_sequences" in lineage[0]:
            continue
        key = myname
        leaf = lineage[-1]
        if "__" in leaf:
            parsed = re.search(r"([^\_]+)__([\S]+)", leaf)
            code = parsed.group(1)
            key = parsed.group(2)
            myrank = rank_names.get(code, code)
        taxa_map[key] = "\t".join((mytaxa, myname, myrank, myline))
        taxa[mytaxa] = "\t".join((myname, myrank, myline))
    return taxa, taxa_map
def report_each_map(uniref_ann, map_type, uniref_type, output_path):
    """Write a "<value>\\t<id>\\t<id>..." map for one annotation column.

    The annotation table is tab-separated. A line starting with "ID",
    "UniRef<TAB>" or "UniRefID<TAB>" is the header. Header names are compared
    to map_type after replacing "(" with "_" and dropping ")". For every data
    row, the value found in each matching column (unless it is "NA", "na" or
    "NaN") is associated with the row identifier.

    The row identifier comes from the "UniRef" column, else "ID", else
    "UniRefID". The last fallback is new: the header test always accepted
    "UniRefID", but the lookup did not, leaving the identifier unbound.

    Args:
        uniref_ann: path of the annotation table.
        map_type: normalized column name to export.
        uniref_type: label used in the output file name.
        output_path: directory receiving "map_<map_type>_<uniref_type>.txt",
            which is then compressed by the external `gzip` program.

    Raises:
        ValueError: a data row is met before any header that provides an
            identifier column.
    """
    import subprocess  # local import: only this step needs it

    outfile = os.path.join(output_path, "map_" + map_type + "_" + uniref_type + ".txt")
    missing_values = ("NA", "na", "NaN")
    outs = {}
    target_columns = []  # positions whose normalized header equals map_type
    id_column = None

    for line in utils.gzip_bzip2_biom_open_readlines(uniref_ann):
        line = line.strip()
        if not line:
            continue
        info = line.split("\t")
        if line.startswith("ID") or line.startswith(("UniRef\t", "UniRefID\t")):
            # Resolve everything that depends only on the header once,
            # instead of re-normalizing the column name for every cell.
            target_columns = [
                position
                for position, label in enumerate(info)
                if label.replace("(", "_").replace(")", "") == map_type
            ]
            id_column = None
            for candidate in ("UniRef", "ID", "UniRefID"):
                if candidate in info:
                    id_column = info.index(candidate)
                    break
            continue

        if id_column is None:
            raise ValueError(
                "no UniRef/ID/UniRefID header column found before data rows in "
                + str(uniref_ann)
            )
        myuniref = info[id_column]
        for position in target_columns:
            if position >= len(info):
                # short row: the column is simply absent
                continue
            myvalue = info[position]
            if myvalue in missing_values:
                continue
            outs.setdefault(myvalue, {})[myuniref] = ""

    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as open_out:
        for myid in sorted(outs):
            open_out.write(myid + "\t" + "\t".join(sorted(outs[myid])) + "\n")

    # Argument list instead of a shell string, so the path is never
    # interpreted by a shell. The exit status is ignored, as before.
    subprocess.run(["gzip", outfile], check=False)
def collect_pfam_ann(pfamfile):  # Pfam_ann.tsv
    """Return {first column: second column}, keeping the first row per key.

    Blank lines and lines starting with "#" are skipped.
    """
    pfams = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfamfile):
        row = raw.strip()
        if not row or row.startswith("#"):
            continue
        cells = row.split("\t")
        pfam, ann = cells[0], cells[1]
        pfams.setdefault(pfam, ann)
    return pfams
def collect_taxonomy_info(map_file):
    """Index taxonomy rows by the name at the end of their lineage.

    The header is the line starting with "Taxon" and must provide the columns
    "Taxon", "Scientific_name", "Rank" and "Lineage".

    When the last "|"-separated lineage element contains "__", it is parsed
    as "<code>__<name>": <name> becomes the key and <code> replaces the rank,
    with the one-letter codes k/p/c/o/f/g/s/t spelled out (Kingdom ...
    Terminal). Otherwise the scientific name is the key and the "Rank" column
    is used as-is.

    Returns:
        dict: {key: "taxon\\tname\\trank\\tlineage"}.
    """
    rank_names = {
        "k": "Kingdom", "p": "Phylum", "c": "Class", "o": "Order",
        "f": "Family", "g": "Genus", "s": "Species", "t": "Terminal",
    }
    taxa_map = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(map_file):
        row = raw.strip()
        if not row:
            continue
        cells = row.split("\t")
        if row.startswith("Taxon"):
            # title line: first position of each column name
            column_of.update({label: cells.index(label) for label in cells})
            continue
        taxa_id = cells[column_of["Taxon"]]
        taxa_name = cells[column_of["Scientific_name"]]
        taxa_rank = cells[column_of["Rank"]]
        taxa_lineage = cells[column_of["Lineage"]]
        key = taxa_name
        leaf = taxa_lineage.split("|")[-1]
        if "__" in leaf:
            parsed = re.search(r"([^\_]+)__([\S]+)", leaf)
            code = parsed.group(1)
            key = parsed.group(2)
            taxa_rank = rank_names.get(code, code)
        taxa_map[key] = "\t".join((taxa_id, taxa_name, taxa_rank, taxa_lineage))
    return taxa_map
def extract_mapping_info(mapfile):  # uniprot_taxonomy.map.tsv
    """Return {taxon ID: scientific name} from a tab-separated taxonomy map.

    The header is the line starting with "Taxon"; the "Taxon" and
    "Scientific_name" columns are read from every other non-blank line.
    """
    maps = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(mapfile):
        row = raw.strip()
        if not row:
            continue
        cells = row.split("\t")
        if row.startswith("Taxon"):
            column_of.update({label: cells.index(label) for label in cells})
            continue
        taxa_id = cells[column_of["Taxon"]]
        maps[taxa_id] = cells[column_of["Scientific_name"]]  # TaxID <-> Tax
    return maps
def collect_pfam_info(pfam_file):  # pdb_chain_pfam.tsv
    """Return {PDB: {chain: {Pfam: ""}}} from a tab-separated mapping file.

    The PDB ID is the first field, the chain the second, and the Pfam ID the
    second-to-last field. Blank lines and lines starting with "#" or "PDB"
    are skipped.
    """
    pfams = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(pfam_file):
        row = raw.strip()
        if not row or row.startswith(("#", "PDB")):
            continue
        cells = row.split("\t")
        mypdb, mychain, mypfam = cells[0], cells[1], cells[-2]
        pfams.setdefault(mypdb, {}).setdefault(mychain, {})[mypfam] = ""
    return pfams
def collect_taxanomy_info(map_file):  # pdb_chain_taxonomy.tsv
    """Return {PDB: {chain: {taxon: ""}}} from a tab-separated mapping file.

    The first three fields of each data row are the PDB ID, the chain and the
    taxon. Blank lines and lines starting with "#" or "PDB" are skipped.

    The per-PDB `taxa_hit` index the original also filled was never read
    (its only consumer was a commented-out filter that referenced an
    undefined `filter_flag`), so it is no longer built.
    """
    taxa = {}
    for line in utils.gzip_bzip2_biom_open_readlines(map_file):
        line = line.strip()
        if not line or line.startswith(("#", "PDB")):
            continue
        info = line.split("\t")
        mypdb, mychain, tax = info[0], info[1], info[2]
        taxa.setdefault(mypdb, {}).setdefault(mychain, {})[tax] = ""
    return taxa
def read_vignettes_file(vignettes_file, specific_annotation):
    """
    Collect the annotations of selected vignette types
    Input: vignettes filename; container of accepted vignette types
    Output: vignettes = {annotation: "", ...}

    The file is tab-separated. The line starting with "type" is the header.
    A row is kept when its "type" value, either as written or lower-cased,
    is contained in specific_annotation. Rows are skipped with a logged
    warning when the header has no "annotation" column.
    """
    config.logger.info("Start read_vignettes_file")

    vignettes = {}
    column_of = {}
    for raw in utils.gzip_bzip2_biom_open_readlines(vignettes_file):
        row = raw.strip()
        if not row or row.startswith("#"):
            continue
        cells = row.split("\t")
        if row.startswith("type"):
            column_of.update({label: cells.index(label) for label in cells})
            continue
        mytype = cells[column_of["type"]]
        if not (mytype.lower() in specific_annotation or mytype in specific_annotation):
            continue
        if "annotation" not in column_of:
            # debug
            config.logger.info("WARNING! No annotation info!\t" + row)
            continue
        vignettes[cells[column_of["annotation"]]] = ""

    config.logger.info("Finish read_vignettes_file")
    return vignettes
def collect_basic_info(uniref_list, hits):
    """Gather per-cluster annotations from a list of "map_*" files.

    Args:
        uniref_list: iterable of file paths. Blank entries, entries starting
            with "#", missing files (logged), files whose base name does not
            start with "map_" or does not contain "uniref90"/"uniref50", and
            files whose derived annotation name was already seen are skipped.
        hits: container of cluster IDs of interest.

    The annotation name is derived from the base name: "map_" is removed,
    everything from "_uniref" on is dropped, "go"/"ko"/"eggnog"/"pfam"/
    "level4ec" get their canonical capitalisation, and names starting with
    "uniref<digits>_name" become "Protein_names".

    Each data line holds an annotation value followed by cluster IDs
    (tab-separated). The value is recorded for every listed ID that is in
    hits; several values for one ID and annotation are joined with ";".

    Returns:
        tuple: (uniref_info, names) where names lists the annotation names in
        the order first met, and uniref_info maps each ID to
        "<id>\\t<value or NA per name>".
    """
    canonical = {
        "go": "GO", "ko": "KO", "eggnog": "eggNOG",
        "pfam": "Pfam", "level4ec": "Level4EC",
    }
    collected = {}
    names = []
    hits_items = set(hits)  # no need to sort before building a set

    for myfile in uniref_list:
        myfile = myfile.strip()
        if not myfile or myfile.startswith("#"):
            continue
        if not os.path.isfile(myfile):
            config.logger.info("ERROR! File does not exist: " + myfile)
            continue
        myname = os.path.basename(myfile)
        if not myname.startswith("map_"):
            continue
        if "uniref90" not in myname and "uniref50" not in myname:
            continue
        myname = re.sub("map_", "", myname)
        if re.search(r"^[\S]+_uniref", myname):
            myname = re.sub(r"_uniref[\S]+$", "", myname)
        myname = canonical.get(myname, myname)
        if re.search(r"^uniref[\d]+_name", myname):
            myname = "Protein_names"
        if myname in names:
            # only the first file per annotation type is used
            continue
        names.append(myname)

        for line in utils.gzip_bzip2_biom_open_readlines(myfile):
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            info = line.split("\t")
            myvalue = info[0]
            for mykey in hits_items.intersection(info[1:]):
                per_id = collected.setdefault(mykey, {})
                if myname in per_id:
                    per_id[myname] = per_id[myname] + ";" + myvalue
                else:
                    per_id[myname] = myvalue

    uniref_info = {
        myid: "\t".join([myid] + [per_id.get(name, "NA") for name in names])
        for myid, per_id in collected.items()
    }
    return uniref_info, names
def extract_annotation_info(output_path, maps):
    """Load "uniprot_annotation.tsv.gz" and export the human Pfam table.

    Args:
        output_path: directory containing "uniprot_annotation.tsv.gz"; the
            process exits via sys.exit when that file is missing. The file
            "uniprot_human_pfam.tsv" is written here and then compressed by
            the external `gzip` program.
        maps: mapping from NCBI taxon ID to taxon name.

    The header is the line starting with "Entry"; its column names are
    normalized (see `normalize` below). Lines starting with "#" are skipped.
    Empty cells are read as "NA".

    Returns:
        tuple: (anns, header) where anns maps the second field of each row to
        "<taxon name>\\t<taxon ID>\\t<one value per exported column>" and
        header is the matching tab-joined title line starting with
        "Rep_Tax\\tRep_TaxID".
    """
    import subprocess  # local import: only this step needs it

    datfile = os.path.join(output_path, "uniprot_annotation.tsv.gz")
    if not os.path.isfile(datfile):
        sys.exit(
            "Error: uniprot annotation file (uniprot_annotation.tsv.gz) doesn't exit, please prepare for it"
        )
    items = [
        "UniProtKB", "Entry_name", "Organism", "Gene_names", "Length", "GO_BP",
        "GO_MF", "GO_CC", "KEGG", "KEGG-KO", "eggNOG", "Interpro",
        "Taxonomic_lineage", "Subcellular_location", "Transmembrane",
        "Signal_peptide", "Pfam"
    ]
    exact_names = {
        "Entry": "UniProtKB",
        "Entry name": "Entry_name",
        "Gene names": "Gene_names",
        "Gene ontology (biological process)": "GO_BP",
        "Gene ontology (molecular function)": "GO_MF",
        "Gene ontology (cellular component)": "GO_CC",
    }
    substring_names = (
        ("KEGG", "KEGG"),
        ("(KO)", "KEGG-KO"),
        ("COG", "eggNOG"),
        ("Taxonomic", "Taxonomic_lineage"),
        ("Subcellular location", "Subcellular_location"),
        ("Signal peptide", "Signal_peptide"),
        ("Pfam", "Pfam"),
    )

    def normalize(item):
        # The rules are applied one after another on the running value, in
        # this order, exactly like the original chain of `if` statements.
        item = exact_names.get(item, item)
        for needle, name in substring_names:
            if needle in item:
                item = name
        if item == "Protein names":
            item = "Protein_names"
        return item

    anns = {}
    human_pfams = {}  # {pfam: {uniprot_id: (protein, gene)}}
    titles = {}  # column position -> normalized column name
    for line in utils.gzip_bzip2_biom_open_readlines(datfile):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        info = line.split("\t")
        if line.startswith("Entry"):
            for position, item in enumerate(info):
                item = normalize(item)
                if item in items or item in ("NCBI_TaxID", "Protein_names"):
                    titles[position] = item
            continue

        row_key = info[1]
        values = {}
        tax = "NA"
        taxID = "NA"
        uniprot_id = "NA"
        gene = "NA"
        protein = "NA"
        pfam_item = "NA"
        for position, item in enumerate(info):
            myid = titles.get(position)
            if myid is None:
                continue
            if item == "":
                item = "NA"
            if myid == "NCBI_TaxID":
                # The taxon ID goes to the row prefix, not to the column list.
                taxID = item
                if taxID in maps:
                    tax = maps[taxID]
                continue
            if item != "NA":
                if myid == "UniProtKB":
                    uniprot_id = item
                elif myid == "Gene_names":
                    gene = re.sub(";$", "", item)
                    gene = re.sub(r"\{[^\{]+\}", "", gene)
                    gene = re.sub(r"\s+", ";", gene)
                elif myid == "Protein_names":
                    protein = re.sub(";$", "", item)
                    protein = re.sub(r"\{[^\{]+\}", "", protein)
                    protein = re.sub(r"\s+$", "", protein)
                elif myid == "Pfam":
                    pfam_item = item
            values[myid] = item

        anns[row_key] = (
            tax + "\t" + taxID + "\t" + "\t".join(values.get(name, "NA") for name in items)
        )

        if taxID == "9606" and pfam_item != "NA":
            # human proteins
            for pfam_id in re.sub(r"\s+", "", pfam_item).split(";"):
                if not pfam_id:
                    # A trailing or doubled ";" used to create a Pfam row
                    # with an empty ID.
                    continue
                members = human_pfams.setdefault(pfam_id, {})
                if uniprot_id != "NA":
                    members[uniprot_id] = (protein, gene)

    # report human pfams
    outfile = os.path.join(output_path, "uniprot_human_pfam.tsv")
    # The context manager closes the file even if a write fails.
    with open(outfile, "w") as open_out:
        open_out.write(
            "Pfam\tOrganism\tNCBI_TaxID\tUniProtKB\tProtein_names\tGene_names\n")
        for pfam_id in sorted(human_pfams):
            members = human_pfams[pfam_id]
            accessions = sorted(members)
            proteins = [members[acc][0] for acc in accessions if members[acc][0] not in ("NA", "")]
            genes = [members[acc][1] for acc in accessions if members[acc][1] not in ("NA", "")]
            open_out.write(
                pfam_id + "\tHomo sapiens (Human)\t9606"
                + "\t" + (";".join(accessions) or "NA")
                + "\t" + (";".join(proteins) or "NA")
                + "\t" + (";".join(genes) or "NA")
                + "\n"
            )

    # Argument list instead of a shell string, so the path is never
    # interpreted by a shell. The exit status is ignored, as before.
    subprocess.run(["gzip", outfile], check=False)

    header = "Rep_Tax\tRep_TaxID\t" + "\t".join(items)
    return anns, header
def extract_annotation_info(datfile, output_path):
    """Convert a UniProt flat file into "uniprot_annotation.tsv.gz".

    Args:
        datfile: path of the flat file; the process exits via sys.exit when it
            is not an existing file.
        output_path: directory for "uniprot_annotation.tsv", which is then
            compressed by the external `gzip` program.

    Returns:
        str: path of the ".tsv.gz" file. If that file (or the uncompressed
        ".tsv") already exists the conversion is skipped, a message is logged
        and the path is returned. The path is now also returned after a fresh
        conversion, where the original fell off the end and returned None.

    Parsing, line by line (lines are stripped first; "//" ends a record):
        ID   entry name and sequence length
        AC   accession; only the first accession of a record is kept, later
             AC lines used to overwrite it
        GN   the text after "=" of each ";"-separated token; lines that yield
             no name (e.g. "GN   and") are ignored instead of appending an
             empty name
        DE   text after the last "Full=" on the line
        OS / OX / OC   organism, NCBI taxon ID, lineage
        DR   KEGG, KO, eggNOG, InterPro, Pfam identifiers and GO terms, the
             latter split by their P:/F:/C: prefix
        CC   the SUBCELLULAR LOCATION topic and its continuation lines
        FT   one "TRANSMEM" / "SIGNAL" token per matching feature line
    Repeated values of one field are joined with ";". Missing fields are
    written as "NA". On output the organism is appended to the lineage column.
    """
    import subprocess  # local import: only this step needs it

    title = "Entry\tEntry name\tGene names\tProtein names\tOrganism\tNCBI_TaxID\tLength\tGene ontology (biological process)\tGene ontology (molecular function)\tGene ontology (cellular component)\tCross-reference (KEGG)\tCross-reference (KO)\tCOG\tInterpro\tTaxonomic lineage (ALL)\tSubcellular location [CC]\tTransmembrane\tSignal peptide\tCross-reference (Pfam)"
    outfile = os.path.join(output_path, "uniprot_annotation.tsv")
    outfile1 = os.path.join(output_path, "uniprot_annotation.tsv.gz")
    if not os.path.isfile(datfile):
        sys.exit("Error: please download the uniprot dat file!")
    if os.path.isfile(outfile) and not os.path.isfile(outfile1):
        subprocess.run(["gzip", outfile], check=False)
        config.logger.info("WARNING! Already exist file and skip this step: " + outfile1)
        return outfile1
    if os.path.isfile(outfile1):
        config.logger.info("WARNING! Already exist file and skip this step: " + outfile1)
        return outfile1

    # Record fields in the column order of `title`.
    columns = (
        "entry", "entry_name", "gene_name", "prot_name", "organism", "taxa",
        "length", "go_bp", "go_mf", "go_cc", "kegg", "ko", "cog", "interpro",
        "tax", "sub", "trans", "signal", "pfam",
    )
    xref_columns = {
        "KEGG": "kegg", "KO": "ko", "eggNOG": "cog",
        "InterPro": "interpro", "Pfam": "pfam",
    }
    go_columns = (("P:", "go_bp"), ("F:", "go_mf"), ("C:", "go_cc"))

    def add(rec, key, value):
        # First value replaces the "NA" placeholder, later ones are appended.
        rec[key] = value if rec[key] == "NA" else rec[key] + ";" + value

    def flush(rec, out):
        # Records without an accession are not written.
        if rec["entry"] == "NA":
            return
        rec["tax"] = rec["tax"] + ";" + rec["organism"]
        out.write("\t".join(rec[name] for name in columns) + "\n")

    rec = dict.fromkeys(columns, "NA")
    in_subcellular = False  # inside a CC SUBCELLULAR LOCATION topic
    # The context manager closes the file even if parsing fails midway.
    with open(outfile, "w") as open_out:
        open_out.write(title + "\n")
        for line in utils.gzip_bzip2_biom_open_readlines(datfile):
            line = line.strip()
            if not line:
                continue

            if line == "//":
                # end of one entry
                flush(rec, open_out)
                rec = dict.fromkeys(columns, "NA")
                in_subcellular = False
                continue

            if re.search(r"^ID\s+", line):
                mym = re.search(r"^ID\s+([\S]+)\s+[\s\S]+\s+([\d]+)\s+AA", line)
                if mym:
                    rec["entry_name"] = mym.group(1)
                    rec["length"] = mym.group(2)
                continue

            if re.search(r"^AC\s+", line):
                if rec["entry"] == "NA":
                    mym = re.search(r"AC\s+([\S]+)", line)
                    rec["entry"] = re.sub(";$", "", mym.group(1))
                continue

            if re.search(r"^GN\s+", line):
                mym = re.search(r"GN\s+([\S]+[\s\S]+)", line)
                if mym:
                    tokens = re.sub(r"\s+", "", mym.group(1)).split(";")
                    found = (re.search(r"\=([^\=]+)", token) for token in tokens)
                    gene_names = ";".join(hit.group(1) for hit in found if hit)
                    if gene_names:
                        add(rec, "gene_name", gene_names)
                continue

            mym = re.search(r"^DE\s+[\s\S]+Full\=([\S\s]+)", line)
            if mym:
                # protein name
                add(rec, "prot_name", re.sub(";$", "", mym.group(1)))
                continue

            if re.search(r"^OS\s+", line):
                mym = re.search(r"^OS\s+([\S\s]+)", line)
                add(rec, "organism", re.sub(r"\.$", "", mym.group(1)))
                continue

            if re.search(r"^OX\s+", line):
                mym = re.search(r"^OX\s+NCBI_TaxID=([\d]+)", line)
                if mym:
                    add(rec, "taxa", mym.group(1))
                continue

            if re.search(r"^OC\s+", line):
                mym = re.search(r"^OC\s+([\S\s]+)", line)
                lineage = re.sub(r"\.$", "", mym.group(1))
                add(rec, "tax", re.sub(";$", "", lineage))
                continue

            mym = re.search(r"^DR\s+([^;\s]+);", line)
            if mym and (mym.group(1) == "GO" or mym.group(1) in xref_columns):
                database = mym.group(1)
                tmp = line[mym.start(1):].split("; ")
                if database == "GO":
                    if len(tmp) > 2:
                        go = tmp[1]
                        go_info = tmp[2]
                        for prefix, column in go_columns:
                            if go_info.startswith(prefix):
                                go_info = go_info[len(prefix):]
                                add(rec, column, go_info + " [" + go + "]")
                elif len(tmp) > 1:
                    add(rec, xref_columns[database], tmp[1])
                continue

            if re.search(r"^CC\s+", line):
                mym = re.search(
                    r"^CC\s+\-\!\-\s+SUBCELLULAR\s+LOCATION\s*([\s\S]*)", line)
                if mym:
                    in_subcellular = True
                    add(rec, "sub", "SUBCELLULAR LOCATION" + mym.group(1))
                elif re.search(r"^CC\s+\-\!\-", line):
                    # another topic starts
                    in_subcellular = False
                elif in_subcellular:
                    mym = re.search(r"^CC\s+(\S+[\s\S]+)", line)
                    if mym:
                        rec["sub"] = rec["sub"] + " " + re.sub(r"\.$", "", mym.group(1))
                continue

            if re.search(r"^FT\s+", line):
                if re.search(r"^FT\s+TRANSMEM\s+", line):
                    add(rec, "trans", "TRANSMEM")
                if re.search(r"^FT\s+SIGNAL\s+", line):
                    add(rec, "signal", "SIGNAL")
                continue

        # A record not terminated by "//" is written with the same column
        # layout as every other record. The original wrote this row
        # unconditionally, without the gene column and with COG/InterPro
        # swapped.
        flush(rec, open_out)

    # Argument list instead of a shell string, so the path is never
    # interpreted by a shell. The exit status is ignored, as before.
    subprocess.run(["gzip", outfile], check=False)
    return outfile1
def format_taxonomy_info(taxafile, output_path):  # uniprot_taxonomy.tsv
    """Rewrite a taxonomy table with prefixed, "|"-separated lineages.

    Input (tab-separated): the header is the line starting with "Taxon" and
    must provide "Taxon", "Scientific name", "Rank", "Lineage" and "Parent".
    Lines starting with "#" and rows with an empty scientific name are
    skipped; rows shorter than the header are padded with empty fields.

    Every taxon gets a level label: its name with whitespace runs replaced by
    "_", prefixed with k__/p__/c__/o__/f__/g__/s__ according to the rank
    group its rank belongs to, or with t__ when the rank is empty. The
    lineage (split on "; ") is rewritten the same way for names seen at a
    ranked level; the parent's level label is inserted when the lineage does
    not already contain it, and the taxon's own label is appended.

    Outputs, written to output_path and each compressed by the external
    `gzip` program:
        uniprot_taxonomy.tsv            Taxon, Scientific_name, Rank,
                                        Lineage, Parent
        uniprot_taxaID_microbiome.txt   taxa whose lineage contains
                                        k__Viruses, k__Bacteria, k__Archaea
                                        or k__Fungi
        uniprot_taxaID_mammalia.txt     taxa whose lineage contains
                                        c__Mammalia

    Returns:
        None.
    """
    import subprocess  # local import: only this step needs it

    whitespace = re.compile(r"\s+")
    # (prefix, ranks that map to it, scientific names seen with such a rank)
    rank_groups = [
        ("k__", ["Superkingdom", "Kingdom", "Subkingdom"], set()),
        ("p__", ["Superphylum", "Phylum", "Subphylum"], set()),
        ("c__", ["Superclass", "Class", "Subclass", "Infraclass", "Cohort"], set()),
        ("o__", ["Superorder", "Order", "Suborder", "Infraorder", "Parvorder"], set()),
        ("f__", ["Superfamily", "Family", "Subfamily", "Tribe", "Subtribe"], set()),
        ("g__", ["Genus", "Subgenus"], set()),
        ("s__", ["Species", "Species group", "Species subgroup", "Species subsgroup",
                 "Subspecies", "Forma", "Varietas"], set()),
    ]
    # Checked in this order when the rank is empty; a later match overrides
    # an earlier one.
    rank_from_level = (
        ("t__", "Terminal"), ("s__", "Species"), ("g__", "Genus"),
        ("f__", "Family"), ("o__", "Order"), ("c__", "Class"),
        ("p__", "Phylum"), ("k__", "Kingdom"),
    )

    def write_and_gzip(path, lines):
        # The context manager closes the file even if a write fails; gzip is
        # run with an argument list so the path never goes through a shell.
        with open(path, "w") as handle:
            handle.writelines(lines)
        subprocess.run(["gzip", path], check=False)

    # collect info
    titles = {}
    taxa_info = {}  # taxon -> (name, rank, lineage, level label, parent)
    title_num = 0
    for line in utils.gzip_bzip2_biom_open_readlines(taxafile):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        info = line.split("\t")
        if line.startswith("Taxon"):
            for position, item in enumerate(info):
                titles[item] = position
            title_num = len(info)
            continue
        if len(info) < title_num:
            # trailing empty cells were removed by strip()
            info.extend([""] * (title_num - len(info)))
        mytaxa = info[titles["Taxon"]]
        myname = info[titles["Scientific name"]]
        if myname == "":
            continue
        myrank = info[titles["Rank"]]
        myline = info[titles["Lineage"]]
        mypar = info[titles["Parent"]]
        base_level = whitespace.sub("_", myname)
        mylevel = base_level
        for prefix, ranks, names in rank_groups:
            if myrank in ranks:
                mylevel = prefix + base_level
                names.add(myname)
        taxa_info[mytaxa] = (myname, myrank, myline, mylevel, mypar)

    # format taxonomy info
    mic_types = ["k__Viruses", "k__Bacteria", "k__Archaea", "k__Fungi"]
    mammalia_types = ["c__Mammalia"]
    mic_ids = {}
    mammalia_ids = {}
    rows = ["Taxon\tScientific_name\tRank\tLineage\tParent\n"]
    for mytaxa in sorted(taxa_info):
        myname, myrank, myline, mylevel, mypar = taxa_info[mytaxa]
        if myrank == "":
            mylevel = "t__" + mylevel

        mystr = ""
        for item in myline.split("; "):
            for prefix, _ranks, names in rank_groups:
                if item in names:
                    item = prefix + item
            if item != "":
                mystr = mystr + whitespace.sub("_", item) + "|"

        # check parent: add it when the lineage skipped the nearest ancestor
        if mypar in taxa_info:
            mypar_level = taxa_info[mypar][3]
            if mypar_level not in mystr.split("|"):
                mystr = mystr + mypar_level + "|"
        mystr = mystr + mylevel

        # rename empty rank
        if myrank == "":
            for marker, rank_name in rank_from_level:
                if marker in mylevel:
                    myrank = rank_name

        rows.append(mytaxa + "\t" + myname + "\t" + myrank + "\t" + mystr + "\t" + mypar + "\n")
        if any(marker in mystr for marker in mic_types):
            mic_ids[mytaxa] = ""
        if any(marker in mystr for marker in mammalia_types):
            mammalia_ids[mytaxa] = ""

    write_and_gzip(os.path.join(output_path, "uniprot_taxonomy.tsv"), rows)

    # output microbiome and mammalia taxa ids
    write_and_gzip(
        os.path.join(output_path, "uniprot_taxaID_microbiome.txt"),
        [myid + "\n" for myid in sorted(mic_ids)],
    )
    write_and_gzip(
        os.path.join(output_path, "uniprot_taxaID_mammalia.txt"),
        [myid + "\n" for myid in sorted(mammalia_ids)],
    )