def convert(self, infile, count=False):
    """Parse an eggNOG-mapper '.emapper.annotations' file into per-genome NOGs.

    Parameters
    ----------
    infile : str
        Path to the '.emapper.annotations' TSV produced by eggNOG-mapper.
    count : bool, optional
        If True, return per-genome NOG counts instead of de-duplicated lists.

    Returns
    -------
    dict
        ``{genome: {nog: count}}`` when ``count`` is True, otherwise
        ``{genome: [nog, ...]}`` (de-duplicated).

    Side effects: populates ``self.gene_id2genome`` if it is empty,
    deriving the genome name from the gene-id prefix.
    """
    from ete3 import NCBITaxa

    tax_db = NCBITaxa()
    with open(infile) as handle:
        # The header line starts with '#query_name'; scan at most the first
        # 15 lines so a wrong file fails fast instead of reading everything.
        header = None
        counter = 0
        while not header and counter < 15:
            head = handle.readline().rstrip().split("\t")
            if head[0] == "#query_name":
                header = head
            counter += 1
        if not header:
            print(
                "\nYou sure this file is good? Like, is it the '.emapper.annotations' you got from running eggnoggmapper?\n"
            )
            # Fixed: was sys.exit(0) — an error path must not signal success.
            sys.exit(1)

        idx = header.index("eggNOG OGs")
        print("Loading eggNOGs from file.", file=sys.stderr)
        gene_id2eggs = {}
        # Iterate the handle lazily (the original used handle.readlines(),
        # materializing the whole file) and split each line only once.
        for line in tqdm(handle):
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            gene_id2eggs[fields[0]] = fields[idx]

    print("Parsing taxonomies, and simplifying to deepest eggNOG.", file=sys.stderr)
    # Every taxid appearing in any "NOG@taxid" entry.
    taxos = {
        vv.split("@")[1] for v in gene_id2eggs.values() for vv in v.split(",")
    }
    # Lineage length per taxid; acts as a depth proxy for ranking NOGs.
    tax2level = {
        k: len(v) for k, v in tax_db.get_lineage_translator(list(taxos)).items()
    }

    def _level(nog_taxid_pair):
        # Unknown taxids sort last (depth 1000) so they are never preferred.
        return tax2level.get(int(nog_taxid_pair[1]), 1000)

    # Keep, per gene, the NOG whose taxid has the shortest lineage.
    gene_id2deepest_egg = {
        k: min([vv.split("@") for vv in v.split(",")], key=_level)[0]
        for k, v in tqdm(gene_id2eggs.items())
    }

    print("Stratify it to genome", file=sys.stderr)
    if not self.gene_id2genome:
        # Fall back to deriving the genome from the gene-id prefix
        # (everything before the last "_"-separated token).
        self.gene_id2genome = {
            k: "_".join(k.split("_")[:-1]) for k in gene_id2deepest_egg
        }
    genome2nog = {k: [] for k in set(self.gene_id2genome.values())}
    for gene, nog in gene_id2deepest_egg.items():
        genome2nog[self.gene_id2genome[gene]].append(nog)

    if count:
        return {
            k: {vv: v.count(vv) for vv in set(v)} for k, v in genome2nog.items()
        }
    return {k: list(set(v)) for k, v in genome2nog.items()}
def findclade(namelist, ranks='family|genus'):
    """Yield ``(name, [clade_name, ...])`` for each taxon name in *namelist*.

    For every input name, look up its NCBI lineage and report the scientific
    name of the ancestor at each rank listed in the '|'-separated *ranks*
    string (empty string when the lineage has no node of that rank).
    """
    ncbi = NCBITaxa()
    wanted_ranks = ranks.split('|')
    name2taxid = ncbi.get_name_translator(namelist)
    lineages = ncbi.get_lineage_translator(
        [ids[0] for ids in name2taxid.values()])

    clade_ids = []
    for name in namelist:
        lineage = lineages[name2taxid[name][0]]
        # Invert the taxid->rank map so each rank points at its taxid.
        rank2clade = {}
        for taxid, rank in ncbi.get_rank(lineage).items():
            rank2clade[rank] = taxid
        # 0 marks "no node of this rank in the lineage".
        clade_ids.append([rank2clade.get(r, 0) for r in wanted_ranks])

    taxid2name = ncbi.get_taxid_translator(chain(*clade_ids))
    for name, ids in zip(namelist, clade_ids):
        yield name, [taxid2name.get(t, '') for t in ids]
# Collect every eggNOG-mapper output file ("*.emapper", skipping names
# containing "__") found anywhere under the directory containing `genus`.
emapper_outs = []
for v in os.walk(os.path.dirname(genus), followlinks=True):
    for vv in v[2]:
        if vv.endswith(".emapper") and not "__" in vv:
            emapper_outs += [pjoin(v[0], vv)]

# For each per-gene annotation record, collapse its comma-separated
# "eggNOG OGs" string to a single representative NOG (memoised in
# `nogs2lowest`) and accumulate functional annotations per NOG and,
# where a mapping exists, per ab-initio gene id.
# NOTE(review): `nogs2lowest`, `nog2ko`, `nog2cogcat`, `nog2ec`, `nog2cazy`,
# `gid2abinit`, the `abinit2*` maps and `tax_db` are defined outside this
# excerpt; the `+= [...]` usage presumably means list-valued defaultdicts —
# confirm against their declarations.
for f in emapper_outs:
    with open(f) as handle:
        res = json.load(handle)  # {gene_id: annotation dict, or falsy if none}
    gid = os.path.basename(f)[:-8]  # strip ".emapper" (8 chars); unused below
    for k, v in res.items():
        if v:
            nogs = v["eggNOG OGs"]
            if nogs not in nogs2lowest:
                # taxid part of each "NOG@taxid" entry
                taxos = [v.split("@")[1] for v in nogs.split(",")]
                # lineage length per taxid; used as a depth proxy
                tax2level = {
                    k: len(v)
                    for k, v in tax_db.get_lineage_translator(
                        list(taxos)).items()
                }
                # pick the NOG whose taxid has the shortest lineage;
                # unknown taxids sort last (1000) so they are never chosen
                lowest_nog = min(
                    [vv.split('@') for vv in nogs.split(",")],
                    key=lambda x: tax2level.get(int(x[1]), 1000))[0]
                nogs2lowest[nogs] = lowest_nog
            else:
                lowest_nog = nogs2lowest[nogs]
            nog2ko[lowest_nog] += [v['KEGG_ko']]
            nog2cogcat[lowest_nog] += [v['COG Functional cat.']]
            nog2ec[lowest_nog] += [v['EC']]
            nog2cazy[lowest_nog] += [v['CAZy']]
            # Mirror the annotations onto the ab-initio gene id, if mapped.
            if k in gid2abinit:
                abinit2ko[gid2abinit[k]] += [v['KEGG_ko']]
                abinit2cogcat[gid2abinit[k]] += [v['COG Functional cat.']]
                abinit2ec[gid2abinit[k]] += [v['EC']]
                abinit2cazy[gid2abinit[k]] += [v['CAZy']]
# Load, per genome id, the set of raw "eggNOG OGs" strings from its
# eggNOG-mapper JSON output (skipping falsy/empty annotation records).
gid2eggs = {}
for g in tqdm(gids):
    patty = pjoin(clade_folder, g, g + ".emapper")
    with open(patty) as handle:
        gid2eggs[g] = {
            v["eggNOG OGs"]
            for k, v in json.load(handle).items() if v
        }

# Every taxid mentioned in any "NOG@taxid" entry across all genomes.
taxos = {
    vvv.split("@")[1]
    for v in gid2eggs.values() for vv in v for vvv in vv.split(",")
}
# Lineage length per taxid; taxids unknown to the NCBI db sort last (1000).
tax2level = {
    k: len(v)
    for k, v in tax_db.get_lineage_translator(list(taxos)).items()
}
# Collapse each comma-separated OG string to the single NOG whose taxid
# has the shortest lineage, de-duplicated per genome.
for g in tqdm(gids):
    gid2eggs[g] = list({
        min([vv.split('@') for vv in v.split(",")],
            key=lambda x: tax2level.get(int(x[1]), 1000))[0]
        for v in gid2eggs[g]
    })

# Checking checkm file: only pass the flag when the file actually exists.
completeness_switch = "--checkm " + checkm_file if os.path.exists(
    checkm_file) else ""
print("executing mOTUpan")
with tempfile.NamedTemporaryFile(mode="w", suffix=".gid2cog") as temp:
    # NOTE(review): nothing visible flushes `temp` after the dump; if the
    # (unseen) code that follows hands temp.name to a subprocess while the
    # file is still open, a temp.flush() is needed — confirm downstream.
    json.dump(gid2eggs, temp, indent=4, sort_keys=True)
def create_CAMI_profile(data_file, sample_id):
    """
    CSV Parser for converting information to the CAMI profiling format.
    Input: csv file with the required information, sample ID and the name
        of the file to write to
    Output: header and contents of the CAMI profile file (see format
        linked above)
    """
    table = pd.read_csv(data_file)
    sample_rows = table[table["sample"] == sample_id]
    assignments = sample_rows["Assignment"]
    # Kept for parity with the column used per-row below (unused otherwise).
    total_percentages = sample_rows["percentage_of_total_reads"]
    ncbi = NCBITaxa()

    # All rank paths seen in this sample; the longest one becomes the
    # @Ranks header because virus taxonomies differ in depth.
    all_rank_paths = []
    profile_lines = []  # CAMI profile body, one tab-separated line per taxon

    for raw_name in assignments:
        # Strip parenthesised suffixes such as " (segment 1)" before the
        # NCBI name lookup.
        if ' (' in raw_name:
            lookup_name = raw_name[:raw_name.index(' (')]
        else:
            lookup_name = raw_name

        # get_name_translator: {'taxon': [id, ...]} — take the first id.
        taxid_list = ncbi.get_name_translator([lookup_name])[lookup_name]
        leaf_id = taxid_list[0]
        leaf_rank = ncbi.get_rank(taxid_list)[leaf_id]

        # get_lineage_translator: {leaf_id: [root, ..., leaf]}; drop root.
        lineage = ncbi.get_lineage_translator(taxid_list)[leaf_id][1:]
        # Scientific name and rank for every node on the path, in order
        # (per-id calls keep the ordering that a bulk call would lose).
        names_on_path = [ncbi.get_taxid_translator([t])[t] for t in lineage]
        ranks_on_path = [ncbi.get_rank([t])[t] for t in lineage]
        all_rank_paths.append(ranks_on_path)

        # In the original, `rank` ends up as the rank of the last node on
        # the path, falling back to the leaf's own rank for an empty path.
        rank = ranks_on_path[-1] if ranks_on_path else leaf_rank

        id_path = '|'.join(map(str, lineage))
        name_path = '|'.join(names_on_path)
        percentage = sample_rows.loc[
            sample_rows["Assignment"] == raw_name
        ]["percentage_of_total_reads"].values[0]
        profile_lines.append("%s\t%s\t%s\t%s\t%s" % (
            leaf_id, rank, id_path, name_path, percentage))

    longest_taxonomy = '|'.join(max(all_rank_paths, key=len))
    # Read the specification for details about this header:
    # https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd
    header = """# Taxonomic Profiling Output
@SampleID:%s
@Version:0.9.3
@Ranks:%s\t#the longest path in this sample: virus taxonomy is messy
@TaxonomyID:ncbi-taxonomy_2018-05-25
@@TAXID\tRANK\tTAXPATH\tTAXPATHSN\tPERCENTAGE
""" % (sample_id, longest_taxonomy)
    return (header, profile_lines)
# Reshape one MAG's metadata dict towards the ENA binned-genome checklist
# (ERC000047): copy/rename fields from the assembly metadata, then delete
# keys the checklist does not accept.
# NOTE(review): `k`, `mag2md`, `ass_md`, `ncbi` and `found_taxo` are defined
# outside this excerpt — presumably this runs inside a loop over MAG ids;
# confirm against the surrounding code.
mag2md[k]['sample derived from'] = ass_md['sample_accession']
mag2md[k]['ENA-CHECKLIST'] = 'ERC000047'
# isolation_source = taxonomy string minus its last whitespace token
mag2md[k]['isolation_source'] = " ".join(ass_md['taxonomy'].split()[:-1])
mag2md[k]['metagenomic source'] = ass_md['ncbi_taxid']
del mag2md[k]['ncbi_taxid']
del mag2md[k]['taxonomy']
if 'sample_accession' in mag2md[k]:
    del mag2md[k]['sample_accession']
if 'Run' in mag2md[k]:
    del mag2md[k]['Run']
if 'Lake_code' in mag2md[k]:
    del mag2md[k]['Lake_code']

# Parent of `taxid`: among the ancestors of taxid (itself excluded), pick
# the one with the longest lineage, i.e. the deepest strict ancestor.
get_parent = lambda taxid: max({
    k: v
    for k, v in ncbi.get_lineage_translator(ncbi.get_lineage(taxid)).items()
    if k != taxid
}.items(), key=lambda l: len(l[1]))[0]

# Map each distinct (name, ...) pair from found_taxo to an "uncultured ..."
# NCBI taxon, trying progressively looser name patterns.
# NOTE(review): `for k, v in set(...)` assumes found_taxo.values() holds
# 2-tuples; `v` is unused in the visible lines and this excerpt appears to
# end mid-loop — verify against the continuation.
tax2uncul = {}
for k, v in set(found_taxo.values()):
    # Looked up before the 'root' shortcut below, so the first query is
    # wasted for root.
    options = ncbi.get_name_translator(
        ["uncultured " + k + " bacterium", "uncultured " + k + " archaeon"])
    if k == 'root':
        # 198431 is the NCBI taxid of "uncultured prokaryote".
        tax2uncul[k] = ('uncultured prokaryote', 198431)
        continue
    if not options:
        options = ncbi.get_name_translator(["uncultured " + k + " sp."])
        if not options:
            options = ncbi.get_name_translator(
                ["uncultured " + k + " cyanobacterium"])