def determine_phylogeny(fn, fqids, verbose=False):
    """
    Determine if we know the phylogeny of this thing.

    The name is looked up in fqids three ways: as given, with a trailing
    ".<digits>.<digits>" removed, and with any trailing run of dots and digits
    removed. The first hit is reported as a Metagenome. Otherwise a taxonomy ID
    is read from a "[<digits>]" tag in the name and its lineage is climbed with
    get_taxonomy() until the parent is the root (<= 1) or cellular organisms
    (131567); the scientific name of the node reached is returned as the type.

    :param fn: the feature name
    :param fqids: the dict of ids->fastq files
    :param verbose: write a message to stderr when the name cannot be classified
    :return: a tuple of the type (Metagenome, Unknown, or the scientific name of the top of the lineage), the id that matched in fqids (fn itself when nothing matched), and the fqids value for a metagenome (None otherwise)
    """

    if fn in fqids:
        return "Metagenome", fn, fqids[fn]

    # Raw strings are used for every pattern: "\." and "\d" are not valid
    # escapes in an ordinary string literal.
    m = re.sub(r'\.\d+\.\d+$', '', fn)
    if m in fqids:
        return "Metagenome", m, fqids[m]

    m = re.sub(r'\.[\d\.]+$', '', fn)
    if m in fqids:
        return "Metagenome", m, fqids[m]

    m = re.search(r'\[(\d+)\]', fn)
    if not m:
        if verbose:
            sys.stderr.write("There is no taxid in {} and it is not in the fastq file\n".format(fn))
        return "Unknown", fn, None

    tid = m.group(1)
    # NOTE: c is not defined in this function, it is read from the enclosing
    # (module) scope and handed to get_taxonomy() as its second argument.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", fn, None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, fn, None
def find_rank(tid, trank, tdb, verbose=False):
    """
    Find the value for trank starting at tid.

    The lineage is climbed one parent at a time with get_taxonomy() until a
    node of rank trank is found, a node that is already in the global id2rank
    cache is reached, or the root (taxid 1 / parent 1) is hit. Every taxonomy
    ID passed on the way is added to the cache with the answer.

    NOTE(review): the cache is keyed on the taxonomy ID only, so a cached
    answer is returned whatever trank is asked for.

    :param tid: The taxonomy ID
    :param trank: The taxonomic rank to return
    :param tdb: The taxonomy database
    :param verbose: More output
    :return: the scientific name at rank trank for tid, or "root" if it is not found
    """

    global id2rank
    if tid in id2rank:
        return id2rank[tid]

    visited = set()
    node, name = get_taxonomy(tid, tdb)
    while True:
        if node.parent == 1 or node.taxid == 1:
            break
        if node.rank == trank or node.taxid in id2rank:
            break
        visited.add(node.taxid)
        node, name = get_taxonomy(node.parent, tdb)

    # we ran into something we have already resolved
    if node.taxid in id2rank:
        for seen in visited:
            id2rank[seen] = id2rank[node.taxid]
        return id2rank[node.taxid]

    # we found the rank we were asked for
    if node.rank == trank:
        answer = name.scientific_name
        for seen in visited:
            id2rank[seen] = answer
        id2rank[tid] = answer
        return answer

    if verbose:
        sys.stderr.write(f"{colours.PINK}ERROR: No rank for {tid}\n{colours.ENDC}")
    return "root"
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    Each member of the set is a "scientific name:rank" string for one of the
    wanted ranks found while climbing from tid towards the root. The climb
    stops at the first node whose taxid or parent is 1, and that node itself
    is not added.

    :param tid: the taxonomy ID
    :param verbose: print more stuff
    :return: the set of taxonomies, or None if tid has no taxonomy
    """

    ranks_to_keep = (
        'superkingdom', 'phylum', 'class', 'order',
        'family', 'genus', 'species', 'subspecies',
    )

    # connect to the SQL database
    db = get_taxonomy_db()
    node, name = get_taxonomy(tid, db)
    if not node:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    lineage = set()
    while not (node.parent == 1 or node.taxid == 1):
        if node.rank in ranks_to_keep:
            lineage.add(f"{name.scientific_name}:{node.rank}")
        node, name = get_taxonomy(node.parent, db)
    return lineage
def resolve_taxonomy(tid, conn, verbose=False, nocolor=False):
    """
    Convert the taxonomy id to a tab separated string.

    The string has seven columns, superkingdom through species. A column is
    left empty when that rank is not met while climbing from tid to the root.
    Results are remembered in the global taxonomy_str dict.

    :param tid: the taxonomy id
    :param conn: the database connection
    :param verbose: more output (not used in this function)
    :param nocolor: no color output (not used in this function)
    :return: a string representing the taxonomy
    """

    global taxonomy_str
    if tid in taxonomy_str:
        return taxonomy_str[tid]

    levels = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')
    names_by_rank = {}

    node, name = get_taxonomy(tid, conn)
    while not (node.parent == 1 or node.taxid == 1):
        if node.rank in levels:
            names_by_rank[node.rank] = name.scientific_name
        node, name = get_taxonomy(node.parent, conn)

    columns = [str(names_by_rank.get(level, '')) for level in levels]
    taxonomy_str[tid] = "\t".join(columns)
    return taxonomy_str[tid]
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    The set holds one "scientific name:rank" entry for every wanted rank that
    is met between tid and the root. The node whose taxid or parent is 1 ends
    the walk and is not included.

    :param tid: the taxonomy ID
    :param verbose: print more stuff
    :return: the set of taxonomies, or None if there is no taxonomy for tid
    """

    # connect to the SQL database
    connection = get_taxonomy_db()

    keep = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies')

    current, current_name = get_taxonomy(tid, connection)
    if not current:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    found = set()
    while True:
        if current.parent == 1 or current.taxid == 1:
            break
        if current.rank in keep:
            entry = "{}:{}".format(current_name.scientific_name, current.rank)
            found.add(entry)
        current, current_name = get_taxonomy(current.parent, connection)

    return found
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id.

    One tab separated line is printed: the id followed by eight columns,
    superkingdom through subspecies. Ranks that are not met between tid and
    the root are left empty.

    :param tid: the taxonomy id
    :param verbose: more output
    :return: nada
    """

    levels = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies')

    # connect to the SQL database
    db = get_taxonomy_db()
    node, name = get_taxonomy(tid, db)
    if not node:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    found = {}
    while not (node.parent == 1 or node.taxid == 1):
        if node.rank in levels:
            found[node.rank] = name.scientific_name
        node, name = get_taxonomy(node.parent, db)

    columns = [tid] + [found.get(level, "") for level in levels]
    print("\t".join(str(col) for col in columns))
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id.

    The output is a single line on stdout: tid, then the names found for
    superkingdom, phylum, class, order, family, genus, species and
    subspecies, separated by tabs. Missing ranks are empty strings.

    :param tid: the taxonomy id
    :param verbose: more output
    :return: nada
    """

    ranks = (
        'superkingdom', 'phylum', 'class', 'order',
        'family', 'genus', 'species', 'subspecies',
    )
    # which output column each rank is written to
    column_of = {rank: idx for idx, rank in enumerate(ranks)}
    row = [""] * len(ranks)

    # connect to the SQL database
    connection = get_taxonomy_db()
    current, current_name = get_taxonomy(tid, connection)
    if not current:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    while True:
        if current.parent == 1 or current.taxid == 1:
            break
        idx = column_of.get(current.rank)
        if idx is not None:
            row[idx] = current_name.scientific_name
        current, current_name = get_taxonomy(current.parent, connection)

    print(*([tid] + row), sep="\t")
def determine_phylogeny(fn, fqids, verbose=False):
    """
    Determine if we know the phylogeny of this thing.

    fn is first tried as a key of fqids, then again after stripping a trailing
    ".<digits>.<digits>", then after stripping any trailing run of dots and
    digits. If one of those is a key, the feature is a Metagenome. If not, the
    taxonomy ID inside a "[<digits>]" tag is resolved with get_taxonomy() and
    the lineage is followed upwards until the parent is the root (<= 1) or
    cellular organisms (131567).

    :param fn: the feature name
    :param fqids: the dict of ids->fastq files
    :param verbose: write a message to stderr when the name cannot be classified
    :return: a tuple of the type (Metagenome, Unknown, or the scientific name at the top of the lineage), the id that matched in fqids (fn itself when nothing matched), and the fqids value for a metagenome (None otherwise)
    """

    if fn in fqids:
        return "Metagenome", fn, fqids[fn]

    # Progressively more aggressive suffix trimming. The patterns are raw
    # strings because "\." and "\d" are not valid escapes in a normal literal.
    for suffix in (r'\.\d+\.\d+$', r'\.[\d\.]+$'):
        trimmed = re.sub(suffix, '', fn)
        if trimmed in fqids:
            return "Metagenome", trimmed, fqids[trimmed]

    m = re.search(r'\[(\d+)\]', fn)
    if not m:
        if verbose:
            sys.stderr.write(
                "There is no taxid in {} and it is not in the fastq file\n".format(fn))
        return "Unknown", fn, None

    tid = m.group(1)
    # NOTE: c is not a local name; it is looked up in the enclosing (module)
    # scope and passed to get_taxonomy() as its second argument.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", fn, None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, fn, None
def rename_nodes(tree, verbose=False):
    """
    Rename the nodes based on everything below me.

    Every leaf name is searched for a taxonomy ID written as "[<digits>]" and
    the lineage of that ID is collected. Each internal node is then renamed to
    "<name> r_<rank>", using the lowest rank at which all of the leaves below
    it that have a taxonomy agree on a single name. Internal nodes with no
    such rank keep their name.

    Progress ("Traversing the tree", "Checking ...") is always written to
    stderr; the individual renames are only reported when verbose is set.

    :param tree: the tree to rename; it must provide get_leaves() and traverse(), and its nodes is_leaf(), get_leaves() and a writable name
    :param verbose: more output
    :return: the same tree, with its internal nodes renamed in place
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    # most specific rank first, so that the first unanimous rank we find is the lowest
    wanted_levels = ['subspecies', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']

    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for leaf in tree.get_leaves():
        m = re.search(r'\[(\d+)\]', leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        tid = m.group(1)
        taxonomy[leaf.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        while t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[leaf.name][t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))

        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            if leaf.name not in taxonomy:
                continue
            for w in wanted_levels:
                if w in taxonomy[leaf.name]:
                    taxs[w].add(taxonomy[leaf.name][w])

        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # the format used to end in an unbalanced ")", which is not
                # a safe character in a tree node name
                newname = "{} r_{}".format(taxs[w].pop(), w)
                if verbose:
                    sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                node.name = newname
                break
    return tree
def rename_nodes(tree, verbose=False):
    """
    Rename the nodes based on everything below me.

    The taxonomy ID of each leaf is read from a "[<digits>]" tag in its name
    and its lineage is looked up once. Every internal node is then given the
    name "<name> r_<rank>" for the lowest rank on which all the leaves beneath
    it (that have a taxonomy) agree. A node is left alone when there is no
    rank with exactly one name.

    "Traversing the tree" and "Checking ..." are always written to stderr;
    "Changing name ..." is only written when verbose is set.

    :param tree: the tree to rename; it must provide get_leaves() and traverse(), and its nodes is_leaf(), get_leaves() and a writable name
    :param verbose: more output
    :return: the same tree, with its internal nodes renamed in place
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # lowest rank first

    # leaf name -> {rank: scientific name}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    taxonomy = {}
    for leaf in tree.get_leaves():
        # raw string: "\[" and "\d" are not valid escapes in a normal literal
        m = re.search(r'\[(\d+)\]', leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        tid = m.group(1)
        lineage = taxonomy[leaf.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        while t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                lineage[t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))

        # rank -> all the names seen at that rank below this node
        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            for rank, name in taxonomy.get(leaf.name, {}).items():
                taxs[rank].add(name)

        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) != 1:
                continue
            # no trailing ")" here: the old format left an unbalanced
            # parenthesis in the node name
            newname = "{} r_{}".format(taxs[w].pop(), w)
            if verbose:
                sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
            node.name = newname
            break
    return tree
def determine_phylogeny(fn, verbose=False):
    """
    Determine the top level of the taxonomy for a feature name.

    The taxonomy ID is read from a "[<digits>]" tag in fn and its lineage is
    climbed with get_taxonomy() until the parent is the root (<= 1) or
    cellular organisms (131567).

    :param fn: the feature name
    :param verbose: write a message to stderr when the taxonomy ID is not in the database
    :return: a tuple of the scientific name of the node reached ("Unknown" if there is no tag or no taxonomy) and None
    """

    # raw string: "\[" and "\d" are not valid escapes in a normal literal
    m = re.search(r'\[(\d+)\]', fn)
    if not m:
        return "Unknown", None

    tid = m.group(1)
    # NOTE: c is not defined in this function; it is read from the enclosing
    # (module) scope and handed to get_taxonomy() as its second argument.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, None
def determine_phylogeny(fn, verbose=False):
    """
    Work out which top level taxonomic group a feature belongs to.

    fn is expected to carry its taxonomy ID as "[<digits>]". That ID is
    resolved with get_taxonomy() and the parents are followed until the next
    parent would be the root (<= 1) or cellular organisms (131567).

    :param fn: the feature name
    :param verbose: write a message to stderr when the taxonomy ID is not in the database
    :return: a tuple of the scientific name of the node reached ("Unknown" if there is no tag or no taxonomy) and None
    """

    # The pattern is a raw string so that "\[" and "\d" reach the regex
    # engine untouched instead of being treated as (invalid) string escapes.
    tag = re.search(r'\[(\d+)\]', fn)
    if tag is None:
        return "Unknown", None

    tid = tag.group(1)
    # NOTE: c is not a local name; it is looked up in the enclosing (module)
    # scope and passed to get_taxonomy() as its second argument.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, None
def printtaxa(i, c):
    """
    Print out the taxonomy.

    One tab separated line is printed: the identifier followed by one column
    for each rank in the module-level want, in that order. The starting node
    and every ancestor up to and including the node whose taxid or parent is 1
    are inspected.

    :param i: identifier
    :param c: database connection
    :return:
    """

    names = dict.fromkeys(want, "")

    node, name = get_taxonomy(i, c)
    while True:
        if node.rank in want:
            names[node.rank] = name.get_name()
        if node.parent == 1 or node.taxid == 1:
            break
        node, name = get_taxonomy(node.parent, c)

    fields = [str(i)]
    fields.extend(names[w] for w in want)
    print("\t".join(fields))
def taxstring(tid, verbose=False):
    """
    Build the kingdom -> species lineage for a taxonomy ID.

    Names come from choosename() and are stored with their first letter
    upper-cased. Results are remembered in the global taxa dict. An ID that
    is not in the database is reported on stderr and cached as seven empty
    strings.

    NOTE(review): when a member of taxonomy_hierarchy(tid) has no name the
    function writes an error and returns None without caching anything.

    :param tid: taxonomy ID
    :param verbose: more output (passed on to choosename)
    :return: an array of the taxonomy from kingdom -> species
    """

    global taxa
    if tid in taxa:
        return taxa[tid]

    ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')
    lineage = [''] * len(ranks)

    def remember(node, label):
        # put the label, first letter upper-cased, in the slot for the node's rank
        if node.rank in ranks:
            lineage[ranks.index(node.rank)] = label[0].upper() + label[1:]

    db = get_taxonomy_db()
    try:
        node, names = get_taxonomy(tid, db)
    except EntryNotInDatabaseError:
        sys.stderr.write(f"{bcolors.RED}{tid} not in database.Skipped line{bcolors.ENDC}\n")
        taxa[tid] = lineage
        return taxa[tid]

    label = choosename(names, verbose)
    if label:
        remember(node, label)

    for ancestor in taxonomy_hierarchy(tid, verbose=False):
        node, names = get_taxonomy(ancestor, db)
        label = choosename(names, verbose)
        if not label:
            sys.stderr.write(f"{bcolors.RED}ERROR: No name for {tid}{bcolors.ENDC}\n")
            return None
        remember(node, label)

    taxa[tid] = lineage
    return taxa[tid]
def resolve_taxonomy(tid, conn, verbose=False, nocolor=False):
    """
    Convert the taxonomy id to a tab separated string.

    Seven columns are produced, superkingdom through species, with an empty
    column for every rank that is not met on the way from tid to the root.
    The answer is stored in the global taxonomy_str dict and reused on the
    next call with the same tid.

    :param tid: the taxonomy id
    :param conn: the database connection
    :param verbose: more output (not used in this function)
    :param nocolor: no color output (not used in this function)
    :return: a string representing the taxonomy
    """

    global taxonomy_str
    if tid in taxonomy_str:
        return taxonomy_str[tid]

    ranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')
    columns = [''] * len(ranks)

    current, current_name = get_taxonomy(tid, conn)
    while True:
        if current.parent == 1 or current.taxid == 1:
            break
        if current.rank in ranks:
            position = ranks.index(current.rank)
            columns[position] = current_name.scientific_name
        current, current_name = get_taxonomy(current.parent, conn)

    taxonomy_str[tid] = "\t".join(columns)
    return taxonomy_str[tid]
def print_kingdom(f, verbose=False):
    """
    Print the superkingdom for every name listed in a file.

    Each line of the file is stripped and looked up with get_taxid_for_name().
    Names that are not found are printed with "Unknown". For the others, the
    blast name of every superkingdom in taxonomy_hierarchy() is printed with
    its first letter upper-cased.

    :param f: the file of names, one per line
    :param verbose: more output (not used in this function)
    :return: nada
    """

    db = get_taxonomy_db()
    with open(f, 'r') as handle:
        for line in handle:
            query = line.strip()
            taxid = get_taxid_for_name(query, db)
            if not taxid:
                print(f"{query}\tUnknown")
                continue
            for ancestor in taxonomy_hierarchy(taxid, verbose=False):
                node, name = get_taxonomy(ancestor, db)
                if node.rank != 'superkingdom':
                    continue
                nm = name.blast_name[0].upper() + name.blast_name[1:]
                print(f"{query}\t{nm}")
def rename_nodes_ncbi(tree, verbose=False):
    """
    Rename the nodes based on everything below me, but also give each node a unique
    branch number. The branch number is appended to the name as " b_<number>".

    The tree is walked in postorder. If all the children of a node have the
    same name (ignoring their branch numbers) the node takes that name, and
    the " r_<rank>" part is removed from the children. Otherwise the node is
    named "<name> r_<rank> b_<number>" for the lowest rank on which all the
    leaves below it (that have a taxonomy) agree; with no such rank it keeps
    its name.

    :param tree: the tree to rename; it must provide get_leaves() and traverse(), and its nodes is_leaf(), get_leaves(), get_children() and a writable name
    :param verbose: more output
    :return: the renamed tree (the same object, changed in place)
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # lowest rank first

    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for l in tree.get_leaves():
        # raw strings throughout: "\[", "\d", "\s" and "\w" are not valid
        # escapes in a normal string literal
        m = re.search(r'\[(\d+)\]', l.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(l.name))
            continue
        tid = m.group(1)
        taxonomy[l.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        while t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[l.name][t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    if verbose:
        sys.stderr.write("Traversing the tree to rename the nodes\n")

    branchnum = 0
    for node in tree.traverse("postorder"):
        if node.is_leaf():
            continue

        ## if both our children have the same name, we acquire that name and reset their names
        ## otherwise we figure out what our name should be based on the last common ancestor
        ## of the leaves.
        children = node.get_children()
        names = {re.sub(r'\s+b_\d+', '', x.name) for x in children}
        if len(names) == 1:
            node.name = "{} b_{}".format(names.pop(), branchnum)
            if verbose:
                sys.stderr.write("Reset name to {} because both children are the same\n".format(node.name))
            # "child", not "c": c is the database connection
            for child in children:
                oldname = child.name
                child.name = re.sub(r'r_\w+\s+', '', child.name)
                if verbose:
                    sys.stderr.write("\tAs both children the same set {} to {}\n".format(oldname, child.name))
        else:
            ## We have to figure out what our unique name should be
            taxs = {w: set() for w in wanted_levels}
            for l in node.get_leaves():
                if l.name not in taxonomy:
                    continue
                for w in wanted_levels:
                    if w in taxonomy[l.name]:
                        taxs[w].add(taxonomy[l.name][w])

            # which is the LOWEST level with a single taxonomy
            for w in wanted_levels:
                if len(taxs[w]) == 1:
                    newname = "{} r_{} b_{}".format(taxs[w].pop(), w, branchnum)
                    if verbose:
                        sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                    node.name = newname
                    break

        # one number per internal node, whichever branch was taken above
        branchnum += 1
    return tree
33340 name: Neoptera blast name: Neoptera rank: infraclass parent: 7496 7496 name: Pterygota blast name: Pterygota rank: subclass parent: 85512 85512 name: Dicondylia blast name: Dicondylia rank: no rank parent: 50557 50557 name: Insecta blast name: Insecta rank: class parent: 6960 6960 name: Hexapoda blast name: insects rank: superclass parent: 197562 197562 name: Pancrustacea blast name: Pancrustacea rank: no rank parent: 197563 197563 name: Mandibulata blast name: Mandibulata rank: no rank parent: 6656 6656 name: Arthropoda blast name: arthropods rank: phylum parent: 88770 88770 name: Panarthropoda blast name: Panarthropoda rank: no rank parent: 1206794 1206794 name: Ecdysozoa blast name: Ecdysozoa rank: no rank parent: 33317 33317 name: Protostomia blast name: Protostomia rank: no rank parent: 33213 33213 name: Bilateria blast name: Bilateria rank: no rank parent: 6072 6072 name: Eumetazoa blast name: Eumetazoa rank: no rank parent: 33208 33208 name: Metazoa blast name: animals rank: kingdom parent: 33154 33154 name: Opisthokonta blast name: Opisthokonta rank: no rank parent: 2759 2759 name: Eukaryota blast name: eukaryotes rank: superkingdom parent: 131567 """) sys.exit(0) else: ids=sys.argv[1:] c = get_taxonomy_db() for i in ids: t, n = get_taxonomy(i, c) while t.parent != 1 and t.taxid != 1: print("{}\tname: {}\trank: {}\tparent: {}".format(t.taxid, n.scientific_name, t.rank, t.parent)) t, n = get_taxonomy(t.parent, c)
# Copy args.f to args.o, adding one column per rank in want to every row.
# The row starting with "genome_id" is the header and gets the rank names;
# every other row is padded to maxp columns and gets the names found for the
# taxonomy ID in column args.c (empty where a rank is not in the lineage).
with open(args.o, 'w', encoding='utf-8') as out, open(args.f, 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith("genome_id"):
            header = "{}\t{}\n".format(line.strip(), "\t".join(want))
            out.write(header)
            continue

        fields = line.strip().split("\t")
        while len(fields) < maxp:
            fields.append("")

        tid = fields[args.c]
        names_by_rank = {}
        node, name = get_taxonomy(tid, c)
        # 131567 is cellular organisms
        while node and node.parent > 1 and node.parent != 131567:
            if node.rank in want:
                names_by_rank[node.rank] = name.scientific_name
            node, name = get_taxonomy(node.parent, c)

        fields.extend(names_by_rank.get(w, "") for w in want)
        out.write("\t".join(str(value) for value in fields) + "\n")