def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id as one tab-separated line.

    The line holds the id followed by eight columns, one for each rank in
    ``wanted_levels`` (superkingdom first, subspecies last). A rank that does
    not occur in the lineage is left as an empty string. Nothing is printed
    when no taxonomy record is found for the id.

    :param tid: the taxonomy id
    :param verbose: report ids without a taxonomy record on stderr
    :return: nada
    """

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    # Walk from the requested node towards taxid 1, which this code treats as
    # the root. The node whose parent is 1 is recorded *before* stopping, so
    # a wanted rank that hangs directly off the root is not lost.
    while t.taxid != 1:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        if t.parent == 1:
            break
        t, n = get_taxonomy(t.parent, c)

    print("\t".join(map(str, [tid] + taxlist)))
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id as one tab-separated line.

    The line holds the id followed by eight columns, one for each rank in
    ``wanted_levels`` (superkingdom first, subspecies last). A rank that does
    not occur in the lineage is left as an empty string. Nothing is printed
    when no taxonomy record is found for the id.

    :param tid: the taxonomy id
    :param verbose: report ids without a taxonomy record on stderr
    :return: nada
    """

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    # Walk from the requested node towards taxid 1, which this code treats as
    # the root. The node whose parent is 1 is recorded *before* stopping, so
    # a wanted rank that hangs directly off the root is not lost.
    while t.taxid != 1:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        if t.parent == 1:
            break
        t, n = get_taxonomy(t.parent, c)

    print("\t".join(map(str, [tid] + taxlist)))
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    Each member of the set is a ``"<scientific name>:<rank>"`` string, one for
    every node in the lineage whose rank is listed in ``wanted_levels``.

    :param tid: the taxonomy ID
    :param verbose: report ids without a taxonomy record on stderr
    :return: the set of taxonomies, or None when no taxonomy record is found
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxonomy = set()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    # Walk towards taxid 1 (treated as the root here). The node whose parent
    # is 1 is recorded before the walk stops, so a wanted rank attached
    # directly to the root still ends up in the set.
    while t.taxid != 1:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        if t.parent == 1:
            break
        t, n = get_taxonomy(t.parent, c)

    return taxonomy
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    Each member of the set is a ``"<scientific name>:<rank>"`` string, one for
    every node in the lineage whose rank is listed in ``wanted_levels``.

    :param tid: the taxonomy ID
    :param verbose: report ids without a taxonomy record on stderr
    :return: the set of taxonomies, or None when no taxonomy record is found
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxonomy = set()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    # Walk towards taxid 1 (treated as the root here). The node whose parent
    # is 1 is recorded before the walk stops, so a wanted rank attached
    # directly to the root still ends up in the set.
    while t.taxid != 1:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        if t.parent == 1:
            break
        t, n = get_taxonomy(t.parent, c)

    return taxonomy
def rename_nodes(tree, verbose=False):
    """
    Rename the internal nodes of a tree based on everything below them.

    Leaf names are expected to carry a taxonomy id in square brackets (the
    pattern ``[digits]``). For every internal node the wanted ranks of all
    leaves below it are collected, and the node is renamed to
    ``"<name> r_<rank>"`` using the most specific rank at which those leaves
    share exactly one name. Nodes with no such rank keep their name.

    :param tree: tree object providing get_leaves(), traverse() and is_leaf()
                 (looks like an ETE tree -- confirm against the caller)
    :param verbose: more output on stderr
    :return: the same tree, with internal nodes renamed in place
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # most specific rank first, so the first hit below is the LOWEST level
    taxonomy = {}
    taxid_re = re.compile(r'\[(\d+)\]')

    # first get all the leaves and their parents, so each lineage is only
    # looked up once rather than once per internal node
    for leaf in tree.get_leaves():
        m = taxid_re.search(leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        tid = m.groups()[0]
        taxonomy[leaf.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        # Walk towards taxid 1 (treated as the root). The node whose parent
        # is 1 is recorded before stopping so a rank directly under the root
        # is not dropped.
        while t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[leaf.name][t.rank] = n.scientific_name
            if t.parent == 1:
                break
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up
    # with a unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))

        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            if leaf.name not in taxonomy:
                continue
            for w in wanted_levels:
                if w in taxonomy[leaf.name]:
                    taxs[w].add(taxonomy[leaf.name][w])

        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # the original format string carried a stray ")" after the rank
                newname = "{} r_{}".format(taxs[w].pop(), w)
                if verbose:
                    sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                node.name = newname
                break

    return tree
def rename_nodes(tree, verbose=False):
    """
    Rename the internal nodes of a tree based on everything below them.

    Leaf names are expected to carry a taxonomy id in square brackets (the
    pattern ``[digits]``). For every internal node the wanted ranks of all
    leaves below it are collected, and the node is renamed to
    ``"<name> r_<rank>"`` using the most specific rank at which those leaves
    share exactly one name. Nodes with no such rank keep their name.

    :param tree: tree object providing get_leaves(), traverse() and is_leaf()
                 (looks like an ETE tree -- confirm against the caller)
    :param verbose: more output on stderr
    :return: the same tree, with internal nodes renamed in place
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # most specific rank first, so the first hit below is the LOWEST level
    taxonomy = {}
    taxid_re = re.compile(r'\[(\d+)\]')

    # first get all the leaves and their parents, so each lineage is only
    # looked up once rather than once per internal node
    for leaf in tree.get_leaves():
        m = taxid_re.search(leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        tid = m.groups()[0]
        taxonomy[leaf.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        # Walk towards taxid 1 (treated as the root). The node whose parent
        # is 1 is recorded before stopping so a rank directly under the root
        # is not dropped.
        while t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[leaf.name][t.rank] = n.scientific_name
            if t.parent == 1:
                break
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up
    # with a unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))

        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            if leaf.name not in taxonomy:
                continue
            for w in wanted_levels:
                if w in taxonomy[leaf.name]:
                    taxs[w].add(taxonomy[leaf.name][w])

        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # the original format string carried a stray ")" after the rank
                newname = "{} r_{}".format(taxs[w].pop(), w)
                if verbose:
                    sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                node.name = newname
                break

    return tree
def print_kingdom(f, verbose=False):
    """
    Print the superkingdom for every name listed in a file.

    Each line of the file is stripped and looked up with
    get_taxid_for_name(). Names without a taxonomy id are printed with
    "Unknown"; otherwise one "<name><TAB><blast name>" line is printed for
    every superkingdom-ranked entry in the hierarchy, with the first letter
    of the blast name upper-cased.

    :param f: path of the file holding one name per line
    :param verbose: accepted but not used by this function
    :return: nothing; results go to stdout
    """
    db = get_taxonomy_db()
    with open(f, 'r') as handle:
        for raw in handle:
            name = raw.strip()
            taxid = get_taxid_for_name(name, db)
            if not taxid:
                print(f"{name}\tUnknown")
                continue

            for ancestor in taxonomy_hierarchy(taxid, verbose=False):
                node, names = get_taxonomy(ancestor, db)
                if node.rank != 'superkingdom':
                    continue
                # upper-case only the first character, leave the rest untouched
                label = names.blast_name[0].upper() + names.blast_name[1:]
                print(f"{name}\t{label}")
def parse_m8_tophit(m8f, evalue, verbose=False):
    """
    Parse the m8 file and print the taxonomy of the first usable hit per query.

    For each tab-separated line, column 11 (index 10) is compared against
    ``evalue`` and column 2 (index 1) must start with ``fig|<taxid>.<n>``.
    The first hit of a query (column 1) that passes the filters is printed as
    ``<query><TAB><taxonomy columns><TAB>taxid: <taxid>``; later hits for the
    same query are skipped.

    Uses the module-level ``taxonomy`` cache and ``ignore_tids`` set: taxids
    that are not in the database, or whose entry at index 6 of the taxonomy
    list mentions "metagenome", are added to ``ignore_tids`` and skipped.

    :param m8f: the m8 output file from diamond
    :param evalue: the maximum evalue
    :param verbose: more output (maybe)
    :return: nothing; results go to stdout
    """

    fig = re.compile(r'fig\|(\d+)\.\d+')
    c = get_taxonomy_db()

    global taxonomy

    if verbose:
        sys.stderr.write(f"{colors.GREEN}Reading {m8f}{colors.ENDC}\n")

    printed = set()  # query ids that already have a reported hit
    with open(m8f, 'r') as f:
        for l in f:
            p = l.strip().split("\t")
            if len(p) < 11:
                # blank or truncated line: there is no e-value column to test
                continue
            if float(p[10]) > evalue:
                continue
            if p[0] in printed:
                continue

            m = fig.match(p[1])
            if not m:
                continue
            tid = m.group(1)
            if tid in ignore_tids:
                continue

            if tid not in taxonomy:
                try:
                    taxonomy[tid] = taxonomy_hierarchy_as_list(c, tid, True)
                except EntryNotInDatabaseError:
                    ignore_tids.add(tid)
                    continue

            if taxonomy[tid]:
                # index 6 is presumably the species slot -- confirm against
                # taxonomy_hierarchy_as_list
                if 'metagenome' in taxonomy[tid][6].lower():
                    taxonomy.pop(tid)
                    ignore_tids.add(tid)
                    continue
                r = "\t".join(taxonomy[tid])
                printed.add(p[0])
                print(f"{p[0]}\t{r}\ttaxid: {tid}")
def parse_m8(m8f, evalue, verbose=False):
    """
    Parse the m8 file and hand the taxonomies of each query's hits to printmatches().

    Lines are tab separated; column 11 (index 10) is compared against
    ``evalue`` and column 2 (index 1) must start with ``fig|<taxid>.<n>``.
    Hits are grouped by consecutive query id (column 1). Whenever the query
    id changes, and once more after the last line, printmatches() is called
    with the query id and the list of taxonomy lists collected for it.

    Uses the module-level ``taxonomy`` cache and ``ignore_tids`` set; taxids
    that are not in the database are added to ``ignore_tids`` and skipped.

    :param m8f: the m8 output file from diamond
    :param evalue: the maximum evalue
    :param verbose: more output (maybe)
    :return: nothing
    """

    fig = re.compile(r'fig\|(\d+)\.\d+')
    c = get_taxonomy_db()

    global taxonomy
    matches = []

    if verbose:
        sys.stderr.write(f"{colors.GREEN}Reading {m8f}{colors.ENDC}\n")

    lastid = None
    with open(m8f, 'r') as f:
        for l in f:
            p = l.strip().split("\t")
            if len(p) < 11:
                # blank or truncated line: there is no e-value column to test
                continue
            if float(p[10]) > evalue:
                continue

            if lastid and p[0] != lastid:
                printmatches(lastid, matches)
                matches = []
            lastid = p[0]

            m = fig.match(p[1])
            if not m:
                continue
            tid = m.group(1)
            if tid in ignore_tids:
                continue

            if tid not in taxonomy:
                try:
                    taxonomy[tid] = taxonomy_hierarchy_as_list(c, tid, True)
                except EntryNotInDatabaseError:
                    ignore_tids.add(tid)
                    continue

            if taxonomy[tid]:
                matches.append(taxonomy[tid])

    # The loop only flushes a group when the *next* query id shows up, so the
    # final group has to be reported here or it would be silently dropped.
    if lastid:
        printmatches(lastid, matches)
def taxstring(tid, verbose=False):
    """
    Build the seven-column taxonomy (kingdom -> species) for a taxonomy ID.

    Results are memoised in the module-level ``taxa`` dict. An ID that is not
    in the database is cached as seven empty strings. If any entry returned
    by taxonomy_hierarchy() has no usable name, an error is written to stderr
    and None is returned without caching anything.

    :param tid: taxonomy ID
    :param verbose: more output
    :return: an array of the taxonomy from kingdom -> species
    """
    global taxa

    if tid in taxa:
        return taxa[tid]

    ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    lineage = [''] * 7

    def _record(rank, name):
        # store the name, first letter upper-cased, in the slot for its rank
        if rank in ranks:
            lineage[ranks.index(rank)] = name[0].upper() + name[1:]

    c = get_taxonomy_db()
    try:
        m, n = get_taxonomy(tid, c)
    except EntryNotInDatabaseError:
        sys.stderr.write(f"{bcolors.RED}{tid} not in database.Skipped line{bcolors.ENDC}\n")
        taxa[tid] = lineage
        return taxa[tid]

    # the node itself: a missing name is tolerated here and simply skipped
    label = choosename(n, verbose)
    if label:
        _record(m.rank, label)

    # every entry of the hierarchy: a missing name aborts the whole lookup
    for p in taxonomy_hierarchy(tid, verbose=False):
        m, n = get_taxonomy(p, c)
        label = choosename(n, verbose)
        if not label:
            sys.stderr.write(f"{bcolors.RED}ERROR: No name for {tid}{bcolors.ENDC}\n")
            return
        _record(m.rank, label)

    taxa[tid] = lineage
    return taxa[tid]
t, n = get_taxonomy(t.parent, tdb) if t.taxid in id2rank: for s in seenids: id2rank[s] = id2rank[t.taxid] return id2rank[t.taxid] if t.rank == trank: for s in seenids: id2rank[s] = n.scientific_name id2rank[tid] = n.scientific_name return n.scientific_name if verbose: sys.stderr.write(f"{colours.PINK}ERROR: No rank for {tid}\n{colours.ENDC}") return "root" if __name__ == '__main__': parser = argparse.ArgumentParser(description="Print the taxonomic rank for all ids ") parser.add_argument('-t', help='taxnomic rank', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() c = get_taxonomy_db() ais = all_ids(c, args.v) sys.stderr.write(f"{colours.GREEN}There are {len(ais)} ids\n{colours.ENDC}") for restple in ais: i = restple[0] print(f"{i}\t{find_rank(i, args.t, c, args.v)}")
33340 name: Neoptera blast name: Neoptera rank: infraclass parent: 7496 7496 name: Pterygota blast name: Pterygota rank: subclass parent: 85512 85512 name: Dicondylia blast name: Dicondylia rank: no rank parent: 50557 50557 name: Insecta blast name: Insecta rank: class parent: 6960 6960 name: Hexapoda blast name: insects rank: superclass parent: 197562 197562 name: Pancrustacea blast name: Pancrustacea rank: no rank parent: 197563 197563 name: Mandibulata blast name: Mandibulata rank: no rank parent: 6656 6656 name: Arthropoda blast name: arthropods rank: phylum parent: 88770 88770 name: Panarthropoda blast name: Panarthropoda rank: no rank parent: 1206794 1206794 name: Ecdysozoa blast name: Ecdysozoa rank: no rank parent: 33317 33317 name: Protostomia blast name: Protostomia rank: no rank parent: 33213 33213 name: Bilateria blast name: Bilateria rank: no rank parent: 6072 6072 name: Eumetazoa blast name: Eumetazoa rank: no rank parent: 33208 33208 name: Metazoa blast name: animals rank: kingdom parent: 33154 33154 name: Opisthokonta blast name: Opisthokonta rank: no rank parent: 2759 2759 name: Eukaryota blast name: eukaryotes rank: superkingdom parent: 131567 """) sys.exit(0) else: ids=sys.argv[1:] c = get_taxonomy_db() for i in ids: t, n = get_taxonomy(i, c) while t.parent != 1 and t.taxid != 1: print("{}\tname: {}\trank: {}\tparent: {}".format(t.taxid, n.scientific_name, t.rank, t.parent)) t, n = get_taxonomy(t.parent, c)
def rename_nodes_ncbi(tree, verbose=False):
    """
    Rename the nodes based on everything below me, but also give each node a
    unique branch number, appended to the name as ``b_<number>``.

    Leaf names are expected to carry a taxonomy id in square brackets (the
    pattern ``[digits]``). Internal nodes are visited in postorder:

    * if all children have the same name (ignoring their ``b_<number>``
      suffix) the node takes that name plus its own branch number, and the
      ``r_<rank>`` part is stripped from the children's names;
    * otherwise the node is named ``"<name> r_<rank> b_<number>"`` using the
      most specific wanted rank at which all leaves below share one name.
      If there is no such rank the name is left unchanged.

    :param tree: the tree to rename (object providing get_leaves(),
                 traverse(), is_leaf() and get_children(); looks like an ETE
                 tree -- confirm against the caller)
    :param verbose: more output
    :return: the renamed tree
    """

    # connect to the SQL database
    db = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # most specific rank first, so the first hit below is the LOWEST level
    taxonomy = {}

    taxid_re = re.compile(r'\[(\d+)\]')
    branch_suffix_re = re.compile(r'\s+b_\d+')
    rank_tag_re = re.compile(r'r_\w+\s+')

    # first get all the leaves and their parents, so each lineage is only
    # looked up once rather than once per internal node
    for leaf in tree.get_leaves():
        m = taxid_re.search(leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        tid = m.groups()[0]
        taxonomy[leaf.name] = {}
        t, n = get_taxonomy(tid, db)
        if not t:
            continue
        # Walk towards taxid 1 (treated as the root). The node whose parent
        # is 1 is recorded before stopping so a rank directly under the root
        # is not dropped.
        while t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[leaf.name][t.rank] = n.scientific_name
            if t.parent == 1:
                break
            t, n = get_taxonomy(t.parent, db)

    # now traverse every node that is not a leaf and see if we can come up
    # with a unique name for the node!
    if verbose:
        sys.stderr.write("Traversing the tree to rename the nodes\n")

    branchnum = 0
    for node in tree.traverse("postorder"):
        if node.is_leaf():
            continue

        ## if both our children have the same name, we acquire that name and reset their names
        ## otherwise we figure out what our name should be based on the last common ancestor
        ## of the leaves.
        children = node.get_children()
        names = {branch_suffix_re.sub('', x.name) for x in children}
        if len(names) == 1:
            node.name = "{} b_{}".format(names.pop(), branchnum)
            if verbose:
                sys.stderr.write("Reset name to {} because both children are the same\n".format(node.name))
            for child in children:
                oldname = child.name
                child.name = rank_tag_re.sub('', child.name)
                if verbose:
                    sys.stderr.write("\tAs both children the same set {} to {}\n".format(oldname, child.name))
        else:
            ## We have to figure out what our unique name should be
            taxs = {w: set() for w in wanted_levels}
            for leaf in node.get_leaves():
                if leaf.name not in taxonomy:
                    continue
                for w in wanted_levels:
                    if w in taxonomy[leaf.name]:
                        taxs[w].add(taxonomy[leaf.name][w])

            # which is the LOWEST level with a single taxonomy
            for w in wanted_levels:
                if len(taxs[w]) == 1:
                    newname = "{} r_{} b_{}".format(taxs[w].pop(), w, branchnum)
                    if verbose:
                        sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                    node.name = newname
                    break

        # one number per internal node, whichever branch named it
        branchnum += 1

    return tree