Exemple #1
0
def print_tax(tid, verbose=False):
    """
    Print one tab-separated line describing the taxonomy of this id.

    The line starts with ``tid`` and is followed by one column per wanted
    rank (superkingdom through subspecies). Ranks that are not met while
    walking up the lineage are printed as empty strings.

    :param tid: the taxonomy id
    :param verbose: report an id without a taxonomy entry on stderr
    :return: None (nothing is printed when the id has no taxonomy entry)
    """

    wanted_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'subspecies'
    ]
    # one (initially empty) output column per wanted rank
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    # Walk up through the parents. The walk ends at the first node that is
    # taxid 1 or whose parent is 1 (that node itself is not recorded), or
    # as soon as a parent lookup yields no node -- without that last check
    # a missing parent would raise AttributeError on `t.parent`.
    while t and t.parent != 1 and t.taxid != 1:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        t, n = get_taxonomy(t.parent, c)
    print("\t".join(map(str, [tid] + taxlist)))
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id as a single tab-separated line.

    Output columns: the id itself, then the scientific name found for each
    of superkingdom, phylum, class, order, family, genus, species and
    subspecies (empty string where the lineage has no node of that rank).

    :param tid: the taxonomy id
    :param verbose: more output (reports ids that have no taxonomy entry)
    :return: None
    """

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    # sized from wanted_levels so the two can never drift apart
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    # Climb the lineage. `t` is re-checked on every pass because the first
    # lookup above shows get_taxonomy can hand back a falsy node; the same
    # can happen for a parent id and must end the walk rather than crash.
    # The node whose parent is 1 (or that is 1) ends the walk unrecorded.
    while t and t.parent != 1 and t.taxid != 1:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        t, n = get_taxonomy(t.parent, c)
    print("\t".join(map(str, [tid] + taxlist)))
Exemple #3
0
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    Each element of the set is a string of the form
    ``"<scientific name>:<rank>"`` for every node in the lineage whose rank
    is one of superkingdom ... subspecies.

    :param tid: the taxonomy ID
    :param verbose: report an id without a taxonomy entry on stderr
    :return: the set of taxonomies, or None when the id has no entry
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'subspecies'
    ]
    taxonomy = set()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    # Walk up through the parents until we hit taxid 1 / a child of 1
    # (that last node is not added), or until a parent lookup returns no
    # node -- guarding `t` avoids an AttributeError in that case.
    while t and t.parent != 1 and t.taxid != 1:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        t, n = get_taxonomy(t.parent, c)

    return taxonomy
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    The lineage is followed through the parent ids; every node whose rank
    is in the wanted list contributes one ``"name:rank"`` string.

    :param tid: the taxonomy ID
    :param verbose: print more stuff (ids with no taxonomy entry go to stderr)
    :return: the set of taxonomies; None if ``tid`` has no taxonomy entry
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxonomy = set()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    # `t` is tested on each pass: a parent lookup can come back falsy just
    # like the initial one, and that must stop the climb instead of
    # raising on `t.parent`. The node directly below 1 is not recorded.
    while t and t.parent != 1 and t.taxid != 1:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        t, n = get_taxonomy(t.parent, c)

    return taxonomy
Exemple #5
0
def rename_nodes(tree, verbose=False):
    """
    Rename the internal nodes of a tree from the taxonomy of the leaves below.

    A leaf contributes a taxonomy when its name contains a taxid in square
    brackets (``[1234]``). Every internal node is then renamed to
    ``"<name> r_<rank>"`` using the most specific rank for which all of its
    annotated leaves agree on a single name. Nodes without such a rank keep
    their name. The tree is modified in place.

    :param tree: the tree to rename (must provide get_leaves / traverse)
    :param verbose: also report leaves without a taxid and every rename
    :return: the same tree object, renamed
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    # ordered from most to least specific so the first hit is the lowest rank
    wanted_levels = [
        'subspecies', 'species', 'genus', 'family', 'order', 'class',
        'phylum', 'superkingdom'
    ]
    taxid_re = re.compile(r'\[(\d+)\]')
    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for l in tree.get_leaves():
        m = taxid_re.search(l.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(l.name))
            continue
        tid = m.group(1)
        taxonomy[l.name] = {}
        t, n = get_taxonomy(tid, c)
        # `t` is re-checked each pass so that a parent id missing from the
        # database ends the walk instead of raising on `t.parent`
        while t and t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[l.name][t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up
    # with a unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))
        # names seen at each rank among the leaves below this node
        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            for rank, name in taxonomy.get(leaf.name, {}).items():
                taxs[rank].add(name)
        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # the format previously ended in a stray, unbalanced ")"
                newname = "{} r_{}".format(taxs[w].pop(), w)
                if verbose:
                    sys.stderr.write("Changing name from: {} to {}\n".format(
                        node.name, newname))
                node.name = newname
                break
    return tree
Exemple #6
0
def rename_nodes(tree, verbose=False):
    """
    Rename the nodes based on everything below me.

    Step 1 collects, for every leaf whose name carries a ``[taxid]``, the
    scientific name at each wanted rank of its lineage. Step 2 visits every
    internal node and renames it to ``"<name> r_<rank>"`` for the lowest
    rank at which all annotated leaves underneath share one name. Internal
    nodes with no such rank are left unchanged.

    :param tree: the tree whose internal nodes are renamed in place
    :param verbose: more output (leaves without a taxid, each rename)
    :return: the tree that was passed in
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # most specific rank first
    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for l in tree.get_leaves():
        m = re.search(r'\[(\d+)\]', l.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(l.name))
            continue
        tid = m.groups()[0]
        taxonomy[l.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        # stop as well when a parent lookup yields no node, rather than
        # failing on `t.parent`
        while t and t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[l.name][t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for n in tree.traverse("preorder"):
        if n.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(n.name))
        taxs = {w: set() for w in wanted_levels}
        for l in n.get_leaves():
            if l.name not in taxonomy:
                continue
            for w in wanted_levels:
                if w in taxonomy[l.name]:
                    taxs[w].add(taxonomy[l.name][w])
        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # no trailing ")": the old format string left it unbalanced
                newname = "{} r_{}".format(taxs[w].pop(), w)
                # the rename report is verbose-only (it used to sit after
                # a no-op `if verbose: True` and was always written)
                if verbose:
                    sys.stderr.write("Changing name from: {} to {}\n".format(n.name, newname))
                n.name = newname
                break
    return tree
Exemple #7
0
def print_kingdom(f, verbose=False):
    """
    Print the superkingdom for every name listed in a file.

    Each line of the file is stripped and looked up with
    get_taxid_for_name. Names without a taxid are printed as
    ``<name>\\tUnknown``. For the others, every node of the hierarchy with
    rank 'superkingdom' produces a line ``<name>\\t<Blast name>``, with the
    first letter of the blast name upper-cased.

    :param f: path of the file with one name per line
    :param verbose: accepted for interface compatibility; not used here
    :return: None
    """

    db = get_taxonomy_db()
    with open(f, 'r') as fin:
        for raw in fin:
            name = raw.strip()
            taxid = get_taxid_for_name(name, db)
            if not taxid:
                print(f"{name}\tUnknown")
                continue
            for ancestor in taxonomy_hierarchy(taxid, verbose=False):
                node, names = get_taxonomy(ancestor, db)
                if node.rank == 'superkingdom':
                    # upper-case only the first character, keep the rest as is
                    label = names.blast_name[0].upper() + names.blast_name[1:]
                    print(f"{name}\t{label}")
Exemple #8
0
def parse_m8_tophit(m8f, evalue, verbose=False):
    """
    Print the taxonomy of the first acceptable hit of each query in an m8 file.

    For every query (column 1) the first line, in file order, that passes
    all filters is printed as ``<query>\\t<taxonomy columns>\\ttaxid: <tid>``
    and later lines for that query are ignored. A line is filtered out when
    its e-value (column 11) exceeds ``evalue``, when the subject (column 2)
    does not start with ``fig|<taxid>.<n>``, when the taxid is not in the
    taxonomy database, or when element 6 of its taxonomy list contains
    "metagenome".

    Taxonomy lookups are cached in the module-level ``taxonomy`` dict and
    rejected taxids are remembered in the module-level ``ignore_tids`` set.

    :param m8f: the m8 output file from diamond
    :param evalue: the maximum evalue
    :param verbose: more output (announces the file being read)
    :return: None
    """

    fig = re.compile(r'fig\|(\d+)\.\d+')
    c = get_taxonomy_db()
    global taxonomy

    if verbose:
        sys.stderr.write(f"{colors.GREEN}Reading {m8f}{colors.ENDC}\n")
    # queries for which a hit has already been printed
    printed = set()

    with open(m8f, 'r') as f:
        for l in f:
            if not l.strip():
                # a blank line has no column 11 and would raise IndexError
                continue
            p = l.strip().split("\t")
            if float(p[10]) > evalue:
                continue
            if p[0] in printed:
                continue
            m = fig.match(p[1])
            if m:
                tid = m.group(1)
                if tid in ignore_tids:
                    continue
                if tid not in taxonomy:
                    try:
                        taxonomy[tid] = taxonomy_hierarchy_as_list(c, tid, True)
                    except EntryNotInDatabaseError:
                        ignore_tids.add(tid)
                        continue
                if taxonomy[tid]:
                    # NOTE(review): index 6 is presumably the species
                    # column of the hierarchy list -- confirm against
                    # taxonomy_hierarchy_as_list
                    if 'metagenome' in taxonomy[tid][6].lower():
                        taxonomy.pop(tid)
                        ignore_tids.add(tid)
                        continue
                    r = "\t".join(taxonomy[tid])
                    printed.add(p[0])
                    print(f"{p[0]}\t{r}\ttaxid: {tid}")
Exemple #9
0
def parse_m8(m8f, evalue, verbose=False):
    """
    Group the taxonomies of the hits in an m8 file by query and report them.

    Lines whose e-value (column 11) exceeds ``evalue`` are skipped. For the
    remaining lines, the taxonomy list of every subject (column 2) that
    starts with ``fig|<taxid>.<n>`` and is known to the database is
    collected. Each time the query id (column 1) changes, and once more
    after the last line, the collected lists are handed to
    ``printmatches(query_id, matches)``.

    Taxonomy lookups are cached in the module-level ``taxonomy`` dict and
    unknown taxids are remembered in the module-level ``ignore_tids`` set.

    :param m8f: the m8 output file from diamond
    :param evalue: the maximum evalue
    :param verbose: more output (announces the file being read)
    :return: None
    """

    fig = re.compile(r'fig\|(\d+)\.\d+')
    c = get_taxonomy_db()
    global taxonomy

    matches = []
    if verbose:
        sys.stderr.write(f"{colors.GREEN}Reading {m8f}{colors.ENDC}\n")
    lastid = None

    with open(m8f, 'r') as f:
        for l in f:
            if not l.strip():
                # a blank line has no column 11 and would raise IndexError
                continue
            p = l.strip().split("\t")
            if float(p[10]) > evalue:
                continue
            if lastid and p[0] != lastid:
                # the query changed: report the group we just finished
                printmatches(lastid, matches)
                matches = []
            lastid = p[0]
            m = fig.match(p[1])
            if m:
                tid = m.group(1)
                if tid in ignore_tids:
                    continue
                if tid not in taxonomy:
                    try:
                        taxonomy[tid] = taxonomy_hierarchy_as_list(c, tid, True)
                    except EntryNotInDatabaseError:
                        ignore_tids.add(tid)
                        continue
                if taxonomy[tid]:
                    matches.append(taxonomy[tid])

    # The loop only reports a group when the *next* query starts, so the
    # final query in the file has to be reported here or it is lost.
    if lastid:
        printmatches(lastid, matches)
Exemple #10
0
def taxstring(tid, verbose=False):
    """
    Return the taxonomy of an id as a seven element list, kingdom -> species.

    Results are cached in the module-level ``taxa`` dict. An id that is not
    in the database is reported on stderr and cached as a list of empty
    strings. If any node of the hierarchy has no usable name, an error is
    written to stderr and None is returned (nothing is cached).

    :param tid: taxonomy ID
    :param verbose: more output (passed on to choosename)
    :return: a list of names for kingdom, phylum, class, order, family,
             genus and species ('' for ranks that were not found), or None
    """
    global taxa
    if tid in taxa:
        return taxa[tid]

    want = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    thistaxa = ['' for _ in want]

    def store(node, label):
        # put the label, first letter upper-cased, into the slot of its rank
        if node.rank in want:
            thistaxa[want.index(node.rank)] = label[0].upper() + label[1:]

    c = get_taxonomy_db()
    try:
        m, n = get_taxonomy(tid, c)
    except EntryNotInDatabaseError:
        sys.stderr.write(
            f"{bcolors.RED}{tid} not in database.Skipped line{bcolors.ENDC}\n")
        taxa[tid] = thistaxa
        return taxa[tid]

    # the queried node itself: a missing name is tolerated here
    label = choosename(n, verbose)
    if label:
        store(m, label)

    # every node of the hierarchy: a missing name aborts the whole lookup
    for p in taxonomy_hierarchy(tid, verbose=False):
        m, n = get_taxonomy(p, c)
        label = choosename(n, verbose)
        if not label:
            sys.stderr.write(
                f"{bcolors.RED}ERROR: No name for {tid}{bcolors.ENDC}\n")
            return
        store(m, label)

    taxa[tid] = thistaxa
    return taxa[tid]
Exemple #11
0
        t, n = get_taxonomy(t.parent, tdb)
    if t.taxid in id2rank:
        for s in seenids:
            id2rank[s] = id2rank[t.taxid]
        return id2rank[t.taxid]
    if t.rank == trank:
        for s in seenids:
            id2rank[s] = n.scientific_name
        id2rank[tid] = n.scientific_name
        return n.scientific_name
    if verbose:
        sys.stderr.write(f"{colours.PINK}ERROR: No rank for {tid}\n{colours.ENDC}")

    return "root"




if __name__ == '__main__':
    # command line: -t <rank> is mandatory, -v switches on verbose output
    parser = argparse.ArgumentParser(
        description="Print the taxonomic rank for all ids ")
    parser.add_argument('-t', required=True, help='taxnomic rank')
    parser.add_argument('-v', action='store_true', help='verbose output')
    args = parser.parse_args()

    c = get_taxonomy_db()
    ais = all_ids(c, args.v)
    sys.stderr.write(f"{colours.GREEN}There are {len(ais)} ids\n{colours.ENDC}")
    # one output line per id: "<id>\t<name at the requested rank>"
    for row in ais:
        taxid = row[0]  # the id is the first element of each result row
        rank_name = find_rank(taxid, args.t, c, args.v)
        print(f"{taxid}\t{rank_name}")
              33340 name: Neoptera    blast name: Neoptera   rank: infraclass   parent: 7496
              7496 name: Pterygota    blast name: Pterygota   rank: subclass   parent: 85512
              85512 name: Dicondylia    blast name: Dicondylia   rank: no rank   parent: 50557
              50557 name: Insecta    blast name: Insecta   rank: class   parent: 6960
              6960 name: Hexapoda    blast name: insects   rank: superclass   parent: 197562
              197562 name: Pancrustacea    blast name: Pancrustacea   rank: no rank   parent: 197563
              197563 name: Mandibulata    blast name: Mandibulata   rank: no rank   parent: 6656
              6656 name: Arthropoda    blast name: arthropods   rank: phylum   parent: 88770
              88770 name: Panarthropoda    blast name: Panarthropoda   rank: no rank   parent: 1206794
              1206794 name: Ecdysozoa    blast name: Ecdysozoa   rank: no rank   parent: 33317
              33317 name: Protostomia    blast name: Protostomia   rank: no rank   parent: 33213
              33213 name: Bilateria    blast name: Bilateria   rank: no rank   parent: 6072
              6072 name: Eumetazoa    blast name: Eumetazoa   rank: no rank   parent: 33208
              33208 name: Metazoa    blast name: animals   rank: kingdom   parent: 33154
              33154 name: Opisthokonta    blast name: Opisthokonta   rank: no rank   parent: 2759
              2759 name: Eukaryota    blast name: eukaryotes   rank: superkingdom   parent: 131567
              """)
        sys.exit(0)
    else:
        ids=sys.argv[1:]


    c = get_taxonomy_db()
    for i in ids:
        t, n = get_taxonomy(i, c)
        while t.parent != 1 and t.taxid != 1:
            print("{}\tname: {}\trank: {}\tparent: {}".format(t.taxid, n.scientific_name, t.rank, t.parent))
            t, n = get_taxonomy(t.parent, c)


def rename_nodes_ncbi(tree, verbose=False):
    """
    Rename the nodes based on everything below me, but also give each node a unique branch number.

    The branch number is appended to the node name as `` b_<number>``; the
    counter increases by one for every internal node visited, whether or
    not that node was renamed.

    Internal nodes are visited in postorder:

    * If all children have the same name (ignoring their `` b_<number>``
      suffix) the node takes that name plus its own branch number, and the
      ``r_<rank> `` part is removed from the children's names.
    * Otherwise the node is named ``<name> r_<rank> b_<number>`` for the
      most specific rank at which all annotated leaves below it share a
      single name. If there is no such rank the node keeps its name.

    Leaves are annotated when their name contains a taxid in square
    brackets (``[1234]``). The tree is modified in place.

    :param tree: the tree to rename
    :param verbose: more output
    :return: the renamed tree
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # most specific rank first

    # raw strings: the patterns contain backslash escapes meant for `re`
    taxid_re = re.compile(r'\[(\d+)\]')
    branch_re = re.compile(r'\s+b_\d+')
    rank_re = re.compile(r'r_\w+\s+')

    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for l in tree.get_leaves():
        m = taxid_re.search(l.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(l.name))
            continue
        tid = m.groups()[0]
        taxonomy[l.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        # also stop when a parent lookup yields no node, instead of
        # raising on `t.parent`
        while t and t.parent != 1 and t.taxid != 1:
            if t.rank in wanted_levels:
                taxonomy[l.name][t.rank] = n.scientific_name
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    if verbose:
        sys.stderr.write("Traversing the tree to rename the nodes\n")
    branchnum = 0
    for n in tree.traverse("postorder"):
        if n.is_leaf():
            continue
        ## if both our children have the same name, we acquire that name and reset their names
        ## otherwise we figure out what our name should be based on the last common ancestor
        ## of the leaves.

        children = n.get_children()
        names = {branch_re.sub('', x.name) for x in children}
        if len(names) == 1:
            n.name = "{} b_{}".format(names.pop(), branchnum)
            if verbose:
                sys.stderr.write("Reset name to {} because both children are the same\n".format(n.name))
            # `child`, not `c`: do not shadow the database handle above
            for child in children:
                oldname = child.name
                child.name = rank_re.sub('', child.name)
                if verbose:
                    sys.stderr.write("\tAs both children the same set {} to {}\n".format(oldname, child.name))
        else:
            ## We have to figure out what our unique name should be
            taxs = {w: set() for w in wanted_levels}
            for l in n.get_leaves():
                if l.name not in taxonomy:
                    continue
                for w in wanted_levels:
                    if w in taxonomy[l.name]:
                        taxs[w].add(taxonomy[l.name][w])
            # which is the LOWEST level with a single taxonomy
            for w in wanted_levels:
                if len(taxs[w]) == 1:
                    newname = "{} r_{} b_{}".format(taxs[w].pop(), w, branchnum)
                    if verbose:
                        sys.stderr.write("Changing name from: {} to {}\n".format(n.name, newname))
                    n.name = newname
                    break
        branchnum += 1
    return tree