Example #1
0
def determine_phylogeny(fn, fqids, verbose=False):
    """
    Determine if we know the phylogeny of this thing.

    The name is first looked up in ``fqids``: as given, then with a trailing
    ``.<digits>.<digits>`` removed, then with any trailing run of digits and
    dots removed. If none of those is a key of ``fqids``, a taxonomy ID
    written as ``[<digits>]`` is taken from the name and its lineage is
    climbed until the next parent would be the root or cellular organisms.

    :param fn: the feature name
    :param fqids: the dict of ids->fastq files
    :param verbose: write a message to stderr when the name cannot be resolved
    :return: the type (currently Bacteria, Archaea, Eukaryota, Metagenome, or unknown), the id in the fastq file, and if a metagenome the type of metagenome
    """

    if fn in fqids:
        return "Metagenome", fn, fqids[fn]

    # Try progressively more aggressive trimming of a numeric suffix. Raw
    # strings keep the regex escapes from being read as (invalid) string escapes.
    for suffix in (r'\.\d+\.\d+$', r'\.[\d\.]+$'):
        trimmed = re.sub(suffix, '', fn)
        if trimmed in fqids:
            return "Metagenome", trimmed, fqids[trimmed]

    m = re.search(r'\[(\d+)\]', fn)
    if not m:
        if verbose:
            sys.stderr.write("There is no taxid in {} and it is not in the fastq file\n".format(fn))
        return "Unknown", fn, None

    tid = m.group(1)
    # NOTE(review): `c` is not defined in this function; it is resolved from
    # module scope and is presumably the taxonomy database connection - confirm.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", fn, None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, fn, None
Example #2
0
def find_rank(tid, trank, tdb, verbose=False):
    """
    Walk up the taxonomy from tid until a node of rank trank is reached.

    Answers are remembered in the module-level ``id2rank`` dict, keyed by
    taxonomy ID; every node passed on the way up is given the same answer.

    :param tid: The taxonomy ID to start from
    :param trank: The taxonomic rank whose name is wanted
    :param tdb: The taxonomy database
    :param verbose: Report on stderr when the rank is not found
    :return: the name at rank trank for tid, or "root" if it is not found
    """
    global id2rank
    if tid in id2rank:
        return id2rank[tid]

    visited = set()
    node, name = get_taxonomy(tid, tdb)
    while True:
        reached_top = node.parent == 1 or node.taxid == 1
        if reached_top or node.rank == trank or node.taxid in id2rank:
            break
        visited.add(node.taxid)
        node, name = get_taxonomy(node.parent, tdb)

    # An ancestor already has an answer: share it with everything below it.
    if node.taxid in id2rank:
        for seen in visited:
            id2rank[seen] = id2rank[node.taxid]
        return id2rank[node.taxid]

    # We landed on the requested rank: record it for the whole path.
    if node.rank == trank:
        for seen in visited:
            id2rank[seen] = name.scientific_name
        id2rank[tid] = name.scientific_name
        return name.scientific_name

    if verbose:
        sys.stderr.write(f"{colours.PINK}ERROR: No rank for {tid}\n{colours.ENDC}")

    return "root"
Example #3
0
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    Each member of the set is a ``<scientific name>:<rank>`` string for one of
    the wanted ranks met while climbing from ``tid`` towards the root.

    :param tid: the taxonomy ID
    :param verbose: print more stuff
    :return: the set of taxonomies, or None if there is no taxonomy for tid
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = (
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'subspecies',
    )
    taxonomy = set()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    while True:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        # Only stop after the current node has been inspected. Testing the
        # parent first silently drops a ranked node that sits directly under
        # the root (in the NCBI taxonomy, e.g. the Viruses superkingdom).
        if t.parent == 1 or t.taxid == 1:
            break
        t, n = get_taxonomy(t.parent, c)

    return taxonomy
Example #4
0
def resolve_taxonomy(tid, conn, verbose=False, nocolor=False):
    """
    Convert the taxonomy id to a tab separated string.

    The string has seven columns, superkingdom through species; a rank that
    is not met on the way to the root is left empty. Results are cached in the
    module-level ``taxonomy_str`` dict, keyed by ``tid``.

    :param tid: the taxonomy id
    :param conn: the database connection
    :param verbose: more output (not used in this function)
    :param nocolor: no color output (not used in this function)
    :return: a string representing the taxonomy
    """

    global taxonomy_str
    if tid in taxonomy_str:
        return taxonomy_str[tid]

    wanted_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]
    rnk = [''] * len(wanted_levels)
    t, n = get_taxonomy(tid, conn)
    while True:
        if t.rank in wanted_levels:
            rnk[wanted_levels.index(t.rank)] = n.scientific_name
        # Stop only after looking at the current node, so that a ranked node
        # directly under the root (parent == 1) still fills its column.
        if t.parent == 1 or t.taxid == 1:
            break
        t, n = get_taxonomy(t.parent, conn)
    taxonomy_str[tid] = "\t".join(map(str, rnk))
    return taxonomy_str[tid]
Example #5
0
def tid_to_tax_set(tid, verbose=False):
    """
    Convert a taxonomy ID to a set that contains the hierarchy.

    The set holds one ``<scientific name>:<rank>`` entry for every wanted
    rank found on the path from ``tid`` up to the root.

    :param tid: the taxonomy ID
    :param verbose: print more stuff
    :return: the set of taxonomies, or None when tid has no taxonomy
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies')
    taxonomy = set()
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return None

    # Inspect every node on the path, including the last one before the root.
    # Checking `t.parent != 1` before looking at the node would skip a ranked
    # node whose parent is the root itself.
    at_top = False
    while not at_top:
        if t.rank in wanted_levels:
            taxonomy.add("{}:{}".format(n.scientific_name, t.rank))
        at_top = t.parent == 1 or t.taxid == 1
        if not at_top:
            t, n = get_taxonomy(t.parent, c)

    return taxonomy
Example #6
0
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id.

    One tab-separated line is written to stdout: the id, followed by the
    names at superkingdom through subspecies (empty where a rank is not met).
    Nothing is printed when the id has no taxonomy.

    :param tid: the taxonomy id
    :param verbose: report on stderr when the id has no taxonomy
    :return: nada
    """

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    while True:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        # Break after the check, not before it: a ranked node directly under
        # the root (parent == 1) must still be written into its column.
        if t.parent == 1 or t.taxid == 1:
            break
        t, n = get_taxonomy(t.parent, c)
    print("\t".join(map(str, [tid] + taxlist)))
Example #7
0
def print_tax(tid, verbose=False):
    """
    Print the taxonomy associated with this id.

    The output is a single tab-separated line on stdout: the id and then one
    field per wanted rank, superkingdom first and subspecies last. Ranks that
    do not occur in the lineage are left blank.

    :param tid: the taxonomy id
    :param verbose: more output (a stderr note when the id is unknown)
    :return: nada
    """

    wanted_levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species', 'subspecies'
    ]
    taxlist = [""] * len(wanted_levels)

    # connect to the SQL database
    c = get_taxonomy_db()

    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("No taxonomy for {}\n".format(tid))
        return

    # Visit each node up to and including the one whose parent is the root;
    # stopping on `t.parent != 1` before the check loses that last node.
    finished = False
    while not finished:
        if t.rank in wanted_levels:
            taxlist[wanted_levels.index(t.rank)] = n.scientific_name
        finished = t.parent == 1 or t.taxid == 1
        if not finished:
            t, n = get_taxonomy(t.parent, c)
    print("\t".join(map(str, [tid] + taxlist)))
Example #8
0
def determine_phylogeny(fn, fqids, verbose=False):
    """
    Determine if we know the phylogeny of this thing.

    Lookup order: the name itself in ``fqids``; the name without a trailing
    ``.<digits>.<digits>``; the name without any trailing run of digits and
    dots; and finally a taxonomy ID written as ``[<digits>]`` inside the name,
    whose lineage is followed upwards until the parent is the root or
    cellular organisms.

    :param fn: the feature name
    :param fqids: the dict of ids->fastq files
    :param verbose: explain on stderr why a name could not be resolved
    :return: the type (currently Bacteria, Archaea, Eukaryota, Metagenome, or unknown), the id in the fastq file, and if a metagenome the type of metagenome
    """

    if fn in fqids:
        return "Metagenome", fn, fqids[fn]

    # Patterns are raw strings so that "\." and "\d" reach the regex engine
    # untouched instead of being treated as invalid string escapes.
    m = re.sub(r'\.\d+\.\d+$', '', fn)
    if m in fqids:
        return "Metagenome", m, fqids[m]
    m = re.sub(r'\.[\d\.]+$', '', fn)
    if m in fqids:
        return "Metagenome", m, fqids[m]

    m = re.search(r'\[(\d+)\]', fn)
    if m is None:
        if verbose:
            sys.stderr.write(
                "There is no taxid in {} and it is not in the fastq file\n".
                format(fn))
        return "Unknown", fn, None

    tid = m.group(1)
    # NOTE(review): `c` comes from module scope, not from this function -
    # presumably the open taxonomy database connection; confirm.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", fn, None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, fn, None
Example #9
0
def rename_nodes(tree, verbose=False):
    """
    Rename the nodes based on everything below me.

    Leaf names are searched for a taxonomy ID written as ``[<digits>]``. Each
    internal node is then renamed to ``<name> r_<rank>``, using the lowest
    wanted rank at which all of its annotated leaves share a single name.
    Internal nodes with no such rank keep their current name.

    :param tree: the tree to rename; it must offer get_leaves() and
        traverse("preorder"), and its nodes are renamed in place
    :param verbose: report leaves whose name has no taxonomy ID
    :return: the same tree object
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    # Most specific rank first, so the first rank with a single name is the lowest.
    wanted_levels = [
        'subspecies', 'species', 'genus', 'family', 'order', 'class',
        'phylum', 'superkingdom'
    ]
    taxid_pattern = re.compile(r'\[(\d+)\]')
    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for leaf in tree.get_leaves():
        m = taxid_pattern.search(leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        lineage = taxonomy[leaf.name] = {}
        t, n = get_taxonomy(m.group(1), c)
        if not t:
            continue
        while True:
            if t.rank in wanted_levels:
                lineage[t.rank] = n.scientific_name
            # Stop only after the node has been recorded; otherwise a ranked
            # node directly under the root (parent == 1) is never stored.
            if t.parent == 1 or t.taxid == 1:
                break
            t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))
        taxs = {w: set() for w in wanted_levels}
        for leaf in node.get_leaves():
            for rank, name in taxonomy.get(leaf.name, {}).items():
                taxs[rank].add(name)
        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # No trailing ")": an unbalanced parenthesis in a node name
                # corrupts the tree when it is written out as Newick.
                newname = "{} r_{}".format(taxs[w].pop(), w)
                sys.stderr.write("Changing name from: {} to {}\n".format(
                    node.name, newname))
                node.name = newname
                break
    return tree
Example #10
0
def rename_nodes(tree, verbose=False):
    """
    Rename the nodes based on everything below me.

    A taxonomy ID written as ``[<digits>]`` is read from each leaf name and
    its lineage is collected. Every non-leaf node is then renamed to
    ``<name> r_<rank>`` for the lowest wanted rank on which all the annotated
    leaves below it agree; nodes without such a rank are left untouched.

    :param tree: the tree to rename (needs get_leaves() and
        traverse("preorder")); node names are changed in place
    :param verbose: write a stderr note for leaves without a taxonomy ID
    :return: the tree that was passed in
    """

    # connect to the SQL database
    c = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # too lazy to write in reverse :)
    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for l in tree.get_leaves():
        m = re.search(r'\[(\d+)\]', l.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(l.name))
            continue
        tid = m.group(1)
        taxonomy[l.name] = {}
        t, n = get_taxonomy(tid, c)
        if not t:
            continue
        # Record each node up to and including the one whose parent is the
        # root. Testing `t.parent != 1` before recording skips that node.
        done = False
        while not done:
            if t.rank in wanted_levels:
                taxonomy[l.name][t.rank] = n.scientific_name
            done = t.parent == 1 or t.taxid == 1
            if not done:
                t, n = get_taxonomy(t.parent, c)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    sys.stderr.write("Traversing the tree\n")
    for node in tree.traverse("preorder"):
        if node.is_leaf():
            continue
        sys.stderr.write("Checking {}\n".format(node.name))
        taxs = {w: set() for w in wanted_levels}
        for l in node.get_leaves():
            if l.name not in taxonomy:
                continue
            for w in wanted_levels:
                if w in taxonomy[l.name]:
                    taxs[w].add(taxonomy[l.name][w])
        # which is the LOWEST level with a single taxonomy
        for w in wanted_levels:
            if len(taxs[w]) == 1:
                # The format used to end in a stray ")", which left an
                # unbalanced parenthesis inside the node name.
                newname = "{} r_{}".format(taxs[w].pop(), w)
                sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                node.name = newname
                break
    return tree
Example #11
0
def determine_phylogeny(fn, verbose=False):
    """
    Work out the top-level taxonomic group for a feature name.

    A taxonomy ID written as ``[<digits>]`` is taken from the name and its
    lineage is climbed until the parent is the root or cellular organisms.

    :param fn: the feature name
    :param verbose: report on stderr when the taxonomy ID is not in the database
    :return: a tuple of the group's scientific name (or "Unknown") and None
    """

    # Raw string: "\[" and "\d" are regex escapes, not string escapes.
    m = re.search(r'\[(\d+)\]', fn)
    if not m:
        return "Unknown", None

    tid = m.group(1)
    # NOTE(review): `c` is looked up in module scope - presumably the
    # taxonomy database connection; confirm.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, None
Example #12
0
def determine_phylogeny(fn, verbose=False):
    """
    Report the top-level taxonomic group named by a feature name.

    The name must contain a taxonomy ID in square brackets, ``[<digits>]``.
    Starting from that ID the lineage is followed upwards and the walk stops
    at the node whose parent is the root or cellular organisms.

    :param fn: the feature name
    :param verbose: write a stderr note if the ID is missing from the database
    :return: (scientific name of that node, None), or ("Unknown", None)
    """

    # The pattern is a raw string so its backslashes are passed to `re` as-is.
    found = re.search(r'\[(\d+)\]', fn)
    if found is None:
        return "Unknown", None

    tid = found.group(1)
    # NOTE(review): `c` is a module-level name here, presumably the taxonomy
    # database connection - confirm against the surrounding script.
    t, n = get_taxonomy(tid, c)
    if not t:
        if verbose:
            sys.stderr.write("Can't find tax for {} in the db\n".format(tid))
        return "Unknown", None

    while t.parent > 1 and t.parent != 131567:
        # 131567 is cellular organisms
        t, n = get_taxonomy(t.parent, c)
    return n.scientific_name, None
Example #13
0
def printtaxa(i, c):
    """
    Print one tab-separated line with the lineage of an identifier.

    The line starts with the identifier and is followed by one name per rank
    listed in the module-level ``want``; ranks that do not occur in the
    lineage are printed as empty fields.

    :param i: identifier
    :param c: database connection
    :return: nothing, the line is written to stdout
    """

    lineage = dict.fromkeys(want, "")
    node, name = get_taxonomy(i, c)
    while True:
        if node.rank in want:
            lineage[node.rank] = name.get_name()
        # the node just handled is the last one below the root
        if node.parent == 1 or node.taxid == 1:
            break
        node, name = get_taxonomy(node.parent, c)

    fields = [str(i)]
    fields.extend(lineage[w] for w in want)
    print("\t".join(fields))
Example #14
0
def taxstring(tid, verbose=False):
    """
    Build the ranked lineage for a taxonomy ID.

    Completed lineages are cached in the module-level ``taxa`` dict. An ID
    that is not in the database is cached as an all-empty lineage.

    :param tid: taxonomy ID
    :param verbose: more output (handed on to choosename)
    :return: an array of the taxonomy from kingdom -> species, with '' for
        ranks that are absent; None if a node in the hierarchy has no name
    """
    global taxa
    if tid in taxa:
        return taxa[tid]

    want = [
        'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'
    ]
    lineage = [''] * len(want)
    conn = get_taxonomy_db()

    try:
        node, names = get_taxonomy(tid, conn)
    except EntryNotInDatabaseError:
        sys.stderr.write(
            f"{bcolors.RED}{tid} not in database.Skipped line{bcolors.ENDC}\n")
        taxa[tid] = lineage
        return taxa[tid]

    # the starting node itself may already sit at a wanted rank
    label = choosename(names, verbose)
    if label and node.rank in want:
        lineage[want.index(node.rank)] = label[0].upper() + label[1:]

    for ancestor in taxonomy_hierarchy(tid, verbose=False):
        node, names = get_taxonomy(ancestor, conn)
        label = choosename(names, verbose)
        if not label:
            sys.stderr.write(
                f"{bcolors.RED}ERROR: No name for {tid}{bcolors.ENDC}\n")
            return None
        if node.rank in want:
            lineage[want.index(node.rank)] = label[0].upper() + label[1:]

    taxa[tid] = lineage
    return taxa[tid]
def resolve_taxonomy(tid, conn, verbose=False, nocolor=False):
    """
    Convert the taxonomy id to a tab separated string.

    Seven tab-separated columns are produced, one per rank from superkingdom
    to species, with an empty column for every rank that is not found above
    ``tid``. The string is cached in the module-level ``taxonomy_str`` dict.

    :param tid: the taxonomy id
    :param conn: the database connection
    :param verbose: more output (ignored by this function)
    :param nocolor: no color output (ignored by this function)
    :return: a string representing the taxonomy
    """

    global taxonomy_str
    if tid in taxonomy_str:
        return taxonomy_str[tid]

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    rnk = [''] * len(wanted_levels)
    t, n = get_taxonomy(tid, conn)
    # Look at every node, including the last one before the root. Testing
    # `t.parent != 1` first would leave the column of a ranked node that hangs
    # directly off the root empty.
    at_top = False
    while not at_top:
        if t.rank in wanted_levels:
            rnk[wanted_levels.index(t.rank)] = n.scientific_name
        at_top = t.parent == 1 or t.taxid == 1
        if not at_top:
            t, n = get_taxonomy(t.parent, conn)
    taxonomy_str[tid] = "\t".join(map(str, rnk))
    return taxonomy_str[tid]
Example #16
0
def print_kingdom(f, verbose=False):
    """
    Print the superkingdom of every name listed in a file.

    Each line of the file is stripped and looked up as a name. For a name
    with no taxonomy ID, ``<name><TAB>Unknown`` is printed; otherwise
    ``<name><TAB><blast name>`` is printed for each superkingdom-ranked node
    in its hierarchy, with the first letter upper-cased.

    :param f: the file to read
    :param verbose: not used in this function
    :return: nothing, output goes to stdout
    """

    conn = get_taxonomy_db()
    with open(f, 'r') as handle:
        for line in handle:
            name = line.strip()
            taxid = get_taxid_for_name(name, conn)
            if not taxid:
                print(f"{name}\tUnknown")
                continue
            for ancestor in taxonomy_hierarchy(taxid, verbose=False):
                node, names = get_taxonomy(ancestor, conn)
                if node.rank == 'superkingdom':
                    blast = names.blast_name
                    kingdom = blast[0].upper() + blast[1:]
                    print(f"{name}\t{kingdom}")
def rename_nodes_ncbi(tree, verbose=False):
    """
    Rename the nodes based on everything below me, but also give each node a unique branch number.

    The branch number is appended to the node name as `` b_<number>``. When
    all children of a node share one name (ignoring their branch numbers) the
    node takes that name and the ``r_<rank>`` tag is removed from the
    children. Otherwise the node is named ``<name> r_<rank> b_<number>`` for
    the lowest wanted rank on which all annotated leaves below it agree; if
    there is no such rank the node keeps its name.

    :param tree: the tree to rename; it must offer get_leaves() and
        traverse("postorder"), and its nodes are renamed in place
    :param verbose: more output
    :return: the renamed tree
    """

    # connect to the SQL database
    conn = get_taxonomy_db()

    wanted_levels = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'subspecies']
    wanted_levels.reverse()  # too lazy to write in reverse :)

    # Raw strings keep the regex escapes out of Python's string-escape handling.
    taxid_pattern = re.compile(r'\[(\d+)\]')
    branch_suffix = re.compile(r'\s+b_\d+')
    rank_tag = re.compile(r'r_\w+\s+')

    taxonomy = {}
    # first get all the leaves and their parents. This is just to speed things up ... maybe
    for leaf in tree.get_leaves():
        m = taxid_pattern.search(leaf.name)
        if not m:
            if verbose:
                sys.stderr.write("No taxid in {}\n".format(leaf.name))
            continue
        lineage = taxonomy[leaf.name] = {}
        t, n = get_taxonomy(m.group(1), conn)
        if not t:
            continue
        while True:
            if t.rank in wanted_levels:
                lineage[t.rank] = n.scientific_name
            # Stop only after the node has been recorded; otherwise a ranked
            # node directly under the root (parent == 1) is never stored.
            if t.parent == 1 or t.taxid == 1:
                break
            t, n = get_taxonomy(t.parent, conn)

    # now traverse every node that is not a leaf and see if we can come up with a
    # unique name for the node!
    if verbose:
        sys.stderr.write("Traversing the tree to rename the nodes\n")
    branchnum = 0
    for node in tree.traverse("postorder"):
        if node.is_leaf():
            continue
        ## if both our children have the same name, we acquire that name and reset their names
        ## otherwise we figure out what our name should be based on the last common ancestor
        ## of the leaves.

        children = node.get_children()
        names = {branch_suffix.sub('', child.name) for child in children}
        if len(names) == 1:
            node.name = "{} b_{}".format(names.pop(), branchnum)
            if verbose:
                sys.stderr.write("Reset name to {} because both children are the same\n".format(node.name))
            # `child`, not `c`: the loop variable must not shadow the
            # database connection.
            for child in children:
                oldname = child.name
                child.name = rank_tag.sub('', child.name)
                if verbose:
                    sys.stderr.write("\tAs both children the same set {} to {}\n".format(oldname, child.name))
        else:
            ## We have to figure out what our unique name should be
            taxs = {w: set() for w in wanted_levels}
            for leaf in node.get_leaves():
                if leaf.name not in taxonomy:
                    continue
                for w in wanted_levels:
                    if w in taxonomy[leaf.name]:
                        taxs[w].add(taxonomy[leaf.name][w])
            # which is the LOWEST level with a single taxonomy
            for w in wanted_levels:
                if len(taxs[w]) == 1:
                    newname = "{} r_{} b_{}".format(taxs[w].pop(), w, branchnum)
                    if verbose:
                        sys.stderr.write("Changing name from: {} to {}\n".format(node.name, newname))
                    node.name = newname
                    break
        branchnum += 1
    return tree
Example #18
0
              33340 name: Neoptera    blast name: Neoptera   rank: infraclass   parent: 7496
              7496 name: Pterygota    blast name: Pterygota   rank: subclass   parent: 85512
              85512 name: Dicondylia    blast name: Dicondylia   rank: no rank   parent: 50557
              50557 name: Insecta    blast name: Insecta   rank: class   parent: 6960
              6960 name: Hexapoda    blast name: insects   rank: superclass   parent: 197562
              197562 name: Pancrustacea    blast name: Pancrustacea   rank: no rank   parent: 197563
              197563 name: Mandibulata    blast name: Mandibulata   rank: no rank   parent: 6656
              6656 name: Arthropoda    blast name: arthropods   rank: phylum   parent: 88770
              88770 name: Panarthropoda    blast name: Panarthropoda   rank: no rank   parent: 1206794
              1206794 name: Ecdysozoa    blast name: Ecdysozoa   rank: no rank   parent: 33317
              33317 name: Protostomia    blast name: Protostomia   rank: no rank   parent: 33213
              33213 name: Bilateria    blast name: Bilateria   rank: no rank   parent: 6072
              6072 name: Eumetazoa    blast name: Eumetazoa   rank: no rank   parent: 33208
              33208 name: Metazoa    blast name: animals   rank: kingdom   parent: 33154
              33154 name: Opisthokonta    blast name: Opisthokonta   rank: no rank   parent: 2759
              2759 name: Eukaryota    blast name: eukaryotes   rank: superkingdom   parent: 131567
              """)
        sys.exit(0)
    else:
        ids=sys.argv[1:]


    c = get_taxonomy_db()
    for i in ids:
        t, n = get_taxonomy(i, c)
        while t.parent != 1 and t.taxid != 1:
            print("{}\tname: {}\trank: {}\tparent: {}".format(t.taxid, n.scientific_name, t.rank, t.parent))
            t, n = get_taxonomy(t.parent, c)


Example #19
0
    with open(args.o, 'w', encoding='utf-8') as out: 
        with open(args.f, 'r', encoding='utf-8') as f:
            for l in f:
                p = l.strip().split("\t")
                while (len(p) < maxp):
                    p.append("")

                if l.startswith("genome_id"):
                    out.write("{}\t{}\n".format(l.strip(), "\t".join(want)))
                    continue

                tid = p[args.c]

                level = {}

                t, n = get_taxonomy(tid, c)

                while t and t.parent > 1 and t.parent != 131567:
                    # 131567 is cellular organisms
                    if t.rank in want:
                        level[t.rank] = n.scientific_name
                    t, n = get_taxonomy(t.parent, c)

                for w in want:
                    if w in level:
                        p.append(level[w])
                    else:
                        p.append("")

                out.write("\t".join(map(str, p)))
                out.write("\n")