コード例 #1
0
ファイル: Kraken-SBT.py プロジェクト: tbenavi1/Kraken-SBT
def get_tree(taxonids, num_taxonids = 0):
    """Return the NCBI taxonomy topology spanning *taxonids*.

    When *num_taxonids* is non-zero, only the first *num_taxonids* IDs are
    used — a smaller set for faster tree construction while testing.
    The full dataset yields roughly 5,360 nodes.
    """
    if num_taxonids:
        taxonids = taxonids[:num_taxonids]
    taxonomy = NCBITaxa()
    return taxonomy.get_topology(taxonids)
コード例 #2
0
def get_ncbi_taxonomy_species_tree(names_list):
    """Return the NCBI taxonomy topology connecting the given species names.

    Args:
        names_list (list of str): scientific names resolvable by NCBI.

    Returns:
        ete3 tree: topology of the corresponding taxids.

    Raises:
        KeyError: if a name cannot be translated to a taxid.
    """
    ncbi = NCBITaxa()
    name2taxid_dic = ncbi.get_name_translator(names_list)
    # Each name maps to a list of candidate taxids; keep the first one,
    # preserving the caller's input order (idiom fix: replaces an
    # index-based loop over range(len(names_list))).
    taxid_list = [name2taxid_dic[name][0] for name in names_list]
    return ncbi.get_topology(taxid_list)
コード例 #3
0
def get_tax(
    cma_file
):  #finds the 'lowest common ancestor' of species represented in a cma file
    """Find the lowest common ancestor (LCA) of the species named in a cma file.

    Organism names are read from header lines (">...") between square
    brackets. Sequences from synthetic constructs (taxid 32630) and viruses
    (taxid 10239) are omitted, as are names unknown to NCBI.

    Args:
        cma_file (str): path to the cma file.

    Returns:
        tuple: (lca_name, {taxid: name} dict, ascii tree labeled with
        sci_name and taxid).
    """
    ncbi = NCBITaxa()
    org_regex = re.compile(r'\[(.*)\]')
    taxid_list = []
    # Bug fix: the original leaked the file handle; use a context manager.
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            find_org_name = org_regex.search(line)
            if find_org_name is None:
                continue
            org_name = find_org_name.group(1)
            translation = ncbi.get_name_translator([org_name])
            if not translation:
                continue  # name unknown to NCBI (original checked for '{}')
            # Bug fix: the original regex-parsed str(dict), which produced a
            # malformed id when a name mapped to several taxids; take the
            # first candidate taxid directly instead.
            taxid = str(translation[org_name][0])
            #omit sequences from viruses and synthetic constructs
            if taxid != '32630' and taxid != '10239':
                taxid_list.append(taxid)
    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])
    # Bug fix: the original did str(tree.get_tree_root) — missing call
    # parentheses — and relied on regex-scraping the bound method's repr.
    lca_id = tree.get_tree_root().name
    # get_taxid_translator returns {taxid: name}; take the single value.
    lca_name = list(ncbi.get_taxid_translator([lca_id]).values())[0]
    return (lca_name, tax_list, tree_labeled)
コード例 #4
0
    def get_distance_to_common_ancestor(self, other):
        """ Calculate the number of links in the NCBI taxonomic tree between two taxa and their latest common ancestor

        Note: This distance depends on the granularity of the lineage of the taxon. For example, there are only 7 links
        between most bacteria species and the Bacteria superkingdom. However, there are 28 links between the Homo sapiens
        species and the Eukaryota superkingdom.

        Args:
            other (:obj:`Taxon`): a second taxon

        Returns:
             :obj:`int`: number of links between :obj:`self` and its latest common ancestor with :obj:`other` in the NCBI
                taxonomic tree, or :obj:`None` if this taxon has no nearest NCBI taxon
        """
        # Bug fix: the original returned the undefined name
        # `id_of_nearest_ncbi_taxon`, which raised a NameError at runtime.
        if self.id_of_nearest_ncbi_taxon is None:
            return None

        ncbi_taxa = NCBITaxa()
        tree = ncbi_taxa.get_topology(
            [self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon],
            intermediate_nodes=True)
        # Topology node names are taxid strings.
        self_node = tree.search_nodes(
            name=str(self.id_of_nearest_ncbi_taxon))[0]
        other_node = tree.search_nodes(
            name=str(other.id_of_nearest_ncbi_taxon))[0]
        ancestor = tree.get_common_ancestor(self_node, other_node)
        # Distance inside the NCBI tree plus the distance from this taxon to
        # its nearest NCBI taxon.
        return tree.get_distance(
            self_node, ancestor) + self.distance_from_nearest_ncbi_taxon
コード例 #5
0
ファイル: topology.py プロジェクト: ThThalamas/pyproteinsExt
    def create_ete3_tree(self):
        """Build an ete3 taxonomy topology for the entries and decorate it.

        Every node of the resulting tree gets three attributes:
          - domains:  union of domains seen at the node and all descendants
          - proteins: union of proteins seen at the node and all descendants
          - sameDomainNode: other nodes carrying exactly the same domain set

        The tree is stored on ``self.ete3_tree``.

        Raises:
            Exception: if any entry lacks a taxid.
        """
        ncbi = NCBITaxa()
        taxids = set([e.taxo.taxid for e in self])
        if None in taxids:
            raise Exception("Entries doesn't have taxids")
        tree = ncbi.get_topology(list(taxids))

        # Complete Tree object with list of domains and proteins for each node.
        # Postorder traversal visits leaves first, so children are populated
        # before their parents aggregate them.
        node_list = []
        for n in tree.traverse('postorder'):
            n.sameDomainNode = set()
            node_list.append(n)
            n.domains = set([h.domain for e in self for h in e.hmmr if e.taxo.taxid == n.name])
            n.proteins = set([e.prot for e in self if e.taxo.taxid == n.name])
            if n.get_descendants():
                for child in n.children:
                    n.domains.update(child.domains)
                    n.proteins.update(child.proteins)

        # Complete Tree object with list of nodes with same domains for each
        # node. (Fix: removed the unused counter variable `c` the original
        # incremented but never read.)
        for i in range(len(node_list)):
            for j in range(i + 1, len(node_list)):
                n1 = node_list[i]
                n2 = node_list[j]
                # Set equality replaces the original two-step
                # "same length and empty difference" check.
                if n1.domains == n2.domains:
                    n1.sameDomainNode.add(n2)
                    n2.sameDomainNode.add(n1)
        self.ete3_tree = tree
コード例 #6
0
def build_complete_tree(tree, log):
    """Build the taxonomic tree including internal nodes (for rank dependent evaluation)

    Args:
        tree (str): file name of the reference tree, without internal nodes
            (doc fix: passed to ete3's Tree(), which takes a path/newick,
            not a file object)
        log (fileobject): log file

    Returns:
        ete3 tree: the complete topology with intermediate nodes.
    """
    ncbi = NCBITaxa()
    original_tree = Tree(tree, format=1)
    taxa = [n.taxid for n in original_tree.traverse('postorder')]
    # Retry until every remaining taxid is known to the ETE database.
    while True:
        try:
            complete_tree = ncbi.get_topology(taxa, intermediate_nodes=True)
            break
        except KeyError as e:
            # If a taxid is not found, drop it and rebuild without it.
            # (Fix: removed the redundant `pass` and the boolean flag.)
            taxid_not_found = int(e.args[0])
            taxa.remove(taxid_not_found)
            if log:
                print('[prophyle_otu_table] ERROR: TaxID ' +
                      str(taxid_not_found) +
                      ' not found in ETE DB (try updating it)',
                      file=log)

    return complete_tree
コード例 #7
0
    def get_NCBI_tree(self,seqidmap):
        """Build an NCBI taxonomy topology from a seqid-to-taxid mapping file.

        Each line of *seqidmap* is tab-separated with the taxid in the
        second column; intermediate nodes are kept in the topology.
        """
        ncbi = NCBITaxa()
        with open(seqidmap, "r") as mapping:
            taxids = [row.split("\t")[1].strip("\n") for row in mapping]
        return ncbi.get_topology(taxids, intermediate_nodes=True)
コード例 #8
0
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file

    """
    ncbi = NCBITaxa()

    taxon_ids = []

    # Per-phylum counter used to build labels such as "Prot1", "Prot2", ...
    # "no_information" rows get no numeric suffix at all.
    phylum_count = {}
    with open(taxon_output_file, "w") as phylum_file:
        csvwriter = csv.writer(phylum_file, delimiter="\t")
        csvwriter.writerow([
            "species", "taxid", "phylum_number", "phylum", "class", "order",
            "family", "genus", "species"
        ])
        with open(mpwt_taxon_file, "r") as taxon_file:
            csvfile = csv.reader(taxon_file, delimiter="\t")
            for line in csvfile:
                # Skip the header row (its second column contains "taxon").
                if "taxon" not in line[1]:
                    taxon_ids.append(line[1])
                    lineage = ncbi.get_lineage(line[1])
                    lineage2ranks = ncbi.get_rank(lineage)
                    names = ncbi.get_taxid_translator(lineage)
                    ranks2lineage = dict(
                        (rank, names[taxid])
                        for (taxid, rank) in lineage2ranks.items())
                    ranks = [
                        ranks2lineage.get(rank, "no_information") for rank in [
                            "phylum", "class", "order", "family", "genus",
                            "species"
                        ]
                    ]
                    # Abbreviate the phylum to its first four characters.
                    if ranks[0] != "no_information":
                        phylum = ranks[0][:4]
                    else:
                        phylum = "no_information"
                    # Bug fix: test for "no_information" first. The original
                    # checked "not in phylum_count" first, so the FIRST
                    # unnamed row got a spurious "1" suffix
                    # ("no_information1") while later ones did not.
                    if phylum == "no_information":
                        phylum_count[phylum] = ""
                    elif phylum not in phylum_count:
                        phylum_count[phylum] = 1
                    else:
                        phylum_count[phylum] += 1
                    row = ([line[0], line[1]] +
                           [phylum + str(phylum_count[phylum])] + ranks)
                    csvwriter.writerow(row)

    tree = ncbi.get_topology(taxon_ids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
コード例 #9
0
def plot_taxids(taxids_list, tree_png, tree_nw, tax_db=None):
    """Render the taxonomy topology of *taxids_list* as a circular image.

    Writes a PNG to *tree_png* and a newick file to *tree_nw*. When
    *tax_db* is given it is used as the NCBITaxa sqlite database file.
    """
    ncbi = NCBITaxa(dbfile=tax_db) if tax_db is not None else NCBITaxa()

    tree = ncbi.get_topology(taxids_list)
    ncbi.annotate_tree(tree, taxid_attr="sci_name")

    style = TreeStyle()
    style.show_leaf_name = False
    style.mode = "c"
    style.layout_fn = layout

    tree.render(tree_png, tree_style=style)
    tree.write(format=1, outfile=tree_nw)
コード例 #10
0
def get_off_target_last_common_taxon_rank(df, target_rank, target_taxon):
    """Return the rank of the last common taxon of the off-target taxid
    stored at df.loc[target_rank] and *target_taxon*.

    Falls back to 'no rank' when either taxid is 0/missing or when the
    common ancestor itself has no informative rank.
    """
    ncbi = NCBITaxa()
    off_target = df.loc[target_rank]
    # Guard clauses replace the original nested if/else pyramid.
    if target_taxon == 0 or off_target == 0 or pd.isnull(off_target):
        return ('no rank')
    last_common_taxon = ncbi.get_topology([off_target, target_taxon])
    if last_common_taxon.rank == 'no rank':
        return ('no rank')
    lineage = ncbi.get_lineage(last_common_taxon.taxid)
    return (ncbi.get_rank([lineage[-1]])[lineage[-1]])
コード例 #11
0
ファイル: ete_gui.py プロジェクト: dengzq1234/ete_gui
def run_ete_ncbiquery_py(query):
    """Resolve a comma-separated list of taxids and/or scientific names to an
    NCBI topology rendered as ascii art.

    Args:
        query (str): comma-separated mixture of numeric taxids and names.

    Returns:
        str: ascii representation of the tree with sci_name and rank labels.
    """
    ncbi = NCBITaxa()
    final_query = []
    for item in query.split(','):
        # Bug fix: the original called i.lstrip() and discarded the result
        # (strings are immutable); bind the stripped value.
        item = item.lstrip()
        try:
            final_query.append(int(item))
        except ValueError:
            # Not numeric: treat it as a scientific name and translate.
            final_query += ncbi.get_name_translator([item])[item]
    tree = ncbi.get_topology(final_query)

    return tree.get_ascii(attributes=["sci_name", "rank"])
コード例 #12
0
def my_tree():
    """Build a demo NCBI topology with random node weights and a circular
    TreeStyle.

    Returns:
        tuple: (ete3 tree, TreeStyle) ready to be rendered.
    """
    ncbi = NCBITaxa()
    tree = ncbi.get_topology([54263, 8324, 8323, 8327, 8325, 57571, 323754])

    # Attach a random weight feature to every node (used by the layout).
    for node in tree.traverse():
        node.add_features(weight=random.randint(0, 50))

    ts = TreeStyle()
    ts.layout_fn = layout
    ts.mode = "c"  # circular layout
    ts.show_branch_length = True
    ts.show_branch_support = True

    # Fix: removed the original's dead `get_ascii(...)` call whose result was
    # discarded, and renamed the local that shadowed the function name.
    return tree, ts
コード例 #13
0
    def get_common_ancestor(self, other):
        """ Get the latest common ancestor of two taxa

        Args:
            other (:obj:`Taxon`): a second taxon

        Returns:
            :obj:`Taxon`: latest common ancestor, or :obj:`None` if this
                taxon has no nearest NCBI taxon
        """
        # Bug fix: the original returned the undefined name
        # `id_of_nearest_ncbi_taxon`, which raised a NameError at runtime.
        if self.id_of_nearest_ncbi_taxon is None:
            return None

        ncbi_taxa = NCBITaxa()
        # NOTE(review): assumes other.id_of_nearest_ncbi_taxon is not None —
        # confirm with callers.
        tree = ncbi_taxa.get_topology([self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon], intermediate_nodes=True)
        # Topology node names are taxid strings.
        self_node = tree.search_nodes(name=str(self.id_of_nearest_ncbi_taxon))[0]
        other_node = tree.search_nodes(name=str(other.id_of_nearest_ncbi_taxon))[0]
        ancestor = tree.get_common_ancestor(self_node, other_node)
        cls = self.__class__
        return cls(ncbi_id=float(ancestor.name))
コード例 #14
0
ファイル: topology.py プロジェクト: ThThalamas/pyproteinsExt
    def compute_upper_node_and_distance(self, core_domains=None):
        """Assign each non-core domain its upper taxonomy node and the mean
        pairwise distance between the taxa carrying it.

        Args:
            core_domains (list, optional): domain names to skip. Defaults to
                an empty collection. (Fix: the original used a mutable
                default argument ``core_domains=[]``.)

        Raises:
            Exception: if domain_entries or ete3_tree has not been computed.
        """
        core_domains = [] if core_domains is None else core_domains
        ncbi = NCBITaxa()
        if not self.domain_entries:
            raise Exception("Compute domain_entries first.")
        if not self.ete3_tree:
            raise Exception("Compute ete3_tree first.")

        for d in self.domain_entries.values():
            if d.name in core_domains:
                continue
            if len(d.taxo) == 1:
                # A single taxon: the upper node is that taxon itself and the
                # mean distance is zero.
                taxo = list(d.taxo)[0]
                d.upper_node = self.ete3_tree.search_nodes(name=taxo.taxid)[0]
                d.mean_distance = 0
            else:
                list_taxids = list(set([t.taxid for t in d.taxo]))
                domain_tree = ncbi.get_topology(list_taxids)
                # The first node yielded by traverse() is the root of the
                # topology spanning the domain's taxa.
                d.upper_node = next(domain_tree.traverse())
                # Mean of all pairwise distances between the domain's taxa.
                distances = []
                for i in range(len(list_taxids)):
                    for j in range(i + 1, len(list_taxids)):
                        dist = self.ete3_tree.get_distance(list_taxids[i], list_taxids[j])
                        distances.append(dist)
                d.mean_distance = mean(distances)
コード例 #15
0
def get_taxo_of():
    """Flask endpoint: build a taxonomy tree from a JSON list of taxids.

    Expects a JSON body of the form ``{"taxids": ["9606", ...]}`` and
    returns ``{"success": true, "tree": {...}}``, or an error payload with
    HTTP status 400.
    """
    if not request.is_json:
        return jsonify(success=False, reason="Bad content type"), 400

    data = request.json

    # Idiom fix: "x not in d" instead of "not x in d".
    if 'taxids' not in data:
        return jsonify(success=False, reason="Taxonomic IDs are missing"), 400

    if not isinstance(data['taxids'], list):
        return jsonify(
            success=False,
            reason="Taxonomic IDs must be sended as a string array"), 400

    # Must be instantiated on every request: it uses a SQLite connection,
    # which is bound to the thread that created it (translated from the
    # original French comment).
    ncbi = NCBITaxa()
    sended_list = data['taxids']

    if not sended_list:
        return jsonify(success=False, reason="Query list is empty"), 400

    # Convert every string ID of the list to an integer.
    # map() is lazy, so a ValueError surfaces only when the iterator is
    # consumed inside get_topology below.
    sended_list = map(int, sended_list)

    try:  # Consume the iterator
        tree_topology = ncbi.get_topology(sended_list)
    except ValueError:
        return jsonify(
            success=False,
            reason="One of the sended IDs is not a valid integer"), 400

    # Constructing root
    tree = {tree_topology.name: node_to_dict(ncbi, tree_topology)}

    return jsonify(success=True, tree=tree)
コード例 #16
0
def main(argv):

    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
			Summarize and filter alignments by taxid.

			Required arguments are --dbfile, --inherited_markers, --taxid_link, --readcounts, --primarytab, --eukfrac, --alltab, --taxid_genelens

			"""),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("--dbfile",
                        type=str,
                        action="store",
                        dest="dbfile",
                        help="Eukdetect database folder",
                        required=True)

    parser.add_argument("--inherited_markers",
                        type=str,
                        action="store",
                        dest="inherited_markers",
                        help="Eukdetect database folder",
                        required=True)

    parser.add_argument("--taxid_link",
                        type=str,
                        action="store",
                        dest="taxid_link",
                        help="Eukdetect database folder",
                        required=True)

    parser.add_argument("--readcounts",
                        type=str,
                        action="store",
                        dest="readcounts",
                        help="Read counts and mismatches file.",
                        required=True)

    parser.add_argument("--eukfrac",
                        type=str,
                        action="store",
                        dest="eukfrac",
                        help="Eukaryotic abundance & fraction output file",
                        required=True)

    parser.add_argument("--primarytab",
                        type=str,
                        action="store",
                        dest="primarytab",
                        help="Table output of filtered hits.",
                        required=True)

    parser.add_argument("--alltab",
                        type=str,
                        action="store",
                        dest="alltab",
                        help="Table output of all hits.",
                        required=True)
    parser.add_argument("--taxid_genelens",
                        type=str,
                        action="store",
                        dest="taxid_genelens",
                        help="Cumulative gene length per taxid",
                        required=True)

    files = parser.parse_args()

    #initialize NCBI taxdb
    ncbi = NCBITaxa(files.dbfile)

    #taxid genelength correspondence
    #taxid_genelen = {taxid: length}
    taxid_genelen = {
        line.split('\t')[0]: int(line.split('\t')[1].strip('\n'))
        for line in open(files.taxid_genelens)
    }

    #create 2 dicts for ease of lookup

    #correspondence between taxid & marker gene name
    #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
    #seq_taxids = {seq: taxid, seq:taxid} Save every seq

    taxid_seqs = {}
    seq_taxids = {}
    for line in open(files.taxid_link):
        line = line.strip('\n')
        taxid = line.split('\t')[1]
        if taxid not in taxid_seqs:
            taxid_seqs[taxid] = []
        seq = line.split('\t')[0]
        taxid_seqs[taxid].append(seq)
        seq_taxids[seq] = taxid

    #save contents of read_counts_and_mismatches file as dict per observed taxid
    #save observed genuses
    #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage, pid, busco]]}
    taxid_counts = {}

    counter = 0
    countfile = open(files.readcounts)
    countfile.readline()

    genuses = {}
    above_species = []
    #genuses: {genus:[taxid, taxid, taxid]}

    for line in countfile:
        counter += 1
        line = line.strip('\n')
        seq = line.split('\t')[0]
        count = int(line.split('\t')[1])
        correct_bases = int(line.split('\t')[2])
        incorrect_bases = int(line.split('\t')[3])
        total_bases = int(line.split('\t')[4])
        subjlen = int(line.split('\t')[5])
        coverage = float(line.split('\t')[6])
        pid = float(line.split('\t')[7])
        taxid = seq_taxids[seq]

        if "Collapse" not in seq:
            busco = re.findall('-\d*at\d*-', seq)[0].strip('-')
        else:
            busco = "Collapsed"

        #determine genus
        lineage = ncbi.get_lineage(int(taxid))
        ranks = {value: key for (key, value) in ncbi.get_rank(lineage).items()}
        #lowest = list(ncbi.get_rank([lineage[-1]]).values())[0]
        if 'genus' in ranks and 'Collapse' not in seq and "species" in ranks:  #lowest != "genus": #dont filter if it's at the genus level
            genus = ranks['genus']
            if genus not in genuses:
                genuses[genus] = []
            if taxid not in genuses[genus]:
                genuses[genus].append(taxid)
        elif "SSCollapse" not in seq:  #don't add anything that's got SSCollapse in it
            above_species.append(taxid)

        #save info per sequence in seq_counts dict
        #seq_counts[seq] = [count, correct_bases, total_bases, subjlen, coverage, pid, busco]

        if taxid not in taxid_counts:
            taxid_counts[taxid] = []
            #find the genus if not a spcollapsed gene

        taxid_counts[taxid].append([
            seq, count, correct_bases, total_bases, subjlen, coverage, pid,
            busco
        ])

    if counter == 0:
        message = "Empty read count file. Likely no aligned reads in sample."
        #print(message)
        #still have to write stuff
        f = open(files.eukfrac, 'w')
        f.write(message + '\n')
        f.close()
        f = open(files.alltab, 'w')
        f.write(message + '\n')
        f = open(files.primarytab, 'w')
        f.write(message + '\n')
        f.close()
        sys.exit()
    countfile.close()

    #done parsing read_counts_and_mismatches file

    #calculate stats for each observed taxid
    taxon_coverage = {}

    #taxon_coverage[taxon] = [
    #observed_markers,
    #readcounts,
    #total_bases,
    #percentage_markers,
    #marker_coverage,
    #percent_id,
    #total_observed_marker_len,
    #buscos,
    #total_gene_length,
    #total_markers]

    seen_taxids = []
    for tax in taxid_counts:
        mc = len(taxid_counts[tax])
        counts = 0
        bases = 0
        correct = 0
        total_bases = 0
        subj_len = 0
        buscos = []
        for i in range(0, len(taxid_counts[tax])):

            busco = taxid_counts[tax][i][-1]
            if len(busco) > 1:
                buscos.append(busco)

            counts += taxid_counts[tax][i][1]
            bases += taxid_counts[tax][i][3]
            correct += taxid_counts[tax][i][2]
            total_bases += taxid_counts[tax][i][3]
            subj_len += taxid_counts[tax][i][4]

        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len) * 100, 2)
        total_markers = len(taxid_seqs[tax])
        marker_percentage = round(mc / total_markers * 100, 2)
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]

        taxid_len = taxid_genelen[tax]
        rpkg = counts / (taxid_len / 1000)

        if tax not in seen_taxids:
            seen_taxids.append(tax)

        taxon_coverage[tax] = [
            mc, counts, total_bases, marker_percentage, overall_coverage,
            percent_identity, subj_len, buscos, taxid_len, total_markers
        ]

    #create tree structure for all observed taxids

    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    tree_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(tree_taxids, intermediate_nodes=True)
    full_taxid_lineage = [node.name for node in full_tree.traverse()]

    #full_seq_taxids: {taxid: [[specific buscos], specific count, specific + inherited count]}
    full_seq_taxids = {}
    for line in open(files.inherited_markers):
        line = line.strip('\n')
        taxid = line.split('\t')[0]
        if taxid in full_taxid_lineage:
            buscos = []
            for seq in line.split('\t')[1].split(','):
                if len(re.findall('-\d*at\d*-', seq)) > 0:
                    busco = re.findall('-\d*at\d*-', seq)[0].strip('-')
                    if busco not in buscos:
                        buscos.append(busco)

            specific_count = len(line.split('\t')[1].split(','))
            sp_and_inherited_count = len(line.split('\t')[2].split(','))

            full_seq_taxids[taxid] = [
                buscos, specific_count, sp_and_inherited_count
            ]

    #write full table
    marker_sorted = sorted(taxon_coverage.keys(),
                           reverse=True,
                           key=lambda x: taxon_coverage[x][3])

    dest = open(files.alltab, 'w')
    dest.write(
        "Name\tTaxid\tRank\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\tAmount of marker length in EukDetect db\n"
    )
    for tax in marker_sorted:
        rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]

        if rank == "no rank":
            #parent rank
            parent = ncbi.get_lineage(tax)[-2]
            rank = [
                ncbi.get_rank([parent])[e] for e in ncbi.get_rank([parent])
            ][0]

        mc = taxon_coverage[tax][0]
        counts = taxon_coverage[tax][1]
        marker_percentage = taxon_coverage[tax][3]
        overall_coverage = taxon_coverage[tax][4]
        percent_identity = taxon_coverage[tax][5]
        total_marker_len = taxon_coverage[tax][6]
        blen = taxid_genelen[tax]
        dest.write(name + '\t' + str(tax) + '\t' + rank + '\t' + str(mc) +
                   '\t' + str(counts) + '\t' + str(marker_percentage) + '%\t' +
                   str(overall_coverage) + '%\t' + str(percent_identity) +
                   '%\t' + str(blen) + '\n')
    dest.close()

    #determine primary and secondary hits
    #if MRCA is at the level of genus, consider whether one should be primary or secondary by looking at buscos
    primary = {}
    secondary = {}
    genus_secondary_hits = {}  #structure: {genus: [secondary_hit_taxid]}

    for g in genuses:
        if len(genuses[g]) > 1:  #multiple species in same genus
            taxids = genuses[g]
            reads = [taxon_coverage[taxid][1] for taxid in taxids]
            bases = [taxon_coverage[taxid][2] for taxid in taxids]

            #if one has more reads and more bases than all others, it is primary, others are secondary
            maxreads = max(reads)
            maxbases = max(bases)
            ptaxids = []

            if (reads.count(maxreads) == 1 and bases.count(maxbases) == 1)\
             and (reads.index(maxreads) == bases.index(maxbases)): #no ties, same ID
                maxtax = taxids[reads.index(maxreads)]
                primary[maxtax] = taxon_coverage[maxtax][0:5]
                ptaxids.append(maxtax)
            #ptaxids.append(taxids[reads.index(maxreads)])
            #primary[ptaxid] = taxon_coverage[ptaxid][0:5]
            #p_buscos = full_seq_taxids[ptaxid][0]
            else:
                for t in taxids:
                    if taxon_coverage[t][1] == maxreads or taxon_coverage[t][
                            2] == maxbases:
                        ptaxids.append(t)
                        primary[t] = taxon_coverage[t][0:5]

            unsorted_ataxids = [t for t in taxids if t not in ptaxids]
            ataxids = sorted(unsorted_ataxids,
                             key=lambda x: taxon_coverage[x][1],
                             reverse=True)
            for ataxid in ataxids:

                is_secondary = False
                for ptaxid in primary:
                    p_buscos = [b for b in full_seq_taxids[ptaxid][0]]
                    a_buscos = taxon_coverage[ataxid][7]
                    a_remain = [b for b in a_buscos if b in p_buscos]

                    if len(a_remain) > 0:
                        a_above = []
                        for b in a_remain:

                            #it may not be a hit for the other one! check first
                            #check that the pid for this hit is lower
                            apid = [
                                seq[6] for seq in taxid_counts[ataxid]
                                if seq[7] == b
                            ]
                            ppid = [
                                seq[6] for seq in taxid_counts[ptaxid]
                                if seq[7] == b
                            ]

                            if len(ppid) > 0 and apid[0] >= ppid[0]:
                                a_above.append(b)
                            elif len(ppid) == 0:
                                a_above.append(b)
                        #if a_buscos is fewer than 5, all must be correct
                        #print(a_above)
                        if len(a_buscos) < 5:
                            if len(a_above) < len(a_buscos):
                                is_secondary = True
                        else:
                            if len(a_above) <= len(
                                    a_buscos
                            ) / 2:  #change: alt hit has to be half or busco hits being above
                                is_secondary = True
                    else:
                        is_secondary = True
                if is_secondary:
                    secondary[ataxid] = taxon_coverage[ataxid][0:5] + [ptaxid]
                    genus = str(g)
                    if genus not in genus_secondary_hits:
                        genus_secondary_hits[genus] = []
                    genus_secondary_hits[genus].append(ataxid)
                    #secondary_hit_reads[g].append([ataxid, taxon_coverage[ataxid][1], taxid_genelen[ataxid]])
                else:
                    primary[ataxid] = taxon_coverage[ataxid][0:5]
        else:  #primary
            taxid = genuses[g][0]
            primary[taxid] = taxon_coverage[taxid][0:5]

    #add anything else
    for t in above_species:
        primary[t] = taxon_coverage[t][0:5]

    primary_sorted = sorted(primary.keys(),
                            reverse=True,
                            key=lambda x: primary[x][3])
    #secondary_sorted = sorted(secondary.keys(), reverse=True, key=lambda x: secondary[x][3])

    filter_passing_taxids = []

    for tax in primary_sorted:
        rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
        if rank == "no rank":
            prev = ncbi.get_lineage(tax)[-1]
            prevrank = [
                ncbi.get_rank([prev])[e] for e in ncbi.get_rank([prev])
            ][0]
            if prevrank == "species":
                rank = "species"
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        mc = taxon_coverage[tax][0]
        counts = taxon_coverage[tax][1]
        marker_percentage = taxon_coverage[tax][3]
        overall_coverage = taxon_coverage[tax][4]
        percent_identity = taxon_coverage[tax][5]

        #filter
        if int(mc) >= 2 and int(counts) >= 4:
            filter_passing_taxids.append(tax)

    #close if no filter passing taxids
    if len(filter_passing_taxids) == 0:
        message = "No taxa passing filter requirements."
        #print(message)

        #still have to write stuff
        f = open(files.primarytab, 'w')
        f.write(message + '\n')
        f.close()
        f = open(files.eukfrac, 'w')
        f.write(message + '\n')
        f.close()
        sys.exit()

    #create NCBI taxon tree of observed taxa + extend to cellular_org
    tree = ncbi.get_topology(filter_passing_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    primary_tree_taxids = [int(e) for e in filter_passing_taxids] + lineage
    primary_tree = ncbi.get_topology(primary_tree_taxids,
                                     intermediate_nodes=True)

    orphan_children = []

    #phylum class order family genus species

    taxid_lendenoms = {
    }  #for all species, get full marker possibilities, for higher rank, get just what's specific

    #find counts of seqs for internal nodes
    relab_levels = {
        'species': [],
        'genus': [],
        'family': [],
        'order': [],
        'class': [],
        'phylum': []
    }
    ordered_labels = ["phylum", "class", "order", "family", "genus", "species"]

    lineages = {}

    #pre-add secondary hits to each genus
    for g in genus_secondary_hits:
        for s in genus_secondary_hits[g]:
            for seq in taxid_counts[s]:
                #if g not in taxid_lendenoms:
                #	taxid_lendenoms[g] = 0
                #taxid_lendenoms[g] += seq[4]
                if g not in taxid_counts:
                    taxid_counts[g] = []
                taxid_counts[g].append(seq)

    #calculate seqs and seqlens for each taxonomic node. this goes top-down
    for node in primary_tree.traverse():

        #get lineage name
        lin_name = ""
        currname = [
            ncbi.get_taxid_translator([node.name])[e]
            for e in ncbi.get_taxid_translator([node.name])
        ][0]
        lineage = ncbi.get_lineage(node.name)
        names = ncbi.get_taxid_translator(ncbi.get_lineage(node.name))
        ranks = ncbi.get_rank(ncbi.get_lineage(node.name))
        ranks_rev = {ranks[e]: e for e in ranks}

        #print(lineage)
        #print(ranks)
        #print(ranks_rev)
        prev_rank = ranks[lineage[-2]]

        for i in ordered_labels:
            if i in ranks_rev:
                lin_name += i + "-" + names[ranks_rev[i]] + "|"
        lin_name = lin_name.strip('|').replace(' ', "_")
        lineages[node.name] = lin_name

        #init taxid_lendenoms and taxid_counts if does not exist
        if node.name not in taxid_lendenoms:
            taxid_lendenoms[node.name] = 0
        if node.name not in taxid_counts:
            taxid_counts[node.name] = []

        rank = [
            ncbi.get_rank([node.name])[e] for e in ncbi.get_rank([node.name])
        ][0]

        #if node.is_leaf() == False and rank != "species":
        #if rank != "species":
        if rank in relab_levels:
            relab_levels[rank].append(node.name)

        if rank != "species" and prev_rank != "species":  #if not a species or a strain, add individuals
            #add indiv seqs
            for seq in taxid_counts[node.name]:
                taxid_lendenoms[node.name] += seq[4]

        if (rank == "species"
                or prev_rank == "species") and node.name in taxid_genelen:
            taxid_lendenoms[node.name] += taxid_genelen[node.name]

        for desc in node.iter_descendants():
            if desc.name in taxid_counts:
                descrank = [
                    ncbi.get_rank([desc.name])[e]
                    for e in ncbi.get_rank([desc.name])
                ][0]

                dlineage = ncbi.get_lineage(node.name)
                dnames = ncbi.get_taxid_translator(ncbi.get_lineage(node.name))
                dranks = ncbi.get_rank(ncbi.get_lineage(node.name))
                d_prev_rank = dranks[dlineage[-2]]

                if descrank == "species" or d_prev_rank == "species":
                    taxid_lendenoms[node.name] += taxid_genelen[
                        desc.name]  #if sp add full markers
                else:
                    for seq in taxid_counts[desc.name]:
                        taxid_lendenoms[node.name] += seq[
                            4]  #if not sp add submarkers

                for seq in taxid_counts[desc.name]:
                    if seq not in taxid_counts[node.name]:
                        taxid_counts[node.name].append(seq)
        #elif node.is_leaf() == False and rank == "species":
        #stuff
    #		x = 0
    #	else: #has to be a strain?

    #add full seq since is species
    #if node.name in taxid_genelen: #avoids case where there is a strain without dedicated taxid

    #		taxid_lendenoms[node.name] += taxid_genelen[node.name]
    #		relab_levels['species'].append(node.name)
    #		if node.name not in taxid_counts:
    #			orphan_children.append(node.name)

    #determine if all hits have all levels

    levels_to_remove = []
    for tax in filter_passing_taxids:
        lin = lineages[tax]
        groups = [l.split('-')[0] for l in lin.split('|')]
        levels = ordered_labels[0:len(groups)]

        if levels != groups:
            #for g in groups:
            for l in levels:
                if l not in groups:
                    if l not in levels_to_remove:
                        levels_to_remove.append(l)

    for l in levels_to_remove:
        relab_levels.pop(l)

    #calculate relabs for each level
    relabs = {}  #relabs[taxid] = [reads, amt_marker_sequence, rpks, eukfrac]
    for group in relab_levels:
        sum_rpks = 0
        for tax in relab_levels[group]:
            reads = 0
            for seq in taxid_counts[tax]:
                reads += seq[1]
            amt_marker_sequence = taxid_lendenoms[tax]
            rpks = reads / (amt_marker_sequence / 1000)
            sum_rpks += rpks
            relabs[tax] = [reads, amt_marker_sequence, rpks]
        for tax in relab_levels[group]:
            eukfrac = (relabs[tax][2] / sum_rpks) * 100
            relabs[tax].append(eukfrac)

    dest = open(files.primarytab, 'w')
    dest.write(
        "Name\tRank\tLineage\tTaxid\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"
    )
    for tax in filter_passing_taxids:
        if tax in lineages:
            lin = lineages[tax]
        else:
            lin = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]

        rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]

        if rank == "no rank":
            #parent rank
            parent = ncbi.get_lineage(tax)[-2]
            prevrank = [
                ncbi.get_rank([parent])[e] for e in ncbi.get_rank([parent])
            ][0]
            if prevrank == "species":
                rank = "species"

        #lin = lineages[tax]
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        mc = taxon_coverage[tax][0]
        counts = taxon_coverage[tax][1]
        marker_percentage = taxon_coverage[tax][3]
        overall_coverage = taxon_coverage[tax][4]
        percent_identity = taxon_coverage[tax][5]

        #filter

        dest.write(name + '\t' + rank + '\t' + lin + '\t' + str(tax) + '\t' +
                   str(mc) + '\t' + str(counts) + '\t' +
                   str(marker_percentage) + '%\t' + str(overall_coverage) +
                   '%\t' + str(percent_identity) + '%\n')
    dest.close()

    #table with relative abundance for all levels
    dest = open(files.eukfrac, 'w')
    dest.write(
        "Lineage\tRank\tName\tTaxID\tRPKS\tEuk_fraction\tReads\tAmt_marker_sequence\n"
    )

    for node in primary_tree.traverse("preorder"):
        rank = [
            ncbi.get_rank([node.name])[e] for e in ncbi.get_rank([node.name])
        ][0]
        name = [
            ncbi.get_taxid_translator([node.name])[e]
            for e in ncbi.get_taxid_translator([node.name])
        ][0]
        if node.name in lineages:
            lin = lineages[node.name]
        else:
            lin = [
                ncbi.get_rank([node.name])[e]
                for e in ncbi.get_rank([node.name])
            ][0]

        if rank == "no rank" and node.is_leaf():
            continue  #is strain, have already printed species at this point
            #rank = "species"
        if node.name in relabs:
            rpks = round(relabs[node.name][2], 4)
            eukfrac = round(relabs[node.name][3], 4)
            reads = relabs[node.name][0]
            markerseq = relabs[node.name][1]
            dest.write(lin + '\t' + rank + '\t' + name + '\t' + node.name +
                       '\t' + str(rpks) + '\t' + str(eukfrac) + '\t' +
                       str(reads) + '\t' + str(markerseq) + '\n')
    dest.close()
コード例 #17
0
ファイル: efecht.py プロジェクト: GuiSeSanz/myScripts
# Toggle: set to True to resolve each gene's organism name to an NCBI taxid
# and annotate its full lineage via ete3.
NCBI = False
if NCBI :
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    # First run only: refresh the local taxonomy dump (slow, network access).
    #ncbi.update_taxonomy_database()
    taxIDlist=[]
    for gene in geneList:
        name2taxID = ncbi.get_name_translator([gene.organism])
        # get_name_translator returns {name: [taxid, ...]}; take the first hit.
        gene.taxID = name2taxID[gene.organism][0]
        for i in ncbi.get_lineage(gene.taxID):
            
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)
        
    #taxid2name = ncbi.get_taxid_translator([9606, 9443])
    #print taxid2name
# Toggle: set to True to print the pruned NCBI topology of the collected taxids.
# NOTE(review): this branch also requires NCBI == True, otherwise `ncbi` and
# `taxIDlist` are undefined here; the boolean flag is then clobbered by the
# tree object itself — consider a separately named flag.
tree = False
if tree :    
    tree = ncbi.get_topology(taxIDlist)
    # Python 2 print statement (this file mixes Py2 and Py3 snippets).
    print tree.get_ascii(attributes=["sci_name", "rank"])
    




    
    
    
    
コード例 #18
0
ファイル: models.py プロジェクト: GuiSeSanz/myScripts
def model2Tree(modelIDList):
    """Build the pruned NCBI taxonomy topology for a list of taxids.

    Parameters
    ----------
    modelIDList : list
        NCBI taxonomy IDs (ints or digit strings) to keep in the tree.

    Returns
    -------
    The ete3 topology tree connecting the given taxids.  As a side effect,
    an ASCII rendering (scientific name + rank per node) is printed.
    """
    ncbi = NCBITaxa()
    tree = ncbi.get_topology(modelIDList)
    # Parenthesized so this is a valid print *statement* under Python 2 and a
    # print *function* call under Python 3 — same output either way.  The
    # original bare `print expr` form was Python-2-only.
    print(tree.get_ascii(attributes=["sci_name", "rank"]))
    return tree
コード例 #19
0
        ncbiID = OMA2ncbiID[ID]
        confirmFamID_ncbiIDs[famID].append(ncbiID)

# Map each MRCA node name -> the list of family IDs whose members coalesce there.
fam_commonAncestorNode = defaultdict(list)
for fam, ncbis in confirmFamID_ncbiIDs.items():
    # Per-family working lists (taxa names and ASCII tree are computed for
    # parity with the original code even though nothing reads them later).
    taxa_list = [str(ncbiID2taxa[member]) for member in ncbis]
    ncbi_list = [int(member) for member in ncbis]
    ncbi_ancestor_list = list(ncbis)
    tree = ncbi.get_topology(ncbi_list)
    taxa_tree = tree.get_ascii(attributes=["sci_name"])
    ancestor = tree.get_common_ancestor(ncbi_ancestor_list)
    node_list = [node.sci_name for node in tree.traverse("levelorder")]
    # Level-order traversal yields the root first, i.e. the family's MRCA.
    MRCA_node = node_list[0]
    fam_commonAncestorNode[MRCA_node].append(fam)

# JSON round-trip collapses the defaultdict into a plain dict of plain lists.
MRCA_dict = ast.literal_eval(json.dumps(fam_commonAncestorNode))
# Count how many families share each MRCA node.
final_dict = {ancestor_name: len(fams) for ancestor_name, fams in MRCA_dict.items()}

# One-row frame of counts, columns sorted by count (ascending).
df = pd.DataFrame(final_dict, index=[0])
sorted_df = df.sort_values(df.last_valid_index(), axis=1)
コード例 #20
0
# Paths to the AGORA reference model and its taxonomy table (one row per model,
# sorted by organism name for stable ordering).
ref_model_file = '/home/acabbia/Documents/Muscle_Model/models/AGORA_universe.xml'
models_taxonomy = pd.read_csv(
    '/home/acabbia/Documents/Muscle_Model/GSMM-distance/agora_taxonomy.tsv',
    sep='\t').sort_values(by='organism')

#%%
#####
# MAKE REFERENCE NCBI TAXONOMY TREE
####
from ete3 import NCBITaxa

ncbi = NCBITaxa()
# NOTE(review): this re-downloads the full NCBI taxonomy dump on every run;
# consider guarding it so it only refreshes a stale local cache.
ncbi.update_taxonomy_database()

# Prune the NCBI taxonomy down to the models' taxids (rows without an id dropped).
NCBI_ID = list(models_taxonomy['ncbiid'].dropna().values)
NCBI_tree = ncbi.get_topology(NCBI_ID)

# Ugly way to convert "phyloTree" obj into "Tree" obj for comparison with other
# trees: serialize to newick, then re-read as a plain Tree.
# NOTE(review): `Tree` is presumably imported from ete3 elsewhere in this file — confirm.
NCBI_tree.write(
    format=1,
    outfile="/home/acabbia/Documents/Muscle_Model/GSMM-distance/NCBI_tree.nw")
NCBI_tree = Tree(
    "/home/acabbia/Documents/Muscle_Model/GSMM-distance/NCBI_tree.nw",
    format=1)

#%%
#####
# MAKE GK TREE
####

graphList = []
コード例 #21
0
def main(argv):
	"""Summarize and filter EukDetect alignments by NCBI taxid.

	Parses the read-counts-and-mismatches table, aggregates per-taxid marker
	statistics, designates primary vs. secondary hits within each genus,
	applies the minimum-evidence filter (>= 2 markers and >= 4 reads), and
	writes three outputs: an all-hits table (--alltab), a filtered table
	(--primarytab) and an indented taxonomy summary (--primarytax).

	Exits early via sys.exit (after writing placeholder messages to the
	output files) when the read count file is empty or no taxa pass the
	filter.
	"""

	parser = argparse.ArgumentParser(
		description=textwrap.dedent("""\
			Summarize and filter alignments by taxid.

			Required arguments are --dbfile, --inherited_markers, --taxid_link, --readcounts, --primarytax, --primarytab, --alltab

			"""),
		formatter_class = argparse.RawDescriptionHelpFormatter
		)

	parser.add_argument(
		"--dbfile",
		type=str,
		action="store",
		dest="dbfile",
		help= "Eukdetect database folder",
		required=True
		)


	# NOTE(review): the help text below duplicates --dbfile's; it presumably
	# should describe the inherited-markers file instead.
	parser.add_argument(
		"--inherited_markers",
		type=str,
		action="store",
		dest="inherited_markers",
		help= "Eukdetect database folder",
		required=True
		)

	# NOTE(review): duplicated help text again — should describe the
	# marker-to-taxid link file.
	parser.add_argument(
		"--taxid_link",
		type=str,
		action="store",
		dest="taxid_link",
		help= "Eukdetect database folder",
		required=True
		)


	parser.add_argument(
		"--readcounts",
		type=str,
		action="store",
		dest="readcounts",
		help= "Read counts and mismatches file.",
		required=True
		)

	parser.add_argument(
		"--primarytax",
		type=str,
		action="store",
		dest="primarytax",
		help= "Taxonomy output of filtered hits",
		required=True
		)


	parser.add_argument(
		"--primarytab",
		type=str,
		action="store",
		dest="primarytab",
		help= "Table output of filtered hits.",
		required=True
		)

	parser.add_argument(
		"--alltab",
		type=str,
		action="store",
		dest="alltab",
		help= "Table output of all hits.",
		required=True
		)

	files = parser.parse_args()

	#initialize NCBI taxdb
	ncbi = NCBITaxa(files.dbfile)

	#create 2 dicts for ease of lookup

	#correspondence between taxid & marker gene name
	#taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
	#seq_taxids = {seq: taxid, seq:taxid} Save every seq

	taxid_seqs = {}
	seq_taxids = {}
	for line in open(files.taxid_link):
		line = line.strip('\n')
		taxid = line.split('\t')[1]
		if taxid not in taxid_seqs:
			taxid_seqs[taxid] = []
		seq = line.split('\t')[0]
		taxid_seqs[taxid].append(seq)
		seq_taxids[seq] = taxid



	#save contents of read_counts_and_mismatches file as dict per observed taxid
	#save observed genuses
	#taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage, pid, busco]]}
	taxid_counts = {}

	counter = 0
	countfile = open(files.readcounts)
	countfile.readline()

	genuses = {}
	above_species = []
	#genuses: {genus:[taxid, taxid, taxid]}

	for line in countfile:
		counter += 1
		line = line.strip('\n')
		seq = line.split('\t')[0]
		count = int(line.split('\t')[1])
		correct_bases = int(line.split('\t')[2])
		incorrect_bases = int(line.split('\t')[3])
		total_bases = int(line.split('\t')[4])
		subjlen = int(line.split('\t')[5])
		coverage = float(line.split('\t')[6])
		pid = float(line.split('\t')[7])
		taxid = seq_taxids[seq]

		# BUSCO ortholog id is embedded in the marker name as "-<n>at<m>-";
		# collapsed marker sets carry no single BUSCO id.
		if "Collapse" not in seq:
			busco = re.findall('-\d*at\d*-', seq)[0].strip('-')
		else:
			busco = "Collapsed"
		
		#determine genus
		lineage = ncbi.get_lineage(int(taxid))
		ranks = {value: key for (key, value) in ncbi.get_rank(lineage).items()}
		#lowest = list(ncbi.get_rank([lineage[-1]]).values())[0]
		if 'genus' in ranks and 'Collapse' not in seq and "species" in ranks: #lowest != "genus": #dont filter if it's at the genus level
			genus = ranks['genus']
			if genus not in genuses:
				genuses[genus] = []
			if taxid not in genuses[genus]:
				genuses[genus].append(taxid)
		elif "SSCollapse" not in seq: #don't add anything that's got SSCollapse in it
			above_species.append(taxid)

		#save info per sequence in seq_counts dict
		#seq_counts[seq] = [count, correct_bases, total_bases, subjlen, coverage, pid, busco]

		if taxid not in taxid_counts:
			taxid_counts[taxid] = []
			#find the genus if not a spcollapsed gene

		taxid_counts[taxid].append([seq, 
									count, 
									correct_bases, 
									total_bases, 
									subjlen, 
									coverage,
									pid,
									busco])

	if counter == 0:
		message = "Empty read count file. Likely no aligned reads in sample."
		#print(message)
		#still have to write stuff
		f = open(files.primarytax, 'w')
		f.write(message + '\n')
		f.close()
		# NOTE(review): the alltab handle below is rebound to primarytab
		# without an intervening f.close(); the alltab file is only flushed
		# because sys.exit() ends the process.
		f = open(files.alltab, 'w')
		f.write(message + '\n')
		f = open(files.primarytab, 'w')
		f.write(message + '\n')
		f.close()
		sys.exit()
	countfile.close()

	#done parsing read_counts_and_mismatches file

	#calculate stats for each observed taxid
	taxon_coverage = {}
	
	#taxon_coverage[taxon] = [observed_markers, 
	#readcounts, 
	#total_bases, 
	#percentage_markers, 
	#marker_coverage, 
	#percent_id, 
	#buscos]

	seen_taxids = []

	for tax in taxid_counts:
		mc = len(taxid_counts[tax])
		counts = 0
		bases = 0
		correct = 0
		total_bases = 0
		subj_len = 0
		buscos = []
		for i in range(0, len(taxid_counts[tax])):

			busco = taxid_counts[tax][i][-1]
			if len(busco) > 1:
				buscos.append(busco)

			counts += taxid_counts[tax][i][1]
			bases += taxid_counts[tax][i][3]
			correct += taxid_counts[tax][i][2]
			total_bases += taxid_counts[tax][i][3]
			subj_len += taxid_counts[tax][i][4]

		percent_identity = round((correct / total_bases) * 100, 2)
		overall_coverage = round((total_bases / subj_len ) * 100, 2)
		total_markers = len(taxid_seqs[tax])
		marker_percentage = round( mc / total_markers * 100, 2)
		name = [ncbi.get_taxid_translator([tax])[e] for e in ncbi.get_taxid_translator([tax])][0]

		if tax not in seen_taxids:
			seen_taxids.append(tax)

		taxon_coverage[tax] = [mc, 
								counts, 
								total_bases, 
								marker_percentage, 
								overall_coverage, 
								percent_identity, 
								buscos]
	#create tree structure for all observed taxids

	# Extend the observed-taxid topology up to the root lineage so internal
	# (ancestral) nodes are present in the tree.
	tree = ncbi.get_topology(seen_taxids)
	tree_root = tree.get_tree_root().name
	lineage = ncbi.get_lineage(tree_root)
	tree_taxids = seen_taxids + lineage
	full_tree = ncbi.get_topology(tree_taxids, intermediate_nodes=True)
	full_taxid_lineage = [node.name for node in full_tree.traverse()]

	#full_seq_taxids: {taxid: [[specific buscos], specific count, specific + inherited count]}
	full_seq_taxids = {}
	for line in open(files.inherited_markers):
		line = line.strip('\n')
		taxid = line.split('\t')[0]
		if taxid in full_taxid_lineage:
			buscos = []
			for seq in line.split('\t')[1].split(','):
				if len(re.findall('-\d*at\d*-', seq)) > 0:
					busco = re.findall('-\d*at\d*-',seq)[0].strip('-')
					if busco not in buscos:
						buscos.append(busco)

			specific_count = len(line.split('\t')[1].split(','))
			sp_and_inherited_count = len(line.split('\t')[2].split(','))

			full_seq_taxids[taxid] = [buscos, specific_count, sp_and_inherited_count]

	#determine primary and secondary hits
	#if MRCA is at the level of genus, consider whether one should be primary or secondary by looking at buscos
	primary = {}
	secondary = {}


	for g in genuses:
		if len(genuses[g]) > 1: #multiple species in same genus
			taxids = genuses[g]
			reads = [taxon_coverage[taxid][1] for taxid in taxids]
			bases = [taxon_coverage[taxid][2] for taxid in taxids]

			#if one has more reads and more bases than all others, it is primary, others are secondary
			maxreads = max(reads)
			maxbases = max(bases)
			ptaxids = []

			if (reads.count(maxreads) == 1 and bases.count(maxbases) == 1)\
			 and (reads.index(maxreads) == bases.index(maxbases)): #no ties, same ID
			 	maxtax = taxids[reads.index(maxreads)]
			 	primary[maxtax] = taxon_coverage[maxtax][0:5]
			 	ptaxids.append(maxtax)
				#ptaxids.append(taxids[reads.index(maxreads)])
				#primary[ptaxid] = taxon_coverage[ptaxid][0:5]
				#p_buscos = full_seq_taxids[ptaxid][0]
			else:
				# Ties: every taxid matching either maximum becomes primary.
				for t in taxids: 
					if taxon_coverage[t][1] == maxreads or taxon_coverage[t][2] == maxbases:
						ptaxids.append(t)
						primary[t] = taxon_coverage[t][0:5]

			unsorted_ataxids = [t for t in taxids if t not in ptaxids]
			ataxids = sorted(unsorted_ataxids, key = lambda x: taxon_coverage[x][1], reverse = True)
			for ataxid in ataxids:
				#print(ataxid)
				is_secondary = False
				for ptaxid in primary:
					p_buscos = [b for b in full_seq_taxids[ptaxid][0]]
					a_buscos = taxon_coverage[ataxid][-1]
					a_remain = [b for b in a_buscos if b in p_buscos]
					#print(a_buscos)
					#print(a_remain)
					if len(a_remain) > 0:
						a_above = []
						for b in a_remain:

							#it may not be a hit for the other one! check first
							#check that the pid for this hit is lower
							apid = [seq[6] for seq in taxid_counts[ataxid] if seq[7] == b]
							ppid = [seq[6] for seq in taxid_counts[ptaxid] if seq[7] == b]

							if len(ppid) > 0 and apid[0] >= ppid[0]:
								a_above.append(b)
							elif len(ppid) == 0:
								a_above.append(b)
						#if a_buscos is fewer than 5, all must be correct
						#print(a_above)
						if len(a_buscos) < 5:
							if len(a_above) < len(a_buscos):
								is_secondary = True
						else:
							if len(a_above) <= len(a_buscos)/2: #change: alt hit has to be half or busco hits being above
								is_secondary = True
					else:
						is_secondary = True
				# NOTE(review): `ptaxid` here is whatever primary taxid the
				# inner loop visited *last*, not necessarily the primary that
				# made this hit secondary — confirm this is intended.
				if is_secondary:
					secondary[ataxid] = taxon_coverage[ataxid][0:5] + [ptaxid]
				else:
					primary[ataxid] = taxon_coverage[ataxid][0:5]
		else: #primary
			taxid = genuses[g][0]
			primary[taxid] = taxon_coverage[taxid][0:5]

	#add anything else
	for t in above_species:
		primary[t] = taxon_coverage[t][0:5]

	#write full table
	marker_sorted = sorted(taxon_coverage.keys(), reverse = True, key = lambda x: taxon_coverage[x][3])

	dest = open(files.alltab, 'w')
	dest.write("Name\tTaxid\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n")
	for tax in marker_sorted:
		rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
		name = [ncbi.get_taxid_translator([tax])[e] for e in ncbi.get_taxid_translator([tax])][0]
		mc = taxon_coverage[tax][0]
		counts = taxon_coverage[tax][1]
		marker_percentage = taxon_coverage[tax][3]
		overall_coverage = taxon_coverage[tax][4]
		percent_identity = taxon_coverage[tax][5]
		dest.write(name + '\t'
			+ str(tax) + '\t'
			+ str(mc) + '\t' 
			+ str(counts) + '\t' 
			+ str(marker_percentage) + '%\t'
			+ str(overall_coverage) + '%\t'
			+ str(percent_identity) + '%\n')
	dest.close()

	dest = open(files.primarytab, 'w')
	dest.write("Name\tTaxid\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n")
	#TODO: implement filters

	primary_sorted = sorted(primary.keys(), reverse = True, key = lambda x: primary[x][3])
	#secondary_sorted = sorted(secondary.keys(), reverse=True, key=lambda x: secondary[x][3])

	filter_passing_taxids = []

	for tax in primary_sorted:
		rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
		name = [ncbi.get_taxid_translator([tax])[e] for e in ncbi.get_taxid_translator([tax])][0]
		mc = taxon_coverage[tax][0]
		counts = taxon_coverage[tax][1]
		marker_percentage = taxon_coverage[tax][3]
		overall_coverage = taxon_coverage[tax][4]
		percent_identity = taxon_coverage[tax][5]
		#filter: keep taxa with at least 2 observed markers and 4 reads
		if int(mc) >= 2 and int(counts) >= 4:
			filter_passing_taxids.append(tax)
			dest.write(name + '\t'
				+ str(tax) + '\t'
				+ str(mc) + '\t' 
				+ str(counts) + '\t' 
				+ str(marker_percentage) + '%\t'
				+ str(overall_coverage) + '%\t'
				+ str(percent_identity) + '%\n')
	dest.close()

	#close if no filter passing taxids
	if len(filter_passing_taxids) == 0:
		message = "No taxa passing filter requirements."
		#print(message)

		#still have to write stuff
		f = open(files.primarytab, 'w')
		f.write(message + '\n')
		f.close()
		f = open(files.primarytax, 'w')
		f.write(message + '\n')
		f.close()
		sys.exit()

	#create NCBI taxon tree of observed taxa + extend to cellular_org
	tree = ncbi.get_topology(filter_passing_taxids)
	tree_root = tree.get_tree_root().name
	lineage = ncbi.get_lineage(tree_root)
	primary_tree_taxids = [int(e) for e in filter_passing_taxids] + lineage
	primary_tree = ncbi.get_topology(primary_tree_taxids, intermediate_nodes=True)
	#write the tree	structure to file

	orphan_children = []

	#find counts of seqs for internal nodes
	# Internal nodes accumulate the (deduplicated) sequence records of all
	# descendants; leaves with no direct counts are recorded as orphans.
	for node in full_tree.traverse():
		if node.is_leaf() == False:
			if node.name not in taxid_counts:
				taxid_counts[node.name] = []
			for desc in node.iter_descendants():
				if desc.name in taxid_counts:
					for seq in taxid_counts[desc.name]:
						if seq not in taxid_counts[node.name]:
							taxid_counts[node.name].append(seq)
		else:
			if node.name not in taxid_counts:
				orphan_children.append(node.name)


	#create new tree of filter passing hits


	level_counts = []
	currspaces = 0
	currparent = ''
	seen_parents = {}

	dest = open(files.primarytax, 'w')
	dest.write("Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n")
	# Preorder walk; `currspaces` tracks indentation so each taxonomic level
	# is nested 4 spaces deeper than its parent in the output.
	for node in primary_tree.traverse("preorder"):

		if node.name not in orphan_children and node.name in full_seq_taxids:
			rank = [ncbi.get_rank([node.name])[e] for e in ncbi.get_rank([node.name])][0]
			name = [ncbi.get_taxid_translator([node.name])[e] for e in ncbi.get_taxid_translator([node.name])][0]
			if node.is_root():
				currspaces = 0
			else:
				if currparent == '':
					currparent = node.up.name
					currspaces += 4
				else:
					if currparent != node.up.name:
						currparent = node.up.name
						if currparent in seen_parents:
							currspaces = seen_parents[currparent]
						else:
							currspaces += 4
							seen_parents[currparent] = currspaces
			if node.name in taxon_coverage:
				pid = str(taxon_coverage[node.name][5]) + '%'
			else:
				pid = "NA"
			#total_buscos
			buscos = len(taxid_counts[str(node.name)])
			seqs = sum([b[1] for b in taxid_counts[node.name]])
			total_buscos = full_seq_taxids[node.name][2]
			percent = round((buscos/total_buscos)*100,2)
			dest.write(str(buscos) + '\t' 
				+ str(total_buscos) + "\t" 
				+ str(percent)  + '%\t' 
				+ str(pid) + '\t'
				+ str(seqs) + '\t' 
				+ rank + '\t' 
				+ ' ' * currspaces + name + '\n')
	dest.close()
コード例 #22
0
from ete3 import NCBITaxa

# NCBITaxa() downloads and caches a parsed copy of the NCBI taxonomy in
# ~/.etetoolkit/taxa.sqlite the first time it runs; this can take minutes.
ncbi = NCBITaxa()
print("ncbi.dbfile", ncbi.dbfile)

# One genus name per line in the snakemake input file.
with open(snakemake.input[0], 'r', encoding='utf8') as source:
    genus_list = source.read().strip().split('\n')

# get_name_translator returns {name: [taxid, ...]}; flatten all matches.
genus_to_taxid = ncbi.get_name_translator(genus_list)
all_taxids = []
for matches in genus_to_taxid.values():
    all_taxids.extend(matches)

tree = ncbi.get_topology(all_taxids, intermediate_nodes=True)

# Workaround for a get_ascii() quirk: without an extra attribute the taxon
# labels above genus run together.  Rendering "dist" (always 1.0 here) forces
# a separator, and the dummy "1.0," values are then rewritten as "-".
with open(snakemake.output[0], mode='w', encoding='utf8') as sink:
    ascii_tree = tree.get_ascii(attributes=["dist", "sci_name"])
    print(ascii_tree.replace('1.0,', '-'), file=sink)
コード例 #23
0
# Load the species names: each line holds one or more space-separated tokens;
# tokens are deduplicated and underscores converted back to spaces.
try:
    with open(args.taxons, 'r') as species_file:
        tokens = set()
        for raw_line in species_file:
            tokens.update(raw_line.strip().split(" "))
        listTaxa = [token.replace("_", " ") for token in tokens]
except FileNotFoundError:
    print("File does not exist")
    sys.exit(1)

# Resolve names to taxids, flattening the one-to-many translator output.
translated = ncbi.get_name_translator(listTaxa).values()
IdTaxList = [taxid for hits in translated for taxid in hits]

# taxid -> scientific-name lookup, used to relabel the leaves below.
idTaxa2names = ncbi.get_taxid_translator(IdTaxList)

# Prune the NCBI taxonomy down to the requested taxa.
tree = ncbi.get_topology(IdTaxList)

# Replace numeric taxid leaf labels with scientific names.
for leaf in tree.get_leaves():
    leaf.name = idTaxa2names[int(leaf.name)]

# Write newick (format 9 = leaf names only); spaces become underscores so the
# names stay single newick tokens.
with open(args.output, 'w') as newick_out:
    newick_out.write(tree.write(format=9).replace(" ", "_"))
コード例 #24
0
# Strip newlines from the previously opened name list file.
list_name = [x.strip('\n') for x in file_name.readlines()]

# Ranks checked from most to least specific when reporting the LCA's rank.
ref_levels = [
    'subspecies', 'species', 'genus', 'family', 'order', 'class', 'phylum',
    'superkingdom'
]
myList = []

# Resolve each name to its first matching taxid (names with no match skipped).
# NOTE(review): `.values()[0]` is Python-2-only; under Python 3 this needs
# list(...) first (this file mixes Py2 and Py3 snippets).
for linea in list_name:
    taxid_dirty_dir = (ncbi.get_name_translator([linea]))

    if len(taxid_dirty_dir) > 0:
        nude = str(taxid_dirty_dir.values()[0][0])
        myList.insert(0, nude)

# Lineage and ranks of the common ancestor of all resolved taxids.
t = ncbi.get_topology(myList, intermediate_nodes=True)
linaje = ncbi.get_lineage(t.get_common_ancestor(myList).name)
pairs = ncbi.get_rank(linaje)

# Print the name of the most specific named rank present in the LCA lineage.
flag = 0
for each_ref in ref_levels:
    if each_ref in pairs.values() and flag == 0:
        print ncbi.get_taxid_translator([
            (list(pairs.keys())[list(pairs.values()).index(each_ref)])
        ]).values()
        flag = 1

if flag == 0:
    print "Unclassified"

# Restore stdout (redirected earlier, outside this snippet).
sys.stdout = orig_stdout
コード例 #25
0
                  'steps_from_Eukaryota' + '\n')
# NOTE(review): outputFile3 is opened here but not written or closed within
# this snippet — presumably used further down in the original script; confirm.
outputFile3 = open('ontology.tab', 'w')
#Here I open the file that Matt script creates and loops in each line and get the taxids
# Each line: "<uniprot_id>\t<comma-separated taxids>".
with open('SP_by_taxa.tab', 'r') as fo:
    for line in fo:
        line = line.rstrip()
        (uniprotid, taxids) = line.split('\t')
        one_taxid = taxids.split(
            ',')  # divide the list of taxids to diff taxids 'strings'
        tax_dict[uniprotid] = one_taxid
        one_taxid_int = []
        for i in range(len(one_taxid)):
            one_taxid_int.append(int(
                one_taxid[i]))  #ete3 take a list of taxid integers
        #print(one_taxid)
        tree = ncbi.get_topology(
            one_taxid)  #creates the tree of taxids for each uniprot id
        outputFile.write(uniprotid + '\t' + tree.write(format=3) +
                         '\n')  #writing tab file of uniprot/tree_string
        tree_dict[uniprotid] = tree
        one_taxid_str = []
        for i in range(len(one_taxid_int)):
            one_taxid_str.append(str(
                one_taxid_int[i]))  # returning to list of strings again
        #print(one_taxid)

        #get the first common ancestor of each uniprotid as taxid
        # The topology's root is the LCA of this protein's taxids.
        first_common_ancestor_taxid = tree.get_tree_root()
        #print(first_common_ancestor_taxid.name)
        # Group uniprot ids by their LCA taxid in the ontology mapping.
        if first_common_ancestor_taxid.name in ontology:
            uniprotid_set = ontology[first_common_ancestor_taxid.name]
            uniprotid_set.add(uniprotid)
コード例 #26
0
#export PATH=~/anaconda_ete/bin:$PATH
from ete3 import NCBITaxa
ncbi = NCBITaxa()

####### BRAINCODE viruses taxonomy tree ########
# Read virus names (one per line), resolve to taxids, and write the pruned
# NCBI topology (sci_name + rank per node) as ASCII art.
# NOTE(review): `print >> fp, ...` is Python-2-only syntax.
fp_in = open("/PHShome/tw786/localView/overview/Tree/BRAINCODE_viruses.txt")
viruses1 = fp_in.readlines()
viruses1 = [x.strip() for x in viruses1]
viruses_taxid = ncbi.get_name_translator(viruses1)
# get_name_translator returns {name: [taxid, ...]}; keep each first match.
viruses_taxid = [x[0] for x in viruses_taxid.values()]
tree = ncbi.get_topology(viruses_taxid)
file_path = "/PHShome/tw786/localView/overview/Tree/BRAINCODE_viruses_tree.txt"
fp = open(file_path, 'w')
print >> fp, tree.get_ascii(attributes=['sci_name', 'rank'])
fp_in.close()
fp.close()
####### GTEx viruses taxonomy tree ########
# Same procedure for the GTEx virus list.
fp_in = open("/PHShome/tw786/localView/overview/Tree/GTEx_viruses.txt")
viruses2 = fp_in.readlines()
viruses2 = [x.strip() for x in viruses2]
viruses_taxid = ncbi.get_name_translator(viruses2)
viruses_taxid = [x[0] for x in viruses_taxid.values()]
tree = ncbi.get_topology(viruses_taxid)
file_path = "/PHShome/tw786/localView/overview/Tree/GTEx_viruses_tree.txt"
fp = open(file_path, 'w')
print >> fp, tree.get_ascii(attributes=['sci_name', 'rank'])
fp_in.close()
fp.close()
####### BRAINCODE + GTEx viruses taxonomy tree ########
# Deduplicated union of both lists (tree construction follows this snippet).
viruses_merge = viruses1 + viruses2
viruses_merge = list(set(viruses_merge))
コード例 #27
0
def main(argv):
    """Summarize per-taxon marker (BUSCO) read coverage and write two reports.

    Inputs are taken from sys.argv, not from the ``argv`` parameter
    (NOTE(review): ``argv`` is unused):
      sys.argv[1] - TSV: marker sequence id <TAB> taxid
      sys.argv[2] - path to the NCBI taxonomy database passed to NCBITaxa
      sys.argv[3] - TSV: taxid <TAB> specific markers <TAB> ... <TAB>
                    specific+inherited markers (comma-separated lists)
      sys.argv[4] - read-count table with one header line; columns:
                    seq, count, correct_bases, incorrect_bases, total_bases,
                    seq_length, coverage
      sys.argv[5] - output path: indented taxonomy-tree report
      sys.argv[6] - output path: flat per-taxon summary table
    """

    #read in taxonomy info for each BUSCO
    species_taxids = []  #species_taxids[marker_id] = taxid

    for line in open(sys.argv[1]):
        tax = line.split('\t')[1].strip('\n')
        if tax not in species_taxids:
            species_taxids.append(tax)

    #initialize NCBI taxdb
    ncbi = NCBITaxa(sys.argv[2])

    #create 2 dicts for ease of lookup
    #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
    #seq_taxids = {seq: taxid, seq:taxid} Save every seq
    taxid_seqs = {}
    seq_taxids = {}
    for line in open(sys.argv[1]):
        line = line.strip('\n')
        taxid = line.split('\t')[1]
        if taxid not in taxid_seqs:
            taxid_seqs[taxid] = []
        seq = line.split('\t')[0]
        taxid_seqs[taxid].append(seq)
        seq_taxids[seq] = taxid

    #iterate over idxstats file and save counts
    #seq_counts[seq] = [readcount, correct_bases, total_bases, seqlen, coverage]
    seq_counts = {}
    seen_taxids = []
    counter = 0
    countfile = open(sys.argv[4])
    countfile.readline()  # skip the header line
    for line in countfile:
        counter += 1
        line = line.strip('\n')
        seq = line.split('\t')[0]
        count = int(line.split('\t')[1])
        correct_bases = int(line.split('\t')[2])
        incorrect_bases = int(line.split('\t')[3])  # parsed but never used below
        total_bases = int(line.split('\t')[4])
        subjlen = int(line.split('\t')[5])
        coverage = float(line.split('\t')[6])
        seq_counts[seq] = [
            count, correct_bases, total_bases, subjlen, coverage
        ]
        taxid = seq_taxids[seq]
        # NOTE(review): `taxid` is a str but seen_taxids holds ints, so this
        # membership test never matches and duplicates are appended; the
        # duplicates appear harmless for get_topology but should be confirmed.
        if taxid not in seen_taxids:
            seen_taxids.append(int(taxid))

    if counter == 0:
        # No data rows beyond the header: emit placeholder reports and stop.
        message = "Empty read count file. Likely no aligned reads in sample."
        print(message)
        #still have to write stuff
        f = open(sys.argv[5], 'w')
        f.write(message + '\n')
        f.close()
        f = open(sys.argv[6], 'w')
        f.write(message + '\n')
        f.close()
        sys.exit()
    #done parsing idxstats file

    #create NCBI taxon tree of observed taxa + extend to cellular_org
    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    full_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(full_taxids, intermediate_nodes=True)

    full_seq_taxids = {
        line.split('\t')[0]: [
            line.split('\t')[1].split(','),
            line.split('\t')[-1].strip('\n').split(',')
        ]
        for line in open(sys.argv[3])
    }
    #full_seq_taxids: {taxid: [[specific buscos], [specific + inherited buscos]]}
    #determine seq counts

    #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage]]}
    taxid_counts = {}
    for seq in seq_counts:
        taxid = seq_taxids[seq]
        if taxid not in taxid_counts:
            taxid_counts[taxid] = []
        taxid_counts[taxid].append([
            seq,
            int(seq_counts[seq][0]),
            int(seq_counts[seq][1]), seq_counts[seq][2], seq_counts[seq][3],
            seq_counts[seq][4]
        ])

    #write just observed taxid seqs
    taxon_coverage = {}
    #taxon_coverage[taxon] = [observed_markers, readcounts, total_bases, percentage_markers, marker_coverage, percent_id ]
    #dest = open(sys.argv[6], 'w')
    #dest.write("Name\tNCBI_Rank\tTaxID\tObserved_markers\tRead_counts\tPercent_observed_markers\tMarker_coverage\tPercent_identity\n")

    # Aggregate per-marker counts into per-taxon totals.
    for tax in taxid_counts:
        mc = len(taxid_counts[tax])
        counts = 0
        bases = 0  # accumulated but never read afterwards
        correct = 0
        total_bases = 0
        subj_len = 0
        for i in range(0, len(taxid_counts[tax])):
            counts += taxid_counts[tax][i][1]
            bases += taxid_counts[tax][i][3]
            correct += taxid_counts[tax][i][2]
            total_bases += taxid_counts[tax][i][3]
            subj_len += taxid_counts[tax][i][4]
        # NOTE(review): raises ZeroDivisionError if total_bases or subj_len
        # is 0 for a taxon -- confirm upstream guarantees nonzero values.
        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len) * 100, 2)
        total_markers = len(taxid_seqs[tax])
        marker_percentage = round(mc / total_markers * 100, 2)
        # Single-key translator dict -> extract its only value (the name).
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        #rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
        #dest.write(name + '\t'
        #	+ rank + '\t'
        #		+ tax + '\t'
        #		+ str(mc) + '\t'
        #		+ str(counts) + '\t'
        #		+ str(marker_percentage) + '%\t'
        #		+ str(overall_coverage) + '%\t'
        #		+ str(percent_identity) + '%\n')
        taxon_coverage[tax] = [
            mc, counts, total_bases, marker_percentage, overall_coverage,
            percent_identity
        ]
    #dest.close()
    dest = open(sys.argv[6], 'w')
    dest.write(
        "Name\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"
    )
    # Sort taxa by percent of observed markers, descending.
    marker_sorted = sorted(taxon_coverage.keys(),
                           reverse=True,
                           key=lambda x: taxon_coverage[x][3])

    for tax in marker_sorted:
        rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]  # computed but not written
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        mc = taxon_coverage[tax][0]
        counts = taxon_coverage[tax][1]
        marker_percentage = taxon_coverage[tax][3]
        overall_coverage = taxon_coverage[tax][4]
        percent_identity = taxon_coverage[tax][5]
        dest.write(name + '\t' + str(mc) + '\t' + str(counts) + '\t' +
                   str(marker_percentage) + '%\t' + str(overall_coverage) +
                   '%\t' + str(percent_identity) + '%\n')

    # Leaves of the full tree that have no observed markers of their own.
    orphan_children = []

    #find counts of seqs for internal nodes
    for node in full_tree.traverse():
        if node.is_leaf() == False:
            # Internal node: inherit the union of all descendants' entries.
            if node.name not in taxid_counts:
                taxid_counts[node.name] = []
            for desc in node.iter_descendants():
                if desc.name in taxid_counts:
                    for seq in taxid_counts[desc.name]:
                        if seq not in taxid_counts[node.name]:
                            taxid_counts[node.name].append(seq)
        else:
            if node.name not in taxid_counts:
                orphan_children.append(node.name)

    #print the tree
    level_counts = []  # NOTE(review): unused
    currspaces = 0
    currparent = ''
    seen_parents = {}
    # NOTE(review): the previous `dest` (sys.argv[6]) is rebound here without
    # an explicit close(); CPython flushes on GC but it should be closed.
    dest = open(sys.argv[5], 'w')
    # NOTE(review): "Percent_Makers_Obs" typo ("Makers") is preserved because
    # downstream parsers may key on the exact header text.
    dest.write(
        "Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n"
    )
    # Preorder walk; `currspaces`/`seen_parents` track the indent level so
    # each taxon is printed nested under its parent.
    for node in full_tree.traverse("preorder"):
        if node.name not in orphan_children:
            rank = [
                ncbi.get_rank([node.name])[e]
                for e in ncbi.get_rank([node.name])
            ][0]
            name = [
                ncbi.get_taxid_translator([node.name])[e]
                for e in ncbi.get_taxid_translator([node.name])
            ][0]
            if node.is_root():
                currspaces = 0
            else:
                if currparent == '':
                    currparent = node.up.name
                    currspaces += 4
                else:
                    if currparent != node.up.name:
                        currparent = node.up.name
                        if currparent in seen_parents:
                            currspaces = seen_parents[currparent]
                        else:
                            currspaces += 4
                            seen_parents[currparent] = currspaces
            if node.name in taxon_coverage:
                pid = str(taxon_coverage[node.name][5]) + '%'
            else:
                pid = "NA"
            #total_buscos
            buscos = len(taxid_counts[node.name])
            seqs = sum([b[1] for b in taxid_counts[node.name]])
            total_buscos = len(full_seq_taxids[node.name][1])
            percent = round((buscos / total_buscos) * 100, 2)
            dest.write(
                str(buscos) + '\t' + str(total_buscos) + "\t" + str(percent) +
                '%\t' + str(pid) + '\t' + str(seqs) + '\t' + rank + '\t' +
                ' ' * currspaces + name + '\n')
    dest.close()
コード例 #28
0
ファイル: EPCNN_PS.py プロジェクト: Microbiods/WRF_EPCNN
#                             '1166016':'1905730'})

# As described in the paper, PhyloT was used to generate the tree; since
# PhyloT is not free, we offer a free way to generate it using ETE3.

raw_id = known.columns.values.tolist()  # taxon ids = column labels of `known`
ncbi = NCBITaxa()

# Alternatively, a Newick file exported from PhyloT can be loaded directly:

# import ete3
# tree=ete3.Tree("tree.txt",format=8)
# print(tree)

tree = ncbi.get_topology(raw_id)
print(tree.get_ascii(attributes=["taxid"]))

# Leaf taxids in level-order traversal; this defines the column ordering
# applied to `known` below. (FIX: dead `num = 1` counters removed and the
# append-loops rewritten as comprehensions.)
order = [node.name for node in tree.traverse(strategy='levelorder')
         if node.is_leaf()]

# Leaf taxids in post-order traversal (kept for downstream use).
postorder = [node.name for node in tree.traverse(strategy='postorder')
             if node.is_leaf()]

# Reorder the abundance table columns to follow the tree's level-order leaves.
known_Xl = known[order]
コード例 #29
0
    try:
        ncbi.update_taxonomy_database()
    except:
        pass

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

# Read species names, one per line; underscores become spaces so
# "Homo_sapiens" and "Homo sapiens" are treated the same.
with open(options.input_species_filename) as f:
    species_name = [_.strip().replace('_', ' ') for _ in f.readlines()]

# name -> [taxid, ...]; NOTE(review): a species missing from the NCBI
# database will raise KeyError in the lookup below -- confirm inputs.
name2taxid = ncbi.get_name_translator(species_name)

taxid = [name2taxid[_][0] for _ in species_name]

tree = ncbi.get_topology(taxid)

# Map taxid (as string, matching leaf names) back to the species name.
# TreeBest mode strips spaces and appends '*' to each label.
if options.treebest == "yes":
    inv_map = {str(v[0]): k.replace(" ", "") + "*" for k, v in name2taxid.items()}
else:
    inv_map = {str(v[0]): k for k, v in name2taxid.items()}


# Relabel leaves from taxids to species names.
for leaf in tree:
    leaf.name = inv_map[leaf.name]

newickTree = tree.write(format=int(options.format))

# TreeBest expects the tree to end in "root;" rather than a bare ";".
if options.treebest == "yes":
    newickTree = newickTree.rstrip(';')
    newickTree = newickTree + "root;"
コード例 #30
0
    mode = args.mode
    newick = args.newick

    if newick:
        t = PhyloTree(args.newick)      
        species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile))
        taxids = set(species2taxid.values())
    else:
        ncbi = NCBITaxa()
        taxids = set([ line.strip() for line in open(infile) ])


    if args.taxoncolors:
        taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors))

    tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True)
    tNCBI = tNCBI.search_nodes(name="2759")[0]
    ncbi.annotate_tree(tNCBI, taxid_attr="name")
    tax2node = dict([node.taxid, node] for node in tNCBI.traverse())

    if args.no_intermediate_nodes:
        for node in tNCBI.get_descendants():
            if len(node.children) == 1:
                node.delete()
        if len(tNCBI.children) == 1:
            tNCBI = tNCBI.children[0]
    
    tax2node = {}
    for node in tNCBI.traverse():
        tax2node[node.taxid] = node
        if args.taxoncolors:
コード例 #31
0
ファイル: makeETEFiles.py プロジェクト: yw595/GutMicrobiota
# Species names from the RefSeq reference-genome table (first column,
# header row skipped).
RefSeqSpecies = []
with open(
        '/home/ubuntu/MATLAB/GutMicrobiota/input/reference_genomes.txt') as f:
    next(f)  # skip header line
    for line in f:
        RefSeqSpecies.append(line.split('\t')[0])
    # FIX: removed the redundant f.close() -- the `with` block already
    # closes the file on exit.

ncbi = NCBITaxa()
# Translate every unique genus/species name to NCBI taxids
# ({name: [taxid, ...]}).
name2taxid = ncbi.get_name_translator(
    list(set(ZhangZhaoGenera + ForslundHildebrandGenera + RefSeqSpecies)))

# Build the topology over all resolved taxids, keeping intermediate nodes.
tree = ncbi.get_topology(list(
    itertools.chain.from_iterable(list(name2taxid.values()))),
                         intermediate_nodes=True)
#print(tree.get_ascii(attributes=['sci_name']), file=open('/home/ubuntu/taxonomy.txt','w'))

#print(tree.name)
# fh = open('/home/ubuntu/MATLAB/GutMicrobiota/output/writeETEFiles/closestSpecies.txt','w')
# for genus in ZhangZhaoGenera+ForslundHildebrandGenera:
#     print(genus)
#     minDist = -1
#     minDistSpecies = ''
#     for species in RefSeqSpecies:
#         genusNode = tree.search_nodes(name=str(name2taxid[genus][0]))[0]
#         speciesNode = tree.search_nodes(name=str(name2taxid[species][0]))[0]
#         dist = tree.get_distance(speciesNode, genusNode)
#         if minDist == -1:
#             minDist = dist
コード例 #32
0
def get_rank_summary_statistics(rank='phylum'):
    '''
    Summarise the NCBI taxonomy of all taxa present in eggnog.NOG_members_v451
    at the given rank.

    - Builds the rank-limited NCBI topology for the distinct taxon ids.
    - Stores taxid -> scientific name/rank labels in eggnog.taxid2label_<rank>.
    - Collapses the children of selected large clades, then counts how many
      genomes fall under each remaining leaf taxon and stores the counts in
      eggnog.leaf2n_genomes_<rank>.
    - Stores the pruned phylogeny (Newick) in the table eggnog.phylogeny.

    :param rank: NCBI rank used to prune the topology (e.g. 'phylum')
    :return: None -- all results are written to the MySQL database
    '''

    import MySQLdb
    import os
    from ete3 import NCBITaxa, Tree, TextFace, TreeStyle, StackedBarFace
    ncbi = NCBITaxa()

    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="eggnog")  # name of the data base
    cursor = conn.cursor()

    sql = 'create table if not exists eggnog.phylogeny (rank varchar(400), phylogeny TEXT)'
    cursor.execute(sql, )
    conn.commit()

    sql2 = 'CREATE table if not exists eggnog.leaf2n_genomes_%s(taxon_id INT, n_genomes INT)' % rank
    cursor.execute(sql2, )
    conn.commit()

    sql_taxid_list = 'select distinct taxon_id from eggnog.NOG_members_v451;'
    cursor.execute(sql_taxid_list, )
    taxid_list = [i[0] for i in cursor.fetchall()]

    # Topology pruned at the requested rank.
    tree = ncbi.get_topology(taxid_list, rank_limit=rank)

    taxon_id_list = [int(i.name) for i in tree.traverse("postorder")]
    taxon_id2scientific_name = ncbi.get_taxid_translator(taxon_id_list)

    sql = 'CREATE table if not exists eggnog.taxid2label_%s(taxon_id INT, scientific_name TEXT, rank TEXT)' % (
        rank)
    cursor.execute(sql, )

    taxon_id2rank = {}
    for taxon in taxon_id2scientific_name:
        ranks = ncbi.get_rank([taxon])

        try:
            r = ranks[max(ranks.keys())]
        # FIX: was a bare `except:`; catch only the expected failures
        # (max() on an empty mapping raises ValueError).
        except (ValueError, KeyError):
            r = '-'
        taxon_id2rank[taxon] = r

    for taxon in taxon_id2scientific_name:
        sql = 'insert into eggnog.taxid2label_%s values(%s, "%s", "%s")' % (
            rank, taxon, taxon_id2scientific_name[taxon], taxon_id2rank[taxon])

        cursor.execute(sql, )
    conn.commit()

    # Clades whose children are detached so they appear as single leaves.
    collapse = [
        'Opisthokonta', 'Alveolata', 'Amoebozoa', 'Stramenopiles',
        'Viridiplantae', 'Rhodophyta', 'Trypanosomatidae', 'Viruses',
        'unclassified Bacteria', 'Leptospiraceae',
        'unclassified Gammaproteobacteria', 'unclassified Alphaproteobacteria',
        'unclassified Epsilonproteobacteria',
        'unclassified Deltaproteobacteria',
        'unclassified Cyanobacteria (miscellaneous)',
        'unclassified Firmicutes sensu stricto',
        'unclassified Actinobacteria (class) (miscellaneous)',
        'unclassified Tissierellia', 'Dehalogenimonas'
    ]
    #def collapsed_leaf(node):
    #    collapse = ['Opisthokonta', 'Alveolata','Amoebozoa','Stramenopiles','Viridiplantae','Rhodophyta', 'Trypanosomatidae', 'Viruses']
    #    name = taxon_id2scientific_name[int(node.name)]
    #    if name in collapse:
    #       return True
    #    else:
    #       return False

    # colapse major euk clades some clades

    for node in tree.traverse("postorder"):
        name = taxon_id2scientific_name[int(node.name)]
        to_detach = []
        if name in collapse:
            to_detach.extend(node.children)
            print('ok-------------------', node.name)
        for n in to_detach:
            n.detach()
    leaves_list = [i.name for i in tree.iter_leaves()]
    # FIX: pre-compute each taxon's lineage once (as a set for O(1)
    # membership) instead of re-querying the taxonomy database for every
    # (leaf, taxon) pair inside the nested loop below.
    taxon2lineage = {taxon: set(ncbi.get_lineage(taxon))
                     for taxon in taxid_list}
    leaf_taxon2n_species = {}
    leaf_taxon2n_species_with_domain = {}
    for leaf_taxon in leaves_list:
        print('leaf', leaf_taxon)
        leaf_taxon2n_species[leaf_taxon] = 0
        leaf_taxon2n_species_with_domain[leaf_taxon] = 0
        for taxon in taxid_list:
            if int(leaf_taxon) in taxon2lineage[taxon]:
                leaf_taxon2n_species[leaf_taxon] += 1
                #if taxon in taxid_with_domain_list:
                #    leaf_taxon2n_species_with_domain[leaf_taxon]+=1
    for leaf_taxon in leaf_taxon2n_species:
        sql = 'insert into eggnog.leaf2n_genomes_%s values(%s, %s)' % (
            rank, leaf_taxon, leaf_taxon2n_species[leaf_taxon])
        cursor.execute(sql, )
    conn.commit()

    sql = 'insert into eggnog.phylogeny values("%s","%s")' % (
        rank, tree.write(format=1))
    cursor.execute(sql, )
    conn.commit()
コード例 #33
0
ファイル: names.py プロジェクト: anhelinaosiieva/Project
import random
from ete3 import Tree, NodeStyle, TreeStyle, NCBITaxa, faces

ncbi = NCBITaxa()

# NCBI topology of seven newt/salamander taxids (see code_name below for
# the taxid -> species mapping).
my_tree = ncbi.get_topology([54263, 8324, 8323, 8327, 8325, 57571, 323754])

ts = TreeStyle()
ts.show_leaf_name = True

# Apply a uniform style (yellow, size 10) to every node in the tree.
for n in my_tree.traverse():
    nstyle = NodeStyle()
    nstyle["fgcolor"] = "yellow"
    nstyle["size"] = 10
    n.set_style(nstyle)

# The root node itself gets a larger green marker.
my_tree.img_style["size"] = 20
my_tree.img_style["fgcolor"] = "green"

# taxid (as string, matching node names) -> species display name.
# NOTE(review): some values carry trailing spaces ("Triturus karelinii ",
# "Salamandra salamandra ") -- confirm whether that is intentional before
# normalizing, as the strings may be rendered verbatim.
code_name = {
    "54263": "Ichthyosaura alpestris",
    "8324": "Lissotriton vulgaris",
    "8323": "Triturus cristatus",
    "8327": "Triturus dobrogicus",
    "8325": "Triturus karelinii ",
    "57571": "Salamandra salamandra ",
    "323754": "Lissotriton montandoni"
}


def mylayout(node):
コード例 #34
0
ファイル: ete_ncbiquery.py プロジェクト: Ward9250/ete
def run(args):
    """Query the NCBI taxonomy for the terms in args.search and, depending on
    flags, dump a pruned taxonomy tree (--tree, the default), per-taxon
    descendant lists (--descendants), or lineage info lines (--info).

    Relies on module-level helpers defined elsewhere in this file:
    ``log``, ``dump`` and ``six``.
    """
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}  # taxid -> None; a dict used as an insertion-ordered set
    all_names = set()  # search terms that were not numeric taxids
    queries = []

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            # Not an integer taxid -- treat it as a name to translate.
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    # NOTE(review): in current ete3, get_name_translator maps name -> list of
    # taxids, which would make `v` an unhashable list here; confirm the ete3
    # version this targets (older versions returned scalars).
    all_taxids.update([(v, None) for v in list(name2tax.values())])

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                # NOTE(review): name2realname and name2score are never
                # defined anywhere visible -- a successful fuzzy match would
                # raise NameError here.
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" %sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            # NOTE(review): dict_keys is not an iterator, so next() raises
            # TypeError -- this should be next(iter(all_taxids)).
            target_taxid = next(all_taxids.keys())
            log.info("Dumping NCBI descendants tree for %s" %(target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit, return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                              intermediate_nodes=args.full_lineage,
                              rank_limit=args.rank_limit,
                              collapse_subspecies=args.collapse_subspecies)

        # Annotate every node with its taxid, scientific name and full
        # named lineage, then rename nodes to "<sci name> - <taxid>".
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" %(id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage = '|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        # One TSV line per query taxid listing all of its descendant taxa.
        log.info("Dumping NCBI taxonomy of %d taxa..." %(len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid, collapse_subspecies=args.collapse_subspecies, rank_limit=args.rank_limit)
            print('\t'.join([str(taxid), translator.get(taxid, taxid), ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))

    elif args.info:
        # One TSV line per taxid with its rank and full lineage.
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''), named_lineage, lineage_string]))