def get_tree(taxonids, num_taxonids = 0):
    """Build the NCBI taxonomy topology for a list of taxon ids.

    Args:
        taxonids: sliceable sequence of NCBI taxon ids.
        num_taxonids: when non-zero, only the first ``num_taxonids`` ids are
            used (a smaller set for tree construction and testing).

    Returns:
        The tree returned by ``NCBITaxa.get_topology`` for the selected ids
        (the original author noted 5,360 total nodes for the full dataset).
    """
    taxonomy = NCBITaxa()
    # Zero means "no limit": keep the caller's sequence untouched.
    selected = taxonids if num_taxonids == 0 else taxonids[:num_taxonids]
    return taxonomy.get_topology(selected)
def get_ncbi_taxonomy_species_tree(names_list):
    """Build the NCBI taxonomy topology for a list of organism names.

    Each name is translated with ``NCBITaxa.get_name_translator`` and the
    first taxid reported for it is used.

    Args:
        names_list: list of organism names.

    Returns:
        The tree returned by ``NCBITaxa.get_topology`` for those taxids.

    Raises:
        KeyError: if a name is absent from the translator result.
    """
    taxonomy = NCBITaxa()
    ids_by_name = taxonomy.get_name_translator(names_list)
    # A name may map to several taxids; only the first one is kept.
    first_ids = [ids_by_name[name][0] for name in names_list]
    return taxonomy.get_topology(first_ids)
def get_tax( cma_file ):
    """Find the 'lowest common ancestor' of species represented in a cma file.

    Every header line (starting with ``>``) is searched for an organism name
    enclosed in square brackets. The name is translated to an NCBI taxid and
    the taxonomy topology of all collected taxids is built; the name of the
    topology's top node is reported as the common ancestor.

    Args:
        cma_file: path of the cma file to scan.

    Returns:
        tuple: ``(lca_name, tax_list, tree_labeled)`` where ``lca_name`` is
        the scientific name of the top node, ``tax_list`` is the
        taxid -> name dict for the collected taxids and ``tree_labeled`` is
        the ASCII rendering of the tree with ``sci_name`` and ``taxid``.
    """
    ncbi = NCBITaxa()
    org_regex = re.compile(r'\[(.*)\]')
    # omit sequences from synthetic constructs (32630) and viruses (10239)
    excluded_taxids = (32630, 10239)

    taxid_list = []
    # 'with' guarantees the handle is closed even if a lookup raises.
    with open(cma_file, 'r') as handle:
        for line in handle:
            if not line.startswith(">"):
                continue
            find_org_name = org_regex.search(line)
            if find_org_name is None:
                continue
            org_name = find_org_name.group(1)
            # Only one name is queried, so every value in the result belongs
            # to it; read the dict directly instead of parsing its str().
            translated = ncbi.get_name_translator([org_name])
            hits = [taxid for ids in translated.values() for taxid in ids]
            if not hits:
                continue  # name unknown to the taxonomy database
            # Ambiguous names report several taxids; keep the first one.
            taxid = hits[0]
            if taxid not in excluded_taxids:
                taxid_list.append(taxid)

    tax_list = ncbi.get_taxid_translator(taxid_list)
    tree = ncbi.get_topology(taxid_list)
    tree_labeled = tree.get_ascii(attributes=['sci_name', 'taxid'])

    # The returned node is the top of the topology; its name is the taxid.
    lca_id = tree.name
    lca_names = ncbi.get_taxid_translator([lca_id])
    # '{}' is kept as the "no translation" value for backward compatibility.
    lca_name = next(iter(lca_names.values()), '{}')
    return (lca_name, tax_list, tree_labeled)
def get_distance_to_common_ancestor(self, other):
    """ Calculate the number of links in the NCBI taxonomic tree between two taxa and their latest common ancestor

    Note: This distance depends on the granularity of the lineage of the taxon. For example, there are only 7 links
    between most bacteria species and the Bacteria superkingdom. However, there are 28 links between the Homo
    sapiens species and the Eukaryota superkingdom.

    Args:
        other (:obj:`Taxon`): a second taxon

    Returns:
        :obj:`int`: number of links between :obj:`self` and its latest common ancestor with :obj:`other` in the
            NCBI taxonomic tree, or :obj:`None` when :obj:`self` has no nearest NCBI taxon
    """
    if self.id_of_nearest_ncbi_taxon is None:
        # No anchor in the NCBI tree, so no distance can be computed.
        return None

    ncbi_taxa = NCBITaxa()
    # intermediate_nodes=True keeps every lineage step so links can be counted.
    tree = ncbi_taxa.get_topology(
        [self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon],
        intermediate_nodes=True)
    self_node = tree.search_nodes(
        name=str(self.id_of_nearest_ncbi_taxon))[0]
    other_node = tree.search_nodes(
        name=str(other.id_of_nearest_ncbi_taxon))[0]
    ancestor = tree.get_common_ancestor(self_node, other_node)
    # Add the offset between this taxon and its nearest NCBI taxon.
    return tree.get_distance(
        self_node, ancestor) + self.distance_from_nearest_ncbi_taxon
def create_ete3_tree(self):
    """Build the NCBI topology of all entries and annotate every node.

    Each node of the tree receives three attributes:

    * ``domains``: domains of the entries attached to the node's taxid plus
      those of all its descendants;
    * ``proteins``: same aggregation for proteins;
    * ``sameDomainNode``: the other nodes whose ``domains`` set is equal.

    The tree is stored in ``self.ete3_tree``.

    Raises:
        Exception: if at least one entry has no taxid.
    """
    from collections import defaultdict
    from itertools import combinations

    ncbi = NCBITaxa()
    entries = list(self)
    taxids = {e.taxo.taxid for e in entries}
    if None in taxids:
        raise Exception("Entries doesn't have taxids")
    tree = ncbi.get_topology(list(taxids))

    # Index entries by taxid once instead of rescanning them for every node.
    domains_by_taxid = defaultdict(set)
    proteins_by_taxid = defaultdict(set)
    for e in entries:
        proteins_by_taxid[e.taxo.taxid].add(e.prot)
        domains_by_taxid[e.taxo.taxid].update(h.domain for h in e.hmmr)

    # Complete Tree object with list of domains and proteins for each node
    node_list = []
    # Postorder: children are visited before their parent, so their sets are
    # complete by the time they are merged into the parent.
    for n in tree.traverse('postorder'):
        n.sameDomainNode = set()
        node_list.append(n)
        # Copy, so merging children never mutates the per-taxid index.
        n.domains = set(domains_by_taxid.get(n.name, ()))
        n.proteins = set(proteins_by_taxid.get(n.name, ()))
        for child in n.children:
            n.domains.update(child.domains)
            n.proteins.update(child.proteins)

    # Complete Tree object with list of nodes with same domains for each node
    for n1, n2 in combinations(node_list, 2):
        if n1.domains == n2.domains:
            n1.sameDomainNode.add(n2)
            n2.sameDomainNode.add(n1)

    self.ete3_tree = tree
def build_complete_tree(tree, log):
    """Build the taxonomic tree including internal nodes (for rank dependent evaluation)

    The ``taxid`` attribute of every node of the reference tree is collected
    and a topology with intermediate nodes is requested. Whenever the request
    fails with a ``KeyError``, the offending taxid is dropped and the request
    is retried.

    Args:
        tree (fileobject): file name of the reference tree, without internal nodes
        log (fileobject): log file; when falsy nothing is logged

    Returns:
        The topology built from the taxids that could be resolved.
    """
    ncbi = NCBITaxa()
    reference = Tree(tree, format=1)
    taxa = [node.taxid for node in reference.traverse('postorder')]

    while True:
        try:
            return ncbi.get_topology(taxa, intermediate_nodes=True)
        except KeyError as missing:
            # if a taxid is not found, try to build the tree without it
            unknown_taxid = int(missing.args[0])
            taxa.remove(unknown_taxid)
            if log:
                print('[prophyle_otu_table] ERROR: TaxID ' + str(unknown_taxid) +
                      ' not found in ETE DB (try updating it)', file=log)
def get_NCBI_tree(self,seqidmap):
    """Build the NCBI topology (with intermediate nodes) from a seqid map.

    Args:
        seqidmap: path of a tab-separated file whose second column is read
            as the taxid of each line.

    Returns:
        The tree returned by ``NCBITaxa.get_topology``.
    """
    taxonomy = NCBITaxa()
    with open(seqidmap,"r") as handle:
        # Second column, without the trailing newline when it is the last one.
        taxids = [row.split("\t")[1].strip("\n") for row in handle]
    return taxonomy.get_topology(taxids,intermediate_nodes=True)
def extract_taxa(mpwt_taxon_file, taxon_output_file, tree_output_file):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    For every kept row of the input, one row is written to the output table:
    the first two input columns, a phylum label (first four letters of the
    phylum name followed by a running counter) and the names found for the
    phylum, class, order, family, genus and species ranks
    (``no_information`` when a rank is absent from the lineage).

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to phylum output file
        tree_output_file (str): path to tree output file
    """
    ncbi = NCBITaxa()
    rank_columns = ["phylum", "class", "order", "family", "genus", "species"]
    kept_taxids = []
    seen_per_phylum = {}

    with open(taxon_output_file, "w") as phylum_file:
        writer = csv.writer(phylum_file, delimiter="\t")
        writer.writerow(["species", "taxid", "phylum_number"] + rank_columns)
        with open(mpwt_taxon_file, "r") as taxon_file:
            for record in csv.reader(taxon_file, delimiter="\t"):
                # Rows whose second column contains "taxon" are skipped
                # (presumably the header row).
                if "taxon" in record[1]:
                    continue
                kept_taxids.append(record[1])

                lineage = ncbi.get_lineage(record[1])
                rank_of = ncbi.get_rank(lineage)
                name_of = ncbi.get_taxid_translator(lineage)
                name_by_rank = {
                    rank: name_of[taxid] for taxid, rank in rank_of.items()
                }
                ranks = [
                    name_by_rank.get(column, "no_information")
                    for column in rank_columns
                ]

                if ranks[0] == "no_information":
                    phylum = "no_information"
                else:
                    phylum = ranks[0][:4]

                # First sighting gets counter 1; later "no_information" rows
                # get an empty suffix, other phyla keep counting.
                if phylum not in seen_per_phylum:
                    seen_per_phylum[phylum] = 1
                elif phylum == "no_information":
                    seen_per_phylum[phylum] = ""
                else:
                    seen_per_phylum[phylum] += 1

                label = phylum + str(seen_per_phylum[phylum])
                writer.writerow([record[0], record[1], label] + ranks)

    tree = ncbi.get_topology(kept_taxids)
    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
def plot_taxids(taxids_list, tree_png, tree_nw, tax_db=None):
    """Render the NCBI topology of some taxids and save it as Newick.

    Args:
        taxids_list: taxids passed to ``NCBITaxa.get_topology``.
        tree_png: output path handed to ``tree.render``.
        tree_nw: output path of the Newick file (written with ``format=1``).
        tax_db: optional taxonomy database file; when ``None`` the
            ``NCBITaxa`` default is used.
    """
    ncbi = NCBITaxa() if tax_db is None else NCBITaxa(dbfile=tax_db)
    tree = ncbi.get_topology(taxids_list)

    style = TreeStyle()
    ncbi.annotate_tree(tree, taxid_attr="sci_name")
    style.show_leaf_name = False
    style.mode = "c"
    style.layout_fn = layout

    tree.render(tree_png, tree_style=style)
    tree.write(format=1, outfile=tree_nw)
def get_off_target_last_common_taxon_rank(df, target_rank, target_taxon):
    """Return the rank of the taxon at the top of the off-target/target topology.

    Args:
        df: object indexed with ``.loc``; ``df.loc[target_rank]`` is the
            off-target taxon.
        target_rank: label looked up in ``df``.
        target_taxon: taxon of the target.

    Returns:
        ``'no rank'`` when either taxon is 0, when the off-target taxon is
        null, or when the top node of the two-taxon topology has no rank;
        otherwise the rank of the last element of that node's lineage.
    """
    ncbi = NCBITaxa()

    # '&' (not 'and') is kept so both comparisons are always evaluated.
    both_set = (target_taxon != 0) & (df.loc[target_rank] != 0)
    if not both_set or pd.isnull(df.loc[target_rank]):
        return 'no rank'

    common = ncbi.get_topology([df.loc[target_rank], target_taxon])
    if common.rank == 'no rank':
        return 'no rank'

    lineage = ncbi.get_lineage(common.taxid)
    deepest = lineage[-1]
    return ncbi.get_rank([deepest])[deepest]
def run_ete_ncbiquery_py(query):
    """Render the NCBI topology of a comma-separated query as ASCII art.

    Args:
        query (str): comma-separated items; each item is either an integer
            taxid or an organism name. Surrounding whitespace is ignored.

    Returns:
        str: ASCII rendering of the topology with ``sci_name`` and ``rank``.

    Raises:
        KeyError: if a name is absent from the name translator result.
    """
    ncbi = NCBITaxa()
    taxids = []
    for item in query.split(','):
        # Strip both sides so "Homo sapiens , 9606" resolves like
        # "Homo sapiens,9606".
        item = item.strip()
        try:
            taxids.append(int(item))
        except ValueError:
            # Not a number: treat it as a name; a name may map to several taxids.
            taxids += ncbi.get_name_translator([item])[item]
    tree = ncbi.get_topology(taxids)
    return tree.get_ascii(attributes=["sci_name", "rank"])
def my_tree():
    """Build a small demo topology with random node weights and its style.

    Returns:
        tuple: ``(tree, tree_style)`` where every node of ``tree`` carries a
        ``weight`` feature drawn with ``random.randint(0, 50)`` and
        ``tree_style`` is a circular ``TreeStyle`` using ``layout``.
    """
    demo_taxids = [54263, 8324, 8323, 8327, 8325, 57571, 323754]
    demo_tree = NCBITaxa().get_topology(demo_taxids)
    for node in demo_tree.traverse():
        node.add_features(weight=random.randint(0, 50))

    style = TreeStyle()
    style.layout_fn = layout
    style.mode = "c"
    style.show_branch_length = True
    style.show_branch_support = True

    # The ASCII rendering is computed but its result is not used.
    demo_tree.get_ascii(attributes=["sci_name", "rank"])
    return demo_tree, style
def get_common_ancestor(self, other):
    """ Get the latest common ancestor of two taxa

    Args:
        other (:obj:`Taxon`): a second taxon

    Returns:
        :obj:`Taxon`: latest common ancestor, or :obj:`None` when :obj:`self` has no nearest NCBI taxon
    """
    if self.id_of_nearest_ncbi_taxon is None:
        # No anchor in the NCBI tree, so no ancestor can be looked up.
        return None

    ncbi_taxa = NCBITaxa()
    tree = ncbi_taxa.get_topology(
        [self.id_of_nearest_ncbi_taxon, other.id_of_nearest_ncbi_taxon],
        intermediate_nodes=True)
    self_node = tree.search_nodes(name=str(self.id_of_nearest_ncbi_taxon))[0]
    other_node = tree.search_nodes(name=str(other.id_of_nearest_ncbi_taxon))[0]
    ancestor = tree.get_common_ancestor(self_node, other_node)

    # Build the result with the caller's own class so subclasses are preserved.
    cls = self.__class__
    return cls(ncbi_id=float(ancestor.name))
def compute_upper_node_and_distance(self, core_domains=None):
    """Set ``upper_node`` and ``mean_distance`` on every non-core domain entry.

    For a domain seen in a single taxon, ``upper_node`` is that taxon's node
    in ``self.ete3_tree`` and ``mean_distance`` is 0. Otherwise ``upper_node``
    is the first node of the NCBI topology built from the domain's taxids and
    ``mean_distance`` is the mean of the pairwise distances between those
    taxids in ``self.ete3_tree``.

    Args:
        core_domains: names of the domains to skip; ``None`` (the default)
            means no domain is skipped.

    Raises:
        Exception: if ``self.domain_entries`` or ``self.ete3_tree`` has not
            been computed yet.
    """
    from itertools import combinations

    # None replaces a shared mutable default list.
    if core_domains is None:
        core_domains = ()

    # Validate before opening the taxonomy database.
    if not self.domain_entries:
        raise Exception("Compute domain_entries first.")
    if not self.ete3_tree:
        raise Exception("Compute ete3_tree first.")

    ncbi = NCBITaxa()
    for d in self.domain_entries.values():
        if d.name in core_domains:
            continue

        if len(d.taxo) == 1:
            taxo = list(d.taxo)[0]
            d.upper_node = self.ete3_tree.search_nodes(name=taxo.taxid)[0]
            d.mean_distance = 0
            continue

        list_taxids = list(set([t.taxid for t in d.taxo]))
        domain_tree = ncbi.get_topology(list_taxids)
        # The first node yielded by traverse() is taken as the upper node.
        d.upper_node = next(domain_tree.traverse())
        distances = [
            self.ete3_tree.get_distance(first, second)
            for first, second in combinations(list_taxids, 2)
        ]
        d.mean_distance = mean(distances)
def get_taxo_of():
    """Return the NCBI topology of the taxids posted as JSON.

    The request body must be a JSON object with a non-empty ``taxids`` list
    whose items are convertible to integers.

    Returns:
        A ``jsonify`` response with ``success=True`` and the ``tree`` (a dict
        keyed by the top node's name), or a ``(response, 400)`` tuple with
        ``success=False`` and a ``reason`` when the request is invalid.
    """
    if not request.is_json:
        return jsonify(success=False, reason="Bad content type"), 400

    data = request.json
    # A JSON body that is not an object (null, list, string...) cannot hold
    # the 'taxids' key.
    if not isinstance(data, dict) or 'taxids' not in data:
        return jsonify(success=False, reason="Taxonomic IDs are missing"), 400

    sended_list = data['taxids']
    if not isinstance(sended_list, list):
        return jsonify(
            success=False,
            reason="Taxonomic IDs must be sended as a string array"), 400

    if not sended_list:
        return jsonify(success=False, reason="Query list is empty"), 400

    # Convert every ID of the list to an integer; items such as null or
    # nested lists raise TypeError rather than ValueError.
    try:
        taxids = [int(x) for x in sended_list]
    except (TypeError, ValueError):
        return jsonify(
            success=False,
            reason="One of the sended IDs is not a valid integer"), 400

    # Must be instantiated on every request: it relies on a SQLite connection
    # that requires thread ID == calling thread.
    ncbi = NCBITaxa()
    try:
        tree_topology = ncbi.get_topology(taxids)
    except ValueError:
        return jsonify(
            success=False,
            reason="One of the sended IDs is not a valid integer"), 400

    # Constructing root
    tree = {tree_topology.name: node_to_dict(ncbi, tree_topology)}
    return jsonify(success=True, tree=tree)
def main(argv):
    """Summarize and filter alignments by taxid and write the report tables.

    Command-line options are parsed with ``argparse`` from ``sys.argv``
    (``argv`` is accepted but not read). The function reads the
    ``--taxid_genelens``, ``--taxid_link``, ``--readcounts`` and
    ``--inherited_markers`` tables and writes three outputs:

    * ``--alltab``: one row per observed taxid;
    * ``--primarytab``: the primary hits with at least 2 observed markers and
      at least 4 reads;
    * ``--eukfrac``: RPKS and relative abundance per taxonomic level.

    When the read-count file has no data row, or when no taxon passes the
    filter, a one-line message is written to the outputs and the process
    exits through ``sys.exit()``.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source; the nesting of a few branches (flagged below) should be confirmed
    against the original file.
    """
    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
            Summarize and filter alignments by taxid.
            Required arguments are --dbfile, --inherited_markers, --taxid_link, --readcounts, --primarytab, --eukfrac, --alltab, --taxid_genelens
            """),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--dbfile", type=str, action="store", dest="dbfile",
                        help="Eukdetect database folder", required=True)
    parser.add_argument("--inherited_markers", type=str, action="store",
                        dest="inherited_markers",
                        help="Eukdetect database folder", required=True)
    parser.add_argument("--taxid_link", type=str, action="store",
                        dest="taxid_link",
                        help="Eukdetect database folder", required=True)
    parser.add_argument("--readcounts", type=str, action="store",
                        dest="readcounts",
                        help="Read counts and mismatches file.", required=True)
    parser.add_argument("--eukfrac", type=str, action="store", dest="eukfrac",
                        help="Eukaryotic abundance & fraction output file",
                        required=True)
    parser.add_argument("--primarytab", type=str, action="store",
                        dest="primarytab",
                        help="Table output of filtered hits.", required=True)
    parser.add_argument("--alltab", type=str, action="store", dest="alltab",
                        help="Table output of all hits.", required=True)
    parser.add_argument("--taxid_genelens", type=str, action="store",
                        dest="taxid_genelens",
                        help="Cumulative gene length per taxid", required=True)
    files = parser.parse_args()

    #initialize NCBI taxdb
    ncbi = NCBITaxa(files.dbfile)

    # Compiled once; raw string avoids invalid-escape warnings for \d.
    busco_regex = re.compile(r'-\d*at\d*-')

    #taxid genelength correspondence
    #taxid_genelen = {taxid: length}
    with open(files.taxid_genelens) as genelen_file:
        taxid_genelen = {
            line.split('\t')[0]: int(line.split('\t')[1].strip('\n'))
            for line in genelen_file
        }

    #create 2 dicts for ease of lookup
    #correspondence between taxid & marker gene name
    #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
    #seq_taxids = {seq: taxid, seq:taxid} Save every seq
    taxid_seqs = {}
    seq_taxids = {}
    with open(files.taxid_link) as link_file:
        for line in link_file:
            line = line.strip('\n')
            taxid = line.split('\t')[1]
            if taxid not in taxid_seqs:
                taxid_seqs[taxid] = []
            seq = line.split('\t')[0]
            taxid_seqs[taxid].append(seq)
            seq_taxids[seq] = taxid

    #save contents of read_counts_and_mismatches file as dict per observed taxid
    #save observed genuses
    #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage, pid, busco]]}
    taxid_counts = {}
    counter = 0
    genuses = {}
    above_species = []
    #genuses: {genus:[taxid, taxid, taxid]}
    with open(files.readcounts) as countfile:
        countfile.readline()  # skip the first line
        for line in countfile:
            counter += 1
            line = line.strip('\n')
            seq = line.split('\t')[0]
            count = int(line.split('\t')[1])
            correct_bases = int(line.split('\t')[2])
            incorrect_bases = int(line.split('\t')[3])
            total_bases = int(line.split('\t')[4])
            subjlen = int(line.split('\t')[5])
            coverage = float(line.split('\t')[6])
            pid = float(line.split('\t')[7])
            taxid = seq_taxids[seq]
            if "Collapse" not in seq:
                busco = busco_regex.findall(seq)[0].strip('-')
            else:
                busco = "Collapsed"

            #determine genus
            lineage = ncbi.get_lineage(int(taxid))
            ranks = {
                value: key
                for (key, value) in ncbi.get_rank(lineage).items()
            }
            #lowest = list(ncbi.get_rank([lineage[-1]]).values())[0]
            if 'genus' in ranks and 'Collapse' not in seq and "species" in ranks:  #lowest != "genus": #dont filter if it's at the genus level
                genus = ranks['genus']
                if genus not in genuses:
                    genuses[genus] = []
                if taxid not in genuses[genus]:
                    genuses[genus].append(taxid)
            elif "SSCollapse" not in seq:  #don't add anything that's got SSCollapse in it
                above_species.append(taxid)

            #save info per sequence in seq_counts dict
            #seq_counts[seq] = [count, correct_bases, total_bases, subjlen, coverage, pid, busco]
            if taxid not in taxid_counts:
                taxid_counts[taxid] = []
            #find the genus if not a spcollapsed gene
            taxid_counts[taxid].append([
                seq, count, correct_bases, total_bases, subjlen, coverage, pid,
                busco
            ])

    if counter == 0:
        message = "Empty read count file. Likely no aligned reads in sample."
        #print(message)
        #still have to write stuff
        for path in (files.eukfrac, files.alltab, files.primarytab):
            with open(path, 'w') as f:
                f.write(message + '\n')
        sys.exit()
    #done parsing read_counts_and_mismatches file

    #calculate stats for each observed taxid
    taxon_coverage = {}
    #taxon_coverage[taxon] = [
    #    observed_markers,
    #    readcounts,
    #    total_bases,
    #    percentage_markers,
    #    marker_coverage,
    #    percent_id,
    #    total_observed_marker_len,
    #    buscos,
    #    total_gene_length,
    #    total_markers]
    seen_taxids = []
    for tax in taxid_counts:
        mc = len(taxid_counts[tax])
        counts = 0
        bases = 0
        correct = 0
        total_bases = 0
        subj_len = 0
        buscos = []
        for i in range(0, len(taxid_counts[tax])):
            busco = taxid_counts[tax][i][-1]
            if len(busco) > 1:
                buscos.append(busco)
            counts += taxid_counts[tax][i][1]
            bases += taxid_counts[tax][i][3]
            correct += taxid_counts[tax][i][2]
            total_bases += taxid_counts[tax][i][3]
            subj_len += taxid_counts[tax][i][4]
        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len) * 100, 2)
        total_markers = len(taxid_seqs[tax])
        marker_percentage = round(mc / total_markers * 100, 2)
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        taxid_len = taxid_genelen[tax]
        rpkg = counts / (taxid_len / 1000)
        if tax not in seen_taxids:
            seen_taxids.append(tax)
        taxon_coverage[tax] = [
            mc, counts, total_bases, marker_percentage, overall_coverage,
            percent_identity, subj_len, buscos, taxid_len, total_markers
        ]

    #create tree structure for all observed taxids
    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    tree_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(tree_taxids, intermediate_nodes=True)
    full_taxid_lineage = [node.name for node in full_tree.traverse()]

    #full_seq_taxids: {taxid: [[specific buscos], specific count, specific + inherited count]}
    full_seq_taxids = {}
    with open(files.inherited_markers) as inherited_file:
        for line in inherited_file:
            line = line.strip('\n')
            taxid = line.split('\t')[0]
            if taxid in full_taxid_lineage:
                buscos = []
                for seq in line.split('\t')[1].split(','):
                    if len(busco_regex.findall(seq)) > 0:
                        busco = busco_regex.findall(seq)[0].strip('-')
                        if busco not in buscos:
                            buscos.append(busco)
                specific_count = len(line.split('\t')[1].split(','))
                sp_and_inherited_count = len(line.split('\t')[2].split(','))
                full_seq_taxids[taxid] = [
                    buscos, specific_count, sp_and_inherited_count
                ]

    #write full table
    marker_sorted = sorted(taxon_coverage.keys(),
                           reverse=True,
                           key=lambda x: taxon_coverage[x][3])
    with open(files.alltab, 'w') as dest:
        dest.write(
            "Name\tTaxid\tRank\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\tAmount of marker length in EukDetect db\n"
        )
        for tax in marker_sorted:
            rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
            name = [
                ncbi.get_taxid_translator([tax])[e]
                for e in ncbi.get_taxid_translator([tax])
            ][0]
            if rank == "no rank":  #parent rank
                parent = ncbi.get_lineage(tax)[-2]
                rank = [
                    ncbi.get_rank([parent])[e]
                    for e in ncbi.get_rank([parent])
                ][0]
            mc = taxon_coverage[tax][0]
            counts = taxon_coverage[tax][1]
            marker_percentage = taxon_coverage[tax][3]
            overall_coverage = taxon_coverage[tax][4]
            percent_identity = taxon_coverage[tax][5]
            total_marker_len = taxon_coverage[tax][6]
            blen = taxid_genelen[tax]
            dest.write(name + '\t' + str(tax) + '\t' + rank + '\t' + str(mc) +
                       '\t' + str(counts) + '\t' + str(marker_percentage) +
                       '%\t' + str(overall_coverage) + '%\t' +
                       str(percent_identity) + '%\t' + str(blen) + '\n')

    #determine primary and secondary hits
    #if MRCA is at the level of genus, consider whether one should be primary or secondary by looking at buscos
    primary = {}
    secondary = {}
    genus_secondary_hits = {}
    #structure: {genus: [secondary_hit_taxid]}
    for g in genuses:
        if len(genuses[g]) > 1:  #multiple species in same genus
            taxids = genuses[g]
            reads = [taxon_coverage[taxid][1] for taxid in taxids]
            bases = [taxon_coverage[taxid][2] for taxid in taxids]
            #if one has more reads and more bases than all others, it is primary, others are secondary
            maxreads = max(reads)
            maxbases = max(bases)
            ptaxids = []
            if (reads.count(maxreads) == 1 and bases.count(maxbases) == 1)\
                and (reads.index(maxreads) == bases.index(maxbases)):  #no ties, same ID
                maxtax = taxids[reads.index(maxreads)]
                primary[maxtax] = taxon_coverage[maxtax][0:5]
                ptaxids.append(maxtax)
                #ptaxids.append(taxids[reads.index(maxreads)])
                #primary[ptaxid] = taxon_coverage[ptaxid][0:5]
                #p_buscos = full_seq_taxids[ptaxid][0]
            else:
                for t in taxids:
                    if taxon_coverage[t][1] == maxreads or taxon_coverage[t][
                            2] == maxbases:
                        ptaxids.append(t)
                        primary[t] = taxon_coverage[t][0:5]
            unsorted_ataxids = [t for t in taxids if t not in ptaxids]
            ataxids = sorted(unsorted_ataxids,
                             key=lambda x: taxon_coverage[x][1],
                             reverse=True)
            for ataxid in ataxids:
                is_secondary = False
                for ptaxid in primary:
                    p_buscos = [b for b in full_seq_taxids[ptaxid][0]]
                    a_buscos = taxon_coverage[ataxid][7]
                    a_remain = [b for b in a_buscos if b in p_buscos]
                    if len(a_remain) > 0:
                        a_above = []
                        for b in a_remain:
                            #it may not be a hit for the other one! check first
                            #check that the pid for this hit is lower
                            apid = [
                                seq[6] for seq in taxid_counts[ataxid]
                                if seq[7] == b
                            ]
                            ppid = [
                                seq[6] for seq in taxid_counts[ptaxid]
                                if seq[7] == b
                            ]
                            if len(ppid) > 0 and apid[0] >= ppid[0]:
                                a_above.append(b)
                            elif len(ppid) == 0:
                                a_above.append(b)
                        #if a_buscos is fewer than 5, all must be correct
                        #print(a_above)
                        if len(a_buscos) < 5:
                            if len(a_above) < len(a_buscos):
                                is_secondary = True
                        else:
                            if len(a_above) <= len(
                                    a_buscos
                            ) / 2:  #change: alt hit has to be half or busco hits being above
                                is_secondary = True
                    else:
                        # NOTE(review): reconstructed as the else of
                        # `if len(a_remain) > 0` — confirm against the original.
                        is_secondary = True
                if is_secondary:
                    secondary[ataxid] = taxon_coverage[ataxid][0:5] + [ptaxid]
                    genus = str(g)
                    if genus not in genus_secondary_hits:
                        genus_secondary_hits[genus] = []
                    genus_secondary_hits[genus].append(ataxid)
                    #secondary_hit_reads[g].append([ataxid, taxon_coverage[ataxid][1], taxid_genelen[ataxid]])
                else:
                    primary[ataxid] = taxon_coverage[ataxid][0:5]
        else:  #primary
            taxid = genuses[g][0]
            primary[taxid] = taxon_coverage[taxid][0:5]

    #add anything else
    for t in above_species:
        primary[t] = taxon_coverage[t][0:5]

    primary_sorted = sorted(primary.keys(),
                            reverse=True,
                            key=lambda x: primary[x][3])
    #secondary_sorted = sorted(secondary.keys(), reverse=True, key=lambda x: secondary[x][3])
    filter_passing_taxids = []
    for tax in primary_sorted:
        rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
        if rank == "no rank":
            prev = ncbi.get_lineage(tax)[-1]
            prevrank = [
                ncbi.get_rank([prev])[e] for e in ncbi.get_rank([prev])
            ][0]
            if prevrank == "species":
                rank = "species"
        name = [
            ncbi.get_taxid_translator([tax])[e]
            for e in ncbi.get_taxid_translator([tax])
        ][0]
        mc = taxon_coverage[tax][0]
        counts = taxon_coverage[tax][1]
        marker_percentage = taxon_coverage[tax][3]
        overall_coverage = taxon_coverage[tax][4]
        percent_identity = taxon_coverage[tax][5]
        #filter
        if int(mc) >= 2 and int(counts) >= 4:
            filter_passing_taxids.append(tax)

    #close if no filter passing taxids
    if len(filter_passing_taxids) == 0:
        message = "No taxa passing filter requirements."
        #print(message)
        #still have to write stuff
        for path in (files.primarytab, files.eukfrac):
            with open(path, 'w') as f:
                f.write(message + '\n')
        sys.exit()

    #create NCBI taxon tree of observed taxa + extend to cellular_org
    tree = ncbi.get_topology(filter_passing_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    primary_tree_taxids = [int(e) for e in filter_passing_taxids] + lineage
    primary_tree = ncbi.get_topology(primary_tree_taxids,
                                     intermediate_nodes=True)

    orphan_children = []
    #phylum class order family genus species
    taxid_lendenoms = {
    }  #for all species, get full marker possibilities, for higher rank, get just what's specific
    #find counts of seqs for internal nodes
    relab_levels = {
        'species': [],
        'genus': [],
        'family': [],
        'order': [],
        'class': [],
        'phylum': []
    }
    ordered_labels = ["phylum", "class", "order", "family", "genus", "species"]
    lineages = {}

    #pre-add secondary hits to each genus
    for g in genus_secondary_hits:
        for s in genus_secondary_hits[g]:
            for seq in taxid_counts[s]:
                #if g not in taxid_lendenoms:
                #    taxid_lendenoms[g] = 0
                #taxid_lendenoms[g] += seq[4]
                if g not in taxid_counts:
                    taxid_counts[g] = []
                taxid_counts[g].append(seq)

    #calculate seqs and seqlens for each taxonomic node. this goes top-down
    for node in primary_tree.traverse():
        #get lineage name
        lin_name = ""
        currname = [
            ncbi.get_taxid_translator([node.name])[e]
            for e in ncbi.get_taxid_translator([node.name])
        ][0]
        lineage = ncbi.get_lineage(node.name)
        names = ncbi.get_taxid_translator(ncbi.get_lineage(node.name))
        ranks = ncbi.get_rank(ncbi.get_lineage(node.name))
        ranks_rev = {ranks[e]: e for e in ranks}
        #print(lineage)
        #print(ranks)
        #print(ranks_rev)
        prev_rank = ranks[lineage[-2]]
        for i in ordered_labels:
            if i in ranks_rev:
                lin_name += i + "-" + names[ranks_rev[i]] + "|"
        lin_name = lin_name.strip('|').replace(' ', "_")
        lineages[node.name] = lin_name

        #init taxid_lendenoms and taxid_counts if does not exist
        if node.name not in taxid_lendenoms:
            taxid_lendenoms[node.name] = 0
        if node.name not in taxid_counts:
            taxid_counts[node.name] = []
        rank = [
            ncbi.get_rank([node.name])[e] for e in ncbi.get_rank([node.name])
        ][0]
        #if node.is_leaf() == False and rank != "species":
        #if rank != "species":
        if rank in relab_levels:
            relab_levels[rank].append(node.name)
        if rank != "species" and prev_rank != "species":  #if not a species or a strain, add individuals
            #add indiv seqs
            for seq in taxid_counts[node.name]:
                taxid_lendenoms[node.name] += seq[4]
        if (rank == "species"
                or prev_rank == "species") and node.name in taxid_genelen:
            taxid_lendenoms[node.name] += taxid_genelen[node.name]
        # NOTE(review): the descendant loop is reconstructed at node-loop
        # level (not nested in the `if` above) — confirm against the original.
        for desc in node.iter_descendants():
            if desc.name in taxid_counts:
                descrank = [
                    ncbi.get_rank([desc.name])[e]
                    for e in ncbi.get_rank([desc.name])
                ][0]
                # NOTE(review): these lookups use node.name, not desc.name —
                # looks unintended for a "d"-prefixed value; confirm.
                dlineage = ncbi.get_lineage(node.name)
                dnames = ncbi.get_taxid_translator(ncbi.get_lineage(node.name))
                dranks = ncbi.get_rank(ncbi.get_lineage(node.name))
                d_prev_rank = dranks[dlineage[-2]]
                if descrank == "species" or d_prev_rank == "species":
                    taxid_lendenoms[node.name] += taxid_genelen[
                        desc.name]  #if sp add full markers
                else:
                    for seq in taxid_counts[desc.name]:
                        taxid_lendenoms[node.name] += seq[
                            4]  #if not sp add submarkers
                for seq in taxid_counts[desc.name]:
                    if seq not in taxid_counts[node.name]:
                        taxid_counts[node.name].append(seq)
        #elif node.is_leaf() == False and rank == "species":
        #stuff
        #    x = 0
        # else: #has to be a strain?
        #add full seq since is species
        #if node.name in taxid_genelen: #avoids case where there is a strain without dedicated taxid
        #    taxid_lendenoms[node.name] += taxid_genelen[node.name]
        #    relab_levels['species'].append(node.name)
        #    if node.name not in taxid_counts:
        #        orphan_children.append(node.name)

    #determine if all hits have all levels
    levels_to_remove = []
    for tax in filter_passing_taxids:
        lin = lineages[tax]
        groups = [l.split('-')[0] for l in lin.split('|')]
        levels = ordered_labels[0:len(groups)]
        if levels != groups:
            #for g in groups:
            for l in levels:
                if l not in groups:
                    if l not in levels_to_remove:
                        levels_to_remove.append(l)
    for l in levels_to_remove:
        relab_levels.pop(l)

    #calculate relabs for each level
    relabs = {}
    #relabs[taxid] = [reads, amt_marker_sequence, rpks, eukfrac]
    for group in relab_levels:
        sum_rpks = 0
        for tax in relab_levels[group]:
            reads = 0
            for seq in taxid_counts[tax]:
                reads += seq[1]
            amt_marker_sequence = taxid_lendenoms[tax]
            rpks = reads / (amt_marker_sequence / 1000)
            sum_rpks += rpks
            relabs[tax] = [reads, amt_marker_sequence, rpks]
        for tax in relab_levels[group]:
            eukfrac = (relabs[tax][2] / sum_rpks) * 100
            relabs[tax].append(eukfrac)

    with open(files.primarytab, 'w') as dest:
        dest.write(
            "Name\tRank\tLineage\tTaxid\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"
        )
        for tax in filter_passing_taxids:
            if tax in lineages:
                lin = lineages[tax]
            else:
                lin = [ncbi.get_rank([tax])[e]
                       for e in ncbi.get_rank([tax])][0]
            rank = [ncbi.get_rank([tax])[e] for e in ncbi.get_rank([tax])][0]
            if rank == "no rank":  #parent rank
                parent = ncbi.get_lineage(tax)[-2]
                prevrank = [
                    ncbi.get_rank([parent])[e]
                    for e in ncbi.get_rank([parent])
                ][0]
                if prevrank == "species":
                    rank = "species"
            #lin = lineages[tax]
            name = [
                ncbi.get_taxid_translator([tax])[e]
                for e in ncbi.get_taxid_translator([tax])
            ][0]
            mc = taxon_coverage[tax][0]
            counts = taxon_coverage[tax][1]
            marker_percentage = taxon_coverage[tax][3]
            overall_coverage = taxon_coverage[tax][4]
            percent_identity = taxon_coverage[tax][5]
            #filter
            dest.write(name + '\t' + rank + '\t' + lin + '\t' + str(tax) +
                       '\t' + str(mc) + '\t' + str(counts) + '\t' +
                       str(marker_percentage) + '%\t' +
                       str(overall_coverage) + '%\t' + str(percent_identity) +
                       '%\n')

    #table with relative abundance for all levels
    with open(files.eukfrac, 'w') as dest:
        dest.write(
            "Lineage\tRank\tName\tTaxID\tRPKS\tEuk_fraction\tReads\tAmt_marker_sequence\n"
        )
        for node in primary_tree.traverse("preorder"):
            rank = [
                ncbi.get_rank([node.name])[e]
                for e in ncbi.get_rank([node.name])
            ][0]
            name = [
                ncbi.get_taxid_translator([node.name])[e]
                for e in ncbi.get_taxid_translator([node.name])
            ][0]
            if node.name in lineages:
                lin = lineages[node.name]
            else:
                lin = [
                    ncbi.get_rank([node.name])[e]
                    for e in ncbi.get_rank([node.name])
                ][0]
            if rank == "no rank" and node.is_leaf():
                continue  #is strain, have already printed species at this point
                #rank = "species"
            if node.name in relabs:
                rpks = round(relabs[node.name][2], 4)
                eukfrac = round(relabs[node.name][3], 4)
                reads = relabs[node.name][0]
                markerseq = relabs[node.name][1]
                dest.write(lin + '\t' + rank + '\t' + name + '\t' +
                           node.name + '\t' + str(rpks) + '\t' +
                           str(eukfrac) + '\t' + str(reads) + '\t' +
                           str(markerseq) + '\n')
# Switch for the NCBI taxonomy annotation below; it is skipped while False.
NCBI = False
if NCBI :
    from ete3 import NCBITaxa
    ncbi = NCBITaxa()
    #ncbi.update_taxonomy_database()
    taxIDlist=[]
    for gene in geneList:
        # Resolve the organism name and keep the first taxid reported for it.
        name2taxID = ncbi.get_name_translator([gene.organism])
        gene.taxID = name2taxID[gene.organism][0]
        # Record every taxid of the lineage on the gene.
        for i in ncbi.get_lineage(gene.taxID):
            gene.addlineageid(i)
        taxIDlist.append(gene.taxID)
    #taxid2name = ncbi.get_taxid_translator([9606, 9443])
    #print taxid2name
    # Second switch: the topology is only built and printed when set to True.
    tree = False
    if tree :
        tree = ncbi.get_topology(taxIDlist)
        # print() call form works on both Python 2 and Python 3.
        print(tree.get_ascii(attributes=["sci_name", "rank"]))
def model2Tree(modelIDList):
    """Build, print and return the NCBI topology of a list of taxids.

    Args:
        modelIDList: taxids passed to ``NCBITaxa.get_topology``.

    Returns:
        The tree returned by ``NCBITaxa.get_topology``; its ASCII rendering
        (``sci_name`` and ``rank``) is printed to standard output.
    """
    ncbi = NCBITaxa()
    tree = ncbi.get_topology(modelIDList)
    # print() call form works on both Python 2 and Python 3.
    print(tree.get_ascii(attributes=["sci_name", "rank"]))
    return tree
# NOTE(review): the next two statements look like the tail of a loop whose
# header is outside this view (ID and famID are not defined here) — confirm
# their indentation against the full file.
ncbiID = OMA2ncbiID[ID]
confirmFamID_ncbiIDs[famID].append(ncbiID)

# Group families by the scientific name of the top node of their topology.
fam_commonAncestorNode = defaultdict(list)
for fam, ncbis in confirmFamID_ncbiIDs.items():
    ncbi_list = []
    ncbi_ancestor_list = []
    taxa_list = []
    node_list = []
    for ID in ncbis:
        ncbiTaxa = str(ncbiID2taxa[ID])
        taxa_list.append(ncbiTaxa)
        ncbiID = int(ID)
        ncbi_list.append(ncbiID)
        ncbi_ancestor_list.append(ID)
    tree = ncbi.get_topology(ncbi_list)
    # taxa_tree and ancestor are computed but not read again in this fragment.
    taxa_tree = tree.get_ascii(attributes=["sci_name"])
    ancestor = tree.get_common_ancestor(ncbi_ancestor_list)
    # Level-order traversal: the first name collected is the top node's.
    for node in tree.traverse("levelorder"):
        node_list.append(node.sci_name)
    MRCA_node = node_list[0]
    fam_commonAncestorNode[MRCA_node].append(fam)

# Round-trip through JSON text to turn the defaultdict into plain literals.
MRCA_dict = ast.literal_eval(json.dumps(fam_commonAncestorNode))
# Count the families attached to each ancestor name.
final_dict = dict()
for k, v in MRCA_dict.items():
    famCount = len(v)
    final_dict[k] = famCount
# One-row frame (index 0) with one column per ancestor, columns ordered by
# the values of the last valid row.
df = pd.DataFrame(final_dict, index=[0])
sorted_df = df.sort_values(df.last_valid_index(), axis=1)
# Hard-coded input locations (absolute paths on the author's machine).
ref_model_file = '/home/acabbia/Documents/Muscle_Model/models/AGORA_universe.xml'
# Taxonomy table of the models, sorted by the 'organism' column.
models_taxonomy = pd.read_csv(
    '/home/acabbia/Documents/Muscle_Model/GSMM-distance/agora_taxonomy.tsv',
    sep='\t').sort_values(by='organism')
#%%
#####
# MAKE REFERENCE NCBI TAXONOMY TREE
####
from ete3 import NCBITaxa
ncbi = NCBITaxa()
# Refreshes the local taxonomy database on every run of this cell.
ncbi.update_taxonomy_database()
# Rows without an 'ncbiid' value are dropped before building the topology.
NCBI_ID = list(models_taxonomy['ncbiid'].dropna().values)
NCBI_tree = ncbi.get_topology(NCBI_ID)
# Ugly way to convert "phyloTree" obj into "Tree" obj for comparison with other trees
# (the tree is written to a Newick file and read back with Tree).
NCBI_tree.write(
    format=1,
    outfile="/home/acabbia/Documents/Muscle_Model/GSMM-distance/NCBI_tree.nw")
NCBI_tree = Tree(
    "/home/acabbia/Documents/Muscle_Model/GSMM-distance/NCBI_tree.nw",
    format=1)
#%%
#####
# MAKE GK TREE
####
graphList = []
def main(argv):
    """Summarize marker-gene alignments per taxid and write filtered reports.

    Reads the marker->taxid link file and the read-count table, aggregates
    per-taxid statistics, splits species of the same genus into primary and
    secondary hits, and writes three outputs: a table of all hits
    (``--alltab``), a table of filter-passing primary hits (``--primarytab``)
    and an indented taxonomy report of those hits (``--primarytax``).

    Args:
        argv: unused; options are parsed from ``sys.argv`` by argparse.

    Exits early (``sys.exit()``) after writing a one-line message to the
    output files when the read-count file has no data rows or when no taxon
    passes the filter (>= 2 observed markers and >= 4 reads).
    """
    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
            Summarize and filter alignments by taxid.
            Required arguments are --dbfile, --inherited_markers, --taxid_link, --readcounts, --primarytax, --primarytab, --alltab
            """),
        formatter_class = argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        "--dbfile",
        type=str,
        action="store",
        dest="dbfile",
        help= "NCBI taxonomy database file passed to ete3 NCBITaxa",
        required=True
    )
    parser.add_argument(
        "--inherited_markers",
        type=str,
        action="store",
        dest="inherited_markers",
        help= "Tab-separated file: taxid, specific markers, specific + inherited markers (comma-separated lists)",
        required=True
    )
    parser.add_argument(
        "--taxid_link",
        type=str,
        action="store",
        dest="taxid_link",
        help= "Tab-separated file linking each marker sequence name to its taxid",
        required=True
    )
    parser.add_argument(
        "--readcounts",
        type=str,
        action="store",
        dest="readcounts",
        help= "Read counts and mismatches file.",
        required=True
    )
    parser.add_argument(
        "--primarytax",
        type=str,
        action="store",
        dest="primarytax",
        help= "Taxonomy output of filtered hits",
        required=True
    )
    parser.add_argument(
        "--primarytab",
        type=str,
        action="store",
        dest="primarytab",
        help= "Table output of filtered hits.",
        required=True
    )
    parser.add_argument(
        "--alltab",
        type=str,
        action="store",
        dest="alltab",
        help= "Table output of all hits.",
        required=True
    )
    files = parser.parse_args()

    #initialize NCBI taxdb
    ncbi = NCBITaxa(files.dbfile)

    # Marker names embed the BUSCO id between dashes, e.g. "-123at456-".
    busco_regex = re.compile(r'-\d*at\d*-')
    table_header = "Name\tTaxid\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"

    def taxid_name(taxid):
        # First (only) scientific name returned for this taxid.
        return list(ncbi.get_taxid_translator([taxid]).values())[0]

    def table_row(taxid):
        # One line of the all-hits / primary-hits tables.
        cov = taxon_coverage[taxid]
        return (taxid_name(taxid) + '\t'
                + str(taxid) + '\t'
                + str(cov[0]) + '\t'
                + str(cov[1]) + '\t'
                + str(cov[3]) + '%\t'
                + str(cov[4]) + '%\t'
                + str(cov[5]) + '%\n')

    #create 2 dicts for ease of lookup
    #correspondence between taxid & marker gene name
    #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
    #seq_taxids = {seq: taxid, seq:taxid} Save every seq
    taxid_seqs = {}
    seq_taxids = {}
    with open(files.taxid_link) as linkfile:
        for line in linkfile:
            fields = line.strip('\n').split('\t')
            taxid = fields[1]
            if taxid not in taxid_seqs:
                taxid_seqs[taxid] = []
            seq = fields[0]
            taxid_seqs[taxid].append(seq)
            seq_taxids[seq] = taxid

    #save contents of read_counts_and_mismatches file as dict per observed taxid
    #save observed genuses
    #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage, pid, busco]]}
    taxid_counts = {}
    counter = 0
    genuses = {}
    above_species = []
    #genuses: {genus:[taxid, taxid, taxid]}
    with open(files.readcounts) as countfile:
        countfile.readline()  # skip the header line
        for line in countfile:
            counter += 1
            fields = line.strip('\n').split('\t')
            seq = fields[0]
            count = int(fields[1])
            correct_bases = int(fields[2])
            int(fields[3])  # incorrect_bases: still validated, value unused
            total_bases = int(fields[4])
            subjlen = int(fields[5])
            coverage = float(fields[6])
            pid = float(fields[7])
            taxid = seq_taxids[seq]
            if "Collapse" not in seq:
                busco = busco_regex.findall(seq)[0].strip('-')
            else:
                busco = "Collapsed"

            #determine genus
            lineage = ncbi.get_lineage(int(taxid))
            ranks = {value: key for (key, value) in ncbi.get_rank(lineage).items()}
            if 'genus' in ranks and 'Collapse' not in seq and "species" in ranks:
                #dont filter if it's at the genus level
                genus = ranks['genus']
                if genus not in genuses:
                    genuses[genus] = []
                if taxid not in genuses[genus]:
                    genuses[genus].append(taxid)
            elif "SSCollapse" not in seq:
                #don't add anything that's got SSCollapse in it
                above_species.append(taxid)

            #save info per sequence
            if taxid not in taxid_counts:
                taxid_counts[taxid] = []
            taxid_counts[taxid].append([seq, count, correct_bases, total_bases, subjlen, coverage, pid, busco])

    if counter == 0:
        message = "Empty read count file. Likely no aligned reads in sample."
        #still have to write stuff
        for path in (files.primarytax, files.alltab, files.primarytab):
            with open(path, 'w') as f:
                f.write(message + '\n')
        sys.exit()
    #done parsing read_counts_and_mismatches file

    #calculate stats for each observed taxid
    #taxon_coverage[taxon] = [observed_markers, readcounts, total_bases,
    #                         percentage_markers, marker_coverage, percent_id, buscos]
    taxon_coverage = {}
    seen_taxids = []
    for tax in taxid_counts:
        mc = len(taxid_counts[tax])
        counts = 0
        correct = 0
        total_bases = 0
        subj_len = 0
        buscos = []
        for entry in taxid_counts[tax]:
            busco = entry[-1]
            if len(busco) > 1:
                buscos.append(busco)
            counts += entry[1]
            correct += entry[2]
            total_bases += entry[3]
            subj_len += entry[4]
        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len ) * 100, 2)
        total_markers = len(taxid_seqs[tax])
        marker_percentage = round( mc / total_markers * 100, 2)
        if tax not in seen_taxids:
            seen_taxids.append(tax)
        taxon_coverage[tax] = [mc, counts, total_bases, marker_percentage, overall_coverage, percent_identity, buscos]

    #create tree structure for all observed taxids
    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    tree_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(tree_taxids, intermediate_nodes=True)
    # Only used for membership tests below, so a set is sufficient.
    full_taxid_lineage = {node.name for node in full_tree.traverse()}

    #full_seq_taxids: {taxid: [[specific buscos], specific count, specific + inherited count]}
    full_seq_taxids = {}
    with open(files.inherited_markers) as markerfile:
        for line in markerfile:
            fields = line.strip('\n').split('\t')
            taxid = fields[0]
            if taxid in full_taxid_lineage:
                specific_seqs = fields[1].split(',')
                buscos = []
                for seq in specific_seqs:
                    found = busco_regex.findall(seq)
                    if len(found) > 0:
                        busco = found[0].strip('-')
                        if busco not in buscos:
                            buscos.append(busco)
                specific_count = len(specific_seqs)
                sp_and_inherited_count = len(fields[2].split(','))
                full_seq_taxids[taxid] = [buscos, specific_count, sp_and_inherited_count]

    #determine primary and secondary hits
    #if MRCA is at the level of genus, consider whether one should be primary or secondary by looking at buscos
    primary = {}
    secondary = {}
    for g in genuses:
        if len(genuses[g]) > 1:
            #multiple species in same genus
            taxids = genuses[g]
            reads = [taxon_coverage[taxid][1] for taxid in taxids]
            bases = [taxon_coverage[taxid][2] for taxid in taxids]
            #if one has more reads and more bases than all others, it is primary, others are secondary
            maxreads = max(reads)
            maxbases = max(bases)
            ptaxids = []
            if (reads.count(maxreads) == 1 and bases.count(maxbases) == 1)\
                    and (reads.index(maxreads) == bases.index(maxbases)):
                #no ties, same ID
                maxtax = taxids[reads.index(maxreads)]
                primary[maxtax] = taxon_coverage[maxtax][0:5]
                ptaxids.append(maxtax)
            else:
                for t in taxids:
                    if taxon_coverage[t][1] == maxreads or taxon_coverage[t][2] == maxbases:
                        ptaxids.append(t)
                        primary[t] = taxon_coverage[t][0:5]

            unsorted_ataxids = [t for t in taxids if t not in ptaxids]
            ataxids = sorted(unsorted_ataxids, key = lambda x: taxon_coverage[x][1], reverse = True)
            for ataxid in ataxids:
                is_secondary = False
                for ptaxid in primary:
                    p_buscos = [b for b in full_seq_taxids[ptaxid][0]]
                    a_buscos = taxon_coverage[ataxid][-1]
                    a_remain = [b for b in a_buscos if b in p_buscos]
                    if len(a_remain) > 0:
                        a_above = []
                        for b in a_remain:
                            #it may not be a hit for the other one! check first
                            #check that the pid for this hit is lower
                            apid = [seq[6] for seq in taxid_counts[ataxid] if seq[7] == b]
                            ppid = [seq[6] for seq in taxid_counts[ptaxid] if seq[7] == b]
                            if len(ppid) > 0 and apid[0] >= ppid[0]:
                                a_above.append(b)
                            elif len(ppid) == 0:
                                a_above.append(b)
                        #if a_buscos is fewer than 5, all must be correct
                        if len(a_buscos) < 5:
                            if len(a_above) < len(a_buscos):
                                is_secondary = True
                        else:
                            if len(a_above) <= len(a_buscos)/2:
                                #change: alt hit has to be half or busco hits being above
                                is_secondary = True
                    else:
                        is_secondary = True
                # The assignment happens after the scan over `primary`:
                # adding a key while iterating the dict would raise.
                # `ptaxid` is therefore the last primary taxid examined.
                if is_secondary:
                    secondary[ataxid] = taxon_coverage[ataxid][0:5] + [ptaxid]
                else:
                    primary[ataxid] = taxon_coverage[ataxid][0:5]
        else:
            #primary
            taxid = genuses[g][0]
            primary[taxid] = taxon_coverage[taxid][0:5]

    #add anything else
    for t in above_species:
        primary[t] = taxon_coverage[t][0:5]

    #write full table
    marker_sorted = sorted(taxon_coverage.keys(), reverse = True, key = lambda x: taxon_coverage[x][3])
    with open(files.alltab, 'w') as dest:
        dest.write(table_header)
        for tax in marker_sorted:
            dest.write(table_row(tax))

    #write primary table, keeping only taxa that pass the filter
    #TODO: implement filters
    primary_sorted = sorted(primary.keys(), reverse = True, key = lambda x: primary[x][3])
    filter_passing_taxids = []
    with open(files.primarytab, 'w') as dest:
        dest.write(table_header)
        for tax in primary_sorted:
            mc = taxon_coverage[tax][0]
            counts = taxon_coverage[tax][1]
            #filter
            if int(mc) >= 2 and int(counts) >= 4:
                filter_passing_taxids.append(tax)
                dest.write(table_row(tax))

    #close if no filter passing taxids
    if len(filter_passing_taxids) == 0:
        message = "No taxa passing filter requirements."
        #still have to write stuff
        for path in (files.primarytab, files.primarytax):
            with open(path, 'w') as f:
                f.write(message + '\n')
        sys.exit()

    #create NCBI taxon tree of observed taxa + extend to cellular_org
    tree = ncbi.get_topology(filter_passing_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    primary_tree_taxids = [int(e) for e in filter_passing_taxids] + lineage
    primary_tree = ncbi.get_topology(primary_tree_taxids, intermediate_nodes=True)

    #find counts of seqs for internal nodes
    orphan_children = []
    for node in full_tree.traverse():
        if node.is_leaf() == False:
            if node.name not in taxid_counts:
                taxid_counts[node.name] = []
            for desc in node.iter_descendants():
                if desc.name in taxid_counts:
                    for seq in taxid_counts[desc.name]:
                        if seq not in taxid_counts[node.name]:
                            taxid_counts[node.name].append(seq)
        else:
            if node.name not in taxid_counts:
                orphan_children.append(node.name)

    #write the tree structure of filter passing hits to file
    currspaces = 0
    currparent = ''
    seen_parents = {}
    with open(files.primarytax, 'w') as dest:
        dest.write("Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n")
        for node in primary_tree.traverse("preorder"):
            if node.name not in orphan_children and node.name in full_seq_taxids:
                rank = list(ncbi.get_rank([node.name]).values())[0]
                name = taxid_name(node.name)
                # Indentation grows by 4 spaces per new parent and is
                # restored when a previously seen parent comes back.
                if node.is_root():
                    currspaces = 0
                else:
                    if currparent == '':
                        currparent = node.up.name
                        currspaces += 4
                    else:
                        if currparent != node.up.name:
                            currparent = node.up.name
                            if currparent in seen_parents:
                                currspaces = seen_parents[currparent]
                            else:
                                currspaces += 4
                                seen_parents[currparent] = currspaces
                if node.name in taxon_coverage:
                    pid = str(taxon_coverage[node.name][5]) + '%'
                else:
                    pid = "NA"
                #total_buscos
                buscos = len(taxid_counts[str(node.name)])
                seqs = sum([b[1] for b in taxid_counts[node.name]])
                total_buscos = full_seq_taxids[node.name][2]
                percent = round((buscos/total_buscos)*100,2)
                dest.write(str(buscos) + '\t'
                    + str(total_buscos) + "\t"
                    + str(percent) + '%\t'
                    + str(pid) + '\t'
                    + str(seqs) + '\t'
                    + rank + '\t'
                    + ' ' * currspaces + name + '\n')
from ete3 import NCBITaxa #The first time this will download the taxonomic NCBI database and save a parsed version #of it in `~/.etetoolkit/taxa.sqlite`.May take some minutes ncbi = NCBITaxa() print("ncbi.dbfile", ncbi.dbfile) with open(snakemake.input[0], 'r', encoding='utf8') as fh: genus_list = fh.read().strip().split('\n') genus_to_taxid = ncbi.get_name_translator(genus_list) tax_id_vals = genus_to_taxid.values() tree = ncbi.get_topology( [genus_id for subls in tax_id_vals for genus_id in subls], intermediate_nodes=True) # `get_ascii()` has a bug, prints the taxons before to genus without any separation between them, so a way to avoid that is using extra attribues, `dist` seems to be less invasive. Also, numbers from 'dist' are replaced with open(snakemake.output[0], mode='w', encoding='utf8') as fh: print(tree.get_ascii(attributes=["dist", "sci_name"]).replace('1.0,', '-'), file=fh)
# Read the taxon names: each line may hold several space-separated names,
# and underscores inside a name stand for spaces.
try:
    with open(args.taxons, 'r') as taxFile:
        listTaxa = [row.strip().split(" ") for row in taxFile.readlines()]
        listTaxa = list(set(itertools.chain.from_iterable(listTaxa)))
        listTaxa = [name.replace("_", " ") for name in listTaxa]
except FileNotFoundError:
    print("File does not exist")
    sys.exit(1)

# Translate names to taxids (each name maps to a list of ids) and flatten.
IdTaxList = ncbi.get_name_translator(listTaxa).values()
IdTaxList = list(itertools.chain.from_iterable(IdTaxList))

# Reverse lookup: taxid -> scientific name.
idTaxa2names = ncbi.get_taxid_translator(IdTaxList)

# Taxonomy tree spanning all resolved taxids.
tree = ncbi.get_topology(IdTaxList)

# Relabel every leaf with its scientific name instead of its taxid.
leaves = tree.get_leaves()
for leaf in leaves:
    leaf.name = idTaxa2names[int(leaf.name)]

# Dump the tree as newick, with spaces in names turned back into underscores.
with open(args.output, 'w') as treeFile:
    treeFile.write(tree.write(format=9).replace(" ", "_"))
# One organism name per line of the already-open input file.
list_name = [x.strip('\n') for x in file_name.readlines()]
# Ranks to report, from most specific to most general.
ref_levels = [
    'subspecies', 'species', 'genus', 'family', 'order', 'class', 'phylum',
    'superkingdom'
]
# Taxids (as strings) of every name the taxonomy database recognises.
myList = []
for linea in list_name:
    taxid_dirty_dir = (ncbi.get_name_translator([linea]))
    if len(taxid_dirty_dir) > 0:
        # list(...) keeps this working on Python 3, where dict.values()
        # is a view and cannot be indexed.
        nude = str(list(taxid_dirty_dir.values())[0][0])
        myList.insert(0, nude)

flag = 0
# With no recognised name there is no tree to build: fall through to
# "Unclassified" instead of failing inside get_topology.
if myList:
    t = ncbi.get_topology(myList, intermediate_nodes=True)
    linaje = ncbi.get_lineage(t.get_common_ancestor(myList).name)
    pairs = ncbi.get_rank(linaje)
    # Print the name of the common ancestor's lineage member at the most
    # specific reference rank that is present, then stop.
    for each_ref in ref_levels:
        if each_ref in pairs.values() and flag == 0:
            print(list(ncbi.get_taxid_translator([
                (list(pairs.keys())[list(pairs.values()).index(each_ref)])
            ]).values()))
            flag = 1
            break
if flag == 0:
    print("Unclassified")
# Restore the stdout object saved earlier in the script.
sys.stdout = orig_stdout
'steps_from_Eukaryota' + '\n') outputFile3 = open('ontology.tab', 'w') #Here I open the file that Matt script creates and loops in each line and get the taxids with open('SP_by_taxa.tab', 'r') as fo: for line in fo: line = line.rstrip() (uniprotid, taxids) = line.split('\t') one_taxid = taxids.split( ',') # divide the list of taxids to diff taxids 'strings' tax_dict[uniprotid] = one_taxid one_taxid_int = [] for i in range(len(one_taxid)): one_taxid_int.append(int( one_taxid[i])) #ete3 take a list of taxid integers #print(one_taxid) tree = ncbi.get_topology( one_taxid) #creates the tree of taxids for each uniprot id outputFile.write(uniprotid + '\t' + tree.write(format=3) + '\n') #writing tab file of uniprot/tree_string tree_dict[uniprotid] = tree one_taxid_str = [] for i in range(len(one_taxid_int)): one_taxid_str.append(str( one_taxid_int[i])) # returning to list of strings again #print(one_taxid) #get the first common ancestor of each uniprotid as taxid first_common_ancestor_taxid = tree.get_tree_root() #print(first_common_ancestor_taxid.name) if first_common_ancestor_taxid.name in ontology: uniprotid_set = ontology[first_common_ancestor_taxid.name] uniprotid_set.add(uniprotid)
#export PATH=~/anaconda_ete/bin:$PATH
from ete3 import NCBITaxa
ncbi = NCBITaxa()

####### BRAINCODE viruses taxonomy tree ########
# One virus name per line.
with open("/PHShome/tw786/localView/overview/Tree/BRAINCODE_viruses.txt") as fp_in:
    viruses1 = fp_in.readlines()
viruses1 = [x.strip() for x in viruses1]
# Each recognised name maps to a list of taxids; keep the first of each.
viruses_taxid = ncbi.get_name_translator(viruses1)
viruses_taxid = [x[0] for x in viruses_taxid.values()]
tree = ncbi.get_topology(viruses_taxid)
file_path = "/PHShome/tw786/localView/overview/Tree/BRAINCODE_viruses_tree.txt"
# write(... + '\n') emits the same bytes as the Python 2 `print >> fp, ...`
# statement and also runs on Python 3.
with open(file_path, 'w') as fp:
    fp.write(tree.get_ascii(attributes=['sci_name', 'rank']) + '\n')

####### GTEx viruses taxonomy tree ########
with open("/PHShome/tw786/localView/overview/Tree/GTEx_viruses.txt") as fp_in:
    viruses2 = fp_in.readlines()
viruses2 = [x.strip() for x in viruses2]
viruses_taxid = ncbi.get_name_translator(viruses2)
viruses_taxid = [x[0] for x in viruses_taxid.values()]
tree = ncbi.get_topology(viruses_taxid)
file_path = "/PHShome/tw786/localView/overview/Tree/GTEx_viruses_tree.txt"
with open(file_path, 'w') as fp:
    fp.write(tree.get_ascii(attributes=['sci_name', 'rank']) + '\n')

####### BRAINCODE + GTEx viruses taxonomy tree ########
# Union of both name lists, duplicates removed (order is not preserved).
viruses_merge = viruses1 + viruses2
viruses_merge = list(set(viruses_merge))
def main(argv):
    """Summarize marker-gene read counts per taxid and write two reports.

    Inputs and outputs are taken positionally from ``sys.argv`` (the ``argv``
    parameter itself is not used):

    * ``sys.argv[1]`` - tab-separated link file: marker sequence, taxid
    * ``sys.argv[2]`` - NCBI taxonomy database file for ete3 ``NCBITaxa``
    * ``sys.argv[3]`` - tab-separated marker file: taxid, specific markers,
      ..., specific + inherited markers (comma-separated lists)
    * ``sys.argv[4]`` - read-count table with one header line
    * ``sys.argv[5]`` - output: indented taxonomy report
    * ``sys.argv[6]`` - output: per-taxid table

    When the read-count table has no data rows, a one-line message is printed
    and written to both outputs, and the process exits.
    """
    #initialize NCBI taxdb
    ncbi = NCBITaxa(sys.argv[2])

    def taxid_name(taxid):
        # First (only) scientific name returned for this taxid.
        return list(ncbi.get_taxid_translator([taxid]).values())[0]

    #create 2 dicts for ease of lookup
    #taxid_seqs: {taxid: [seq1, seq2]}. Save every seen taxid and which seqs
    #seq_taxids = {seq: taxid, seq:taxid} Save every seq
    taxid_seqs = {}
    seq_taxids = {}
    with open(sys.argv[1]) as linkfile:
        for line in linkfile:
            fields = line.strip('\n').split('\t')
            taxid = fields[1]
            if taxid not in taxid_seqs:
                taxid_seqs[taxid] = []
            seq = fields[0]
            taxid_seqs[taxid].append(seq)
            seq_taxids[seq] = taxid

    #iterate over idxstats file and save counts
    #seq_counts[seq] = [readcount, correct_bases, total_bases, seqlen, coverage]
    seq_counts = {}
    seen_taxids = []
    seen_taxid_set = set()
    counter = 0
    with open(sys.argv[4]) as countfile:
        countfile.readline()  # skip the header line
        for line in countfile:
            counter += 1
            fields = line.strip('\n').split('\t')
            seq = fields[0]
            count = int(fields[1])
            correct_bases = int(fields[2])
            int(fields[3])  # incorrect_bases: still validated, value unused
            total_bases = int(fields[4])
            subjlen = int(fields[5])
            coverage = float(fields[6])
            seq_counts[seq] = [
                count, correct_bases, total_bases, subjlen, coverage
            ]
            # Compare as int: the list holds ints, so testing the string
            # taxid against it never matched and appended duplicates.
            taxid = int(seq_taxids[seq])
            if taxid not in seen_taxid_set:
                seen_taxid_set.add(taxid)
                seen_taxids.append(taxid)

    if counter == 0:
        message = "Empty read count file. Likely no aligned reads in sample."
        print(message)
        #still have to write stuff
        for path in (sys.argv[5], sys.argv[6]):
            with open(path, 'w') as f:
                f.write(message + '\n')
        sys.exit()
    #done parsing idxstats file

    #create NCBI taxon tree of observed taxa + extend to cellular_org
    tree = ncbi.get_topology(seen_taxids)
    tree_root = tree.get_tree_root().name
    lineage = ncbi.get_lineage(tree_root)
    full_taxids = seen_taxids + lineage
    full_tree = ncbi.get_topology(full_taxids, intermediate_nodes=True)

    #full_seq_taxids: {taxid: [[specific buscos], [specific + inherited buscos]]}
    full_seq_taxids = {}
    with open(sys.argv[3]) as markerfile:
        for line in markerfile:
            fields = line.split('\t')
            full_seq_taxids[fields[0]] = [
                fields[1].split(','),
                fields[-1].strip('\n').split(',')
            ]

    #determine seq counts
    #taxid_counts: {taxid: [[marker, readcount, correct_bases, total_bases, seqlen, coverage]]}
    taxid_counts = {}
    for seq in seq_counts:
        taxid = seq_taxids[seq]
        if taxid not in taxid_counts:
            taxid_counts[taxid] = []
        taxid_counts[taxid].append([
            seq,
            int(seq_counts[seq][0]),
            int(seq_counts[seq][1]), seq_counts[seq][2], seq_counts[seq][3],
            seq_counts[seq][4]
        ])

    #taxon_coverage[taxon] = [observed_markers, readcounts, total_bases,
    #                         percentage_markers, marker_coverage, percent_id]
    taxon_coverage = {}
    for tax in taxid_counts:
        mc = len(taxid_counts[tax])
        counts = 0
        correct = 0
        total_bases = 0
        subj_len = 0
        for entry in taxid_counts[tax]:
            counts += entry[1]
            correct += entry[2]
            total_bases += entry[3]
            subj_len += entry[4]
        percent_identity = round((correct / total_bases) * 100, 2)
        overall_coverage = round((total_bases / subj_len) * 100, 2)
        total_markers = len(taxid_seqs[tax])
        marker_percentage = round(mc / total_markers * 100, 2)
        taxon_coverage[tax] = [
            mc, counts, total_bases, marker_percentage, overall_coverage,
            percent_identity
        ]

    #write the per-taxid table, highest marker percentage first
    marker_sorted = sorted(taxon_coverage.keys(),
                           reverse=True,
                           key=lambda x: taxon_coverage[x][3])
    # `with` makes sure the table is flushed and closed before the second
    # report is opened (the handle used to be rebound without close()).
    with open(sys.argv[6], 'w') as dest:
        dest.write(
            "Name\tObserved_markers\tRead_counts\tPercent_observed_markers\tTotal_marker_coverage\tPercent_identity\n"
        )
        for tax in marker_sorted:
            cov = taxon_coverage[tax]
            dest.write(taxid_name(tax) + '\t' + str(cov[0]) + '\t' +
                       str(cov[1]) + '\t' + str(cov[3]) + '%\t' +
                       str(cov[4]) + '%\t' + str(cov[5]) + '%\n')

    #find counts of seqs for internal nodes
    orphan_children = []
    for node in full_tree.traverse():
        if node.is_leaf() == False:
            if node.name not in taxid_counts:
                taxid_counts[node.name] = []
            for desc in node.iter_descendants():
                if desc.name in taxid_counts:
                    for seq in taxid_counts[desc.name]:
                        if seq not in taxid_counts[node.name]:
                            taxid_counts[node.name].append(seq)
        else:
            if node.name not in taxid_counts:
                orphan_children.append(node.name)

    #print the tree
    currspaces = 0
    currparent = ''
    seen_parents = {}
    with open(sys.argv[5], 'w') as dest:
        dest.write(
            "Markers_Obs\tTotal_Markers\tPercent_Makers_Obs\tPercent_ID\tMarker_read_count\tRank\tName\n"
        )
        for node in full_tree.traverse("preorder"):
            if node.name not in orphan_children:
                rank = list(ncbi.get_rank([node.name]).values())[0]
                name = taxid_name(node.name)
                # Indentation grows by 4 spaces per new parent and is
                # restored when a previously seen parent comes back.
                if node.is_root():
                    currspaces = 0
                else:
                    if currparent == '':
                        currparent = node.up.name
                        currspaces += 4
                    else:
                        if currparent != node.up.name:
                            currparent = node.up.name
                            if currparent in seen_parents:
                                currspaces = seen_parents[currparent]
                            else:
                                currspaces += 4
                                seen_parents[currparent] = currspaces
                if node.name in taxon_coverage:
                    pid = str(taxon_coverage[node.name][5]) + '%'
                else:
                    pid = "NA"
                #total_buscos
                buscos = len(taxid_counts[node.name])
                seqs = sum([b[1] for b in taxid_counts[node.name]])
                total_buscos = len(full_seq_taxids[node.name][1])
                percent = round((buscos / total_buscos) * 100, 2)
                dest.write(
                    str(buscos) + '\t' + str(total_buscos) + "\t" +
                    str(percent) + '%\t' + str(pid) + '\t' + str(seqs) +
                    '\t' + rank + '\t' + ' ' * currspaces + name + '\n')
# '1166016':'1905730'})
# As described in the paper, PhyloT was used to generate the tree. PhyloT is
# not free, so a free alternative based on ETE3 is offered here.
raw_id = known.columns.values.tolist()
ncbi = NCBITaxa()
# The Newick file obtained from PhyloT can also be loaded directly, as
# described in the paper:
# import ete3
# tree=ete3.Tree("tree.txt",format=8)
# print(tree)
tree = ncbi.get_topology(raw_id)
print(tree.get_ascii(attributes=["taxid"]))

# Leaf names in level-order traversal.
num = 1
order = [
    visited.name
    for visited in tree.traverse(strategy='levelorder')
    if visited.is_leaf()
]

# Leaf names in post-order traversal.
num = 1
postorder = [
    visited.name
    for visited in tree.traverse(strategy='postorder')
    if visited.is_leaf()
]

# Reorder the columns of `known` to follow the level-order leaf sequence.
known_Xl = known[order]
# Best-effort refresh of the local taxonomy database: a failure here is
# tolerated and the existing database is used. Catching Exception (not a
# bare except) lets KeyboardInterrupt / SystemExit propagate.
try:
    ncbi.update_taxonomy_database()
except Exception:
    pass

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

# One species per line; underscores stand for spaces. Blank lines are
# skipped so a trailing newline does not become an empty species name.
with open(options.input_species_filename) as f:
    species_name = [_.strip().replace('_', ' ') for _ in f.readlines()]
species_name = [_ for _ in species_name if _]

name2taxid = ncbi.get_name_translator(species_name)
# Report every unknown name at once instead of failing on the first one
# with a bare KeyError.
missing = [_ for _ in species_name if _ not in name2taxid]
if missing:
    raise KeyError('Species not found in the NCBI taxonomy: ' + ', '.join(missing))
taxid = [name2taxid[_][0] for _ in species_name]
tree = ncbi.get_topology(taxid)

# Map taxid (as string, matching the leaf names) back to the species name;
# in "treebest" mode the name loses its spaces and gets a trailing "*".
if options.treebest == "yes":
    inv_map = {str(v[0]): k.replace(" ", "") + "*" for k, v in name2taxid.items()}
else:
    inv_map = {str(v[0]): k for k, v in name2taxid.items()}

for leaf in tree:
    leaf.name = inv_map[leaf.name]

newickTree = tree.write(format=int(options.format))
# In "treebest" mode the terminating ';' is replaced by "root;".
if options.treebest == "yes":
    newickTree = newickTree.rstrip(';')
    newickTree = newickTree + "root;"
mode = args.mode newick = args.newick if newick: t = PhyloTree(args.newick) species2taxid = dict([ line.split()[0], line.strip().split()[1] ] for line in open(infile)) taxids = set(species2taxid.values()) else: ncbi = NCBITaxa() taxids = set([ line.strip() for line in open(infile) ]) if args.taxoncolors: taxon2color = dict([int(line.split()[0]), line.split()[1]] for line in open(args.taxoncolors)) tNCBI = ncbi.get_topology(taxids, intermediate_nodes=True) tNCBI = tNCBI.search_nodes(name="2759")[0] ncbi.annotate_tree(tNCBI, taxid_attr="name") tax2node = dict([node.taxid, node] for node in tNCBI.traverse()) if args.no_intermediate_nodes: for node in tNCBI.get_descendants(): if len(node.children) == 1: node.delete() if len(tNCBI.children) == 1: tNCBI = tNCBI.children[0] tax2node = {} for node in tNCBI.traverse(): tax2node[node.taxid] = node if args.taxoncolors:
# First tab-separated column of every data row of the reference genome list.
with open(
        '/home/ubuntu/MATLAB/GutMicrobiota/input/reference_genomes.txt') as f:
    # Skip the header line; the default keeps an empty file from raising
    # StopIteration. The `with` block closes the file, so no explicit
    # f.close() is needed.
    next(f, None)
    RefSeqSpecies = [line.split('\t')[0] for line in f]

ncbi = NCBITaxa()
# Translate the de-duplicated union of all genus and species names; each
# recognised name maps to a list of taxids.
name2taxid = ncbi.get_name_translator(
    list(set(ZhangZhaoGenera + ForslundHildebrandGenera + RefSeqSpecies)))
# Flatten the taxid lists and build the topology with intermediate nodes.
tree = ncbi.get_topology(list(
    itertools.chain.from_iterable(list(name2taxid.values()))),
                         intermediate_nodes=True)
#print(tree.get_ascii(attributes=['sci_name']), file=open('/home/ubuntu/taxonomy.txt','w'))
#print(tree.name)
# fh = open('/home/ubuntu/MATLAB/GutMicrobiota/output/writeETEFiles/closestSpecies.txt','w')
# for genus in ZhangZhaoGenera+ForslundHildebrandGenera:
#     print(genus)
#     minDist = -1
#     minDistSpecies = ''
#     for species in RefSeqSpecies:
#         genusNode = tree.search_nodes(name=str(name2taxid[genus][0]))[0]
#         speciesNode = tree.search_nodes(name=str(name2taxid[species][0]))[0]
#         dist = tree.get_distance(speciesNode, genusNode)
#         if minDist == -1:
#             minDist = dist
def get_rank_summary_statistics(rank='phylum'):
    '''
    Build the NCBI taxonomy topology of all taxa in eggnog.NOG_members_v451,
    limited to the given rank, and store summary tables in the eggnog schema.

    Tables written:
      - eggnog.taxid2label_<rank>: taxon id, scientific name and rank of
        every node of the topology
      - eggnog.leaf2n_genomes_<rank>: for each leaf of the (partly collapsed)
        topology, the number of taxa whose lineage contains that leaf
      - eggnog.phylogeny: the newick string of the topology, keyed by rank

    The MySQL password is read from the SQLPSW environment variable.

    :param rank: rank limit passed to NCBITaxa.get_topology; it is also
        embedded in table names, so it must be alphanumeric/underscore only
    :return: None
    :raises ValueError: if rank is not usable as a table-name suffix
    '''

    import MySQLdb
    import os
    from ete3 import NCBITaxa, Tree, TextFace, TreeStyle, StackedBarFace

    # `rank` becomes part of table names, which cannot be bound as query
    # parameters, so restrict it to identifier characters.
    if not rank or not rank.replace('_', '').isalnum():
        raise ValueError('rank must be alphanumeric, got %r' % (rank,))

    ncbi = NCBITaxa()
    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="eggnog")  # name of the data base
    try:
        cursor = conn.cursor()

        sql = 'create table if not exists eggnog.phylogeny (rank varchar(400), phylogeny TEXT)'
        cursor.execute(sql, )
        conn.commit()
        sql2 = 'CREATE table if not exists eggnog.leaf2n_genomes_%s(taxon_id INT, n_genomes INT)' % rank
        cursor.execute(sql2, )
        conn.commit()

        sql_taxid_list = 'select distinct taxon_id from eggnog.NOG_members_v451;'
        cursor.execute(sql_taxid_list, )
        taxid_list = [i[0] for i in cursor.fetchall()]

        tree = ncbi.get_topology(taxid_list, rank_limit=rank)
        taxon_id_list = [int(i.name) for i in tree.traverse("postorder")]
        taxon_id2scientific_name = ncbi.get_taxid_translator(taxon_id_list)

        sql = 'CREATE table if not exists eggnog.taxid2label_%s(taxon_id INT, scientific_name TEXT, rank TEXT)' % (
            rank)
        cursor.execute(sql, )

        # Rank of every named taxon; '-' when the database reports none.
        taxon_id2rank = {}
        for taxon in taxon_id2scientific_name:
            ranks = ncbi.get_rank([taxon])
            taxon_id2rank[taxon] = ranks[max(ranks.keys())] if ranks else '-'

        # Values are bound as parameters so that names containing quotes do
        # not break (or alter) the statement.
        sql = 'insert into eggnog.taxid2label_%s values(%%s, %%s, %%s)' % rank
        for taxon in taxon_id2scientific_name:
            cursor.execute(sql, (taxon, taxon_id2scientific_name[taxon],
                                 taxon_id2rank[taxon]))
        conn.commit()

        collapse = [
            'Opisthokonta', 'Alveolata', 'Amoebozoa', 'Stramenopiles',
            'Viridiplantae', 'Rhodophyta', 'Trypanosomatidae', 'Viruses',
            'unclassified Bacteria', 'Leptospiraceae',
            'unclassified Gammaproteobacteria',
            'unclassified Alphaproteobacteria',
            'unclassified Epsilonproteobacteria',
            'unclassified Deltaproteobacteria',
            'unclassified Cyanobacteria (miscellaneous)',
            'unclassified Firmicutes sensu stricto',
            'unclassified Actinobacteria (class) (miscellaneous)',
            'unclassified Tissierellia', 'Dehalogenimonas'
        ]

        # Collapse the listed clades: detach all children of any node whose
        # scientific name is in `collapse`, turning that node into a leaf.
        for node in tree.traverse("postorder"):
            name = taxon_id2scientific_name[int(node.name)]
            to_detach = []
            if name in collapse:
                to_detach.extend(node.children)
                print('ok-------------------', node.name)
            for n in to_detach:
                n.detach()

        leaves_list = [i.name for i in tree.iter_leaves()]

        # Lineage of every taxon, computed once instead of once per leaf.
        lineages = [set(ncbi.get_lineage(taxon)) for taxon in taxid_list]

        leaf_taxon2n_species = {}
        for leaf_taxon in leaves_list:
            print('leaf', leaf_taxon)
            leaf_id = int(leaf_taxon)
            leaf_taxon2n_species[leaf_taxon] = sum(
                1 for lineage in lineages if leaf_id in lineage)

        sql = 'insert into eggnog.leaf2n_genomes_%s values(%%s, %%s)' % rank
        for leaf_taxon in leaf_taxon2n_species:
            cursor.execute(sql, (leaf_taxon, leaf_taxon2n_species[leaf_taxon]))
        conn.commit()

        sql = 'insert into eggnog.phylogeny values(%s, %s)'
        cursor.execute(sql, (rank, tree.write(format=1)))
        conn.commit()
    finally:
        conn.close()
import random from ete3 import Tree, NodeStyle, TreeStyle, NCBITaxa, faces ncbi = NCBITaxa() my_tree = ncbi.get_topology([54263, 8324, 8323, 8327, 8325, 57571, 323754]) ts = TreeStyle() ts.show_leaf_name = True for n in my_tree.traverse(): nstyle = NodeStyle() nstyle["fgcolor"] = "yellow" nstyle["size"] = 10 n.set_style(nstyle) my_tree.img_style["size"] = 20 my_tree.img_style["fgcolor"] = "green" code_name = { "54263": "Ichthyosaura alpestris", "8324": "Lissotriton vulgaris", "8323": "Triturus cristatus", "8327": "Triturus dobrogicus", "8325": "Triturus karelinii ", "57571": "Salamandra salamandra ", "323754": "Lissotriton montandoni" } def mylayout(node):
def run(args):
    """Query the NCBI taxonomy and print the report selected by ``args``.

    The search terms in ``args.search`` may be numeric taxids or taxon names.
    Names are translated to taxids (optionally with fuzzy matching), and one
    of three reports is produced:

    * ``args.tree``: a tree is built and passed to ``dump``. With exactly one
      taxid the descendants tree of that taxon is used; otherwise the
      topology connecting all taxids is used.
    * ``args.descendants``: one tab-separated line per taxid listing its
      descendant taxids and their names.
    * ``args.info``: one tab-separated line per translated taxid with its
      rank and lineage.

    If none of the three is set, ``args.tree`` is switched on (and ``args``
    is mutated accordingly).

    Args:
        args: object exposing the attributes ``tree``, ``info``,
            ``descendants``, ``search``, ``fuzzy``, ``collapse_subspecies``,
            ``rank_limit`` and ``full_lineage`` (presumably an
            ``argparse.Namespace`` -- confirm against the caller).

    Raises:
        SystemExit: with status -1 when ``args.search`` is empty.
    """
    # add lineage profiles/stats
    from ete3 import NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)

    # A dict is used as a duplicate-free collection of taxids (values unused).
    all_taxids = {}
    all_names = set()
    for term in args.search:
        try:
            all_taxids[int(term)] = None
        except ValueError:
            # Not a number: treat the term as a taxon name.
            all_names.add(term.strip())

    # translate names
    # get_name_translator maps each name to a *list* of taxids, so the lists
    # must be flattened; a list cannot be used as a dict key.
    name2tax = ncbi.get_name_translator(all_names)
    for taxids in name2tax.values():
        for taxid in taxids:
            all_taxids[taxid] = None

    not_found_names = all_names - set(name2tax)

    # Bookkeeping for fuzzy matches; these were previously written to without
    # ever being defined, which raised NameError on the first fuzzy hit.
    name2realname = {}
    name2score = {}
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        fuzzy_resolved = set()
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim
                fuzzy_resolved.add(name)
        # Names recovered by fuzzy matching are no longer "not found".
        not_found_names = not_found_names - fuzzy_resolved

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" % ','.join(sorted(not_found_names)))

    if args.tree:
        if len(all_taxids) == 1:
            # dict views are not iterators; wrap in iter() before next().
            target_taxid = next(iter(all_taxids))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(target_taxid,
                                         collapse_subspecies=args.collapse_subspecies,
                                         rank_limit=args.rank_limit,
                                         return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        # Node names hold taxids at this point; resolve them all up front
        # because the loop below overwrites ``n.name``.
        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t, features=["taxid", "name", "rank", "bgcolor", "sci_name",
                          "collapse_subspecies", "named_lineage"])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "descendant_taxids", "descendant_names"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(taxid,
                                                   collapse_subspecies=args.collapse_subspecies,
                                                   rank_limit=args.rank_limit)
            print('\t'.join([str(taxid),
                             # Falls back to the (integer) taxid when no name
                             # is known, hence the str() for join().
                             str(translator.get(taxid, taxid)),
                             ranks.get(taxid, ''),
                             '|'.join(map(str, descendants)),
                             '|'.join(map(str, ncbi.translate_to_names(descendants)))]))
    elif args.info:
        print('# ' + '\t'.join(["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in translator.items():
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([str(taxid), name, ranks.get(taxid, ''),
                             named_lineage, lineage_string]))