def createSubgroupsOfGenes(tree_path, pathForSubgroups):
    """Pickle the leaf-name sets found under the nodes of a tree.

    The tree at ``tree_path`` is loaded with ``Tree`` and walked in postorder.
    For every node the set of names of the leaves below it is built; sets
    holding exactly one distinct name are skipped. The remaining sets are
    written, as a list in traversal order, to ``pathForSubgroups`` with
    ``pickle``.

    Args:
        tree_path: value handed to the ``Tree`` constructor.
        pathForSubgroups: path of the pickle file to (over)write.

    Returns:
        None
    """
    phylogeny = Tree(tree_path)
    leaves_by_node = phylogeny.get_cached_content()

    # One set of leaf names per node, produced lazily in postorder.
    name_sets = (
        {leaf.name for leaf in leaves_by_node[node]}
        for node in phylogeny.traverse("postorder")
    )
    subgroups = [names for names in name_sets if len(names) != 1]

    with open(pathForSubgroups, 'wb') as handle:
        pickle.dump(subgroups, handle)
# open tree file with open(args.tree_file, "r") as f: # each line is a bootstrap / mcmc tree for line in f: # update total tree count total_tree_count += 1 # read in tree t = Tree(line) # cache tree ## keys = node objects, values = sets of leaf names cache = t.get_cached_content(store_attr="name") # store all leave names in set all_leaves = set([leaf.name for leaf in t.iter_leaves()]) # traverse nodes in bootstrap tree for node in t.traverse("postorder"): # skip if leaf node if node.is_leaf(): continue # define 3 sets of taxa that emerge from node ## taxa associated with branch one: children_one = cache.get(node.children[0]) ## taxa associated with branch two: children_two = cache.get(node.children[1])
class UniFrac(object):
    """
    Hierarchical clustering of a data matrix plus UniFrac distances between
    groups of samples on the resulting tree.

    note that the whole association between metadata and leave nodes works by .loc:
    the nodes are named according to the dataframe index and we look up a nodes
    metadata with df_metadata.loc[node.name]
    """

    def __init__(self, datamatrix, df_metadata):
        """
        datamatrix: observations handed to `linkage` in build_tree()
        df_metadata: dataframe whose (unique) index names the tree leaves
        """
        super(UniFrac, self).__init__()

        # make sure that the dataframe index is unique
        assert len(df_metadata) == len(
            set(df_metadata.index)
        ), 'row-index is not unique, but we need uniqueness to associate the metadata with the leaves in the tree'

        self.datamatrix = datamatrix
        self.df_metadata = df_metadata
        self.tree = None
        self._linkage = None  # just kept to do the cut_trees call
        self.cluster_roots = None  # just kept to do the cut_trees call
        self.nodes2leaves = None  # for caching leaf lookups, however this return a set!!

    def _update_leave_metadata(self):
        "puts the metadata in self.metadata as features into the trees leaves"
        assert self.tree

        # to speed things up, query the dataframe only once
        leaves = self.tree.get_leaves()
        leavenames = [leave.name for leave in leaves]
        # .loc with a list returns the rows in the same order as leavenames
        meta = self.df_metadata.loc[leavenames].values
        featurenames = self.df_metadata.columns.values

        for leaf, row in zip(leaves, meta):
            # TODO not sure if this overwrites previous features (thats what i
            # want) or just adds additional features!
            leaf.add_features(**dict(zip(featurenames, row)))

    def build_tree(self, method, metric):
        """
        constructs the hierarchical clustering tree, but no clustering
        (corresponding to some tree pruning) in here

        method, metric: passed through to `linkage`
        """
        self._linkage = linkage(self.datamatrix, method=method, metric=metric)

        # turn it into a ete tree
        leave_labels = self.df_metadata.index.values
        newick_tree = linkage_to_newick(self._linkage, labels=leave_labels)
        self.tree = Tree(newick_tree)

        # makes it easy to lookup leaves of a node
        self.nodes2leaves = self.tree.get_cached_content()

        # populate the leaves with metadata
        self._update_leave_metadata()

    def cluster(self, n_clusters):
        """
        prunes the hierarchical clustering tree to get clusters of data
        this clustering is also added to the metadata

        also adds the self.cluster_roots (caching it, we need it in unifrac calls)
        """
        assert self.tree
        clustering_prune = cut_tree(self._linkage, n_clusters)
        self.df_metadata['clustering'] = clustering_prune
        self._update_leave_metadata()

        self.cluster_roots = find_cluster_roots(self.tree)
        for i, cluster_root in enumerate(self.cluster_roots):
            cluster_root.add_features(
                **{
                    'is_cluster_root': i,
                    'n_datapoints': len(self.nodes2leaves[cluster_root])
                })

    def unifrac_distance(self, group1, group2, randomization=None):
        """
        calculates the uniFrac distance of the two sample-groups

        group1: list of nodenames (i.e. indices of the metadata)
        group2: ---"---
        randomization: (int) how many times to compute the 'randomized'
            uniFrac distance to get a pvalue

        returns the distance alone when no randomization is requested,
        otherwise the tuple (distance, array of randomized distances, pvalue)
        where the pvalue is the fraction of randomized distances that are
        strictly larger than the observed one
        """
        assert 'clustering' in self.df_metadata.columns and self.cluster_roots, "run cluster() first"

        # the cached leaf set of the root is much cheaper than tree.get_leaves()
        the_Root = self.tree.get_tree_root()
        all_leaves = self.nodes2leaves[the_Root]

        # sets give O(1) membership tests instead of scanning lists
        leaf_names = {leaf.name for leaf in all_leaves}
        group1_names = set(group1)
        group2_names = set(group2)

        # make sure all group elements are in the tree
        assert group1_names <= leaf_names
        assert group2_names <= leaf_names

        group1_nodes = {leaf for leaf in all_leaves if leaf.name in group1_names}
        group2_nodes = {leaf for leaf in all_leaves if leaf.name in group2_names}

        the_distance = self._unifrac_dist(group1_nodes, group2_nodes)

        if not (randomization and randomization > 0):
            return the_distance

        G1 = len(group1_nodes)
        # union, but turned into a list so it can be shuffled and partitioned
        all_nodes = list(group1_nodes | group2_nodes)

        randomized_distances = []
        for _ in range(randomization):
            shuffle(all_nodes)  # inplace shuffle
            randomized_distances.append(
                self._unifrac_dist(set(all_nodes[:G1]), set(all_nodes[G1:])))
        randomized_distances = np.array(randomized_distances)

        # empirical pvalue
        p2 = np.sum(randomized_distances > the_distance) / len(randomized_distances)
        return the_distance, randomized_distances, p2

    def _unifrac_dist(self, group1_nodes, group2_nodes):
        """
        given two collections of leaf nodes, calculate the unifrac distance

        every cluster root contributes its `distance2root` weighted by the
        difference (numerator) and the sum (denominator) of the fractions of
        both groups that fall into that cluster

        raises ZeroDivisionError if one of the groups is empty
        """
        At, Bt = len(group1_nodes), len(group2_nodes)

        summed_nom = 0
        summed_denom = 0
        for current_cluster_root in self.cluster_roots:
            # all the datapoints in the cluster
            leafs = self.nodes2leaves[current_cluster_root]
            Ai = sum(1 for leaf in leafs if leaf in group1_nodes)
            Bi = sum(1 for leaf in leafs if leaf in group2_nodes)

            distance2root = current_cluster_root.distance2root  # cached already
            summed_nom += distance2root * np.abs(Ai / At - Bi / Bt)
            summed_denom += distance2root * np.abs(Ai / At + Bi / Bt)

        return summed_nom / summed_denom

    def visualize(self, group1=None, group2=None):
        """
        shows the tree with leaves coloured by cluster / treatment

        group1, group2: optional collections of leaf names; if at least one
            is given, every cluster root is annotated with the number of its
            leaves that belong to each group
        """
        import matplotlib
        import matplotlib.pyplot as plt

        # annotate the cluster roots with their group counts
        if group1 or group2:
            # either group may be missing; treat it as empty instead of crashing
            group1_names = set(group1 or ())
            group2_names = set(group2 or ())
            for cluster_root in self.cluster_roots:
                # count downstream conditions in the leafs
                datapoints_in_cluster = self.nodes2leaves[cluster_root]
                n_group1 = sum(1 for leaf in datapoints_in_cluster
                               if leaf.name in group1_names)
                n_group2 = sum(1 for leaf in datapoints_in_cluster
                               if leaf.name in group2_names)
                cluster_root.add_face(
                    TextFace(f"Group1: {n_group1}// Group2:{n_group2}"),
                    column=0,
                    position="branch-right")

        # the colormaps do not depend on the node, so build them only once
        cmap_cluster = plt.cm.tab10(np.linspace(0, 1, len(self.cluster_roots)))
        cmap_treated = plt.cm.viridis(np.linspace(0, 1, 2))

        def _custom_layout(node):
            if node.is_leaf():
                c_cluster = matplotlib.colors.rgb2hex(
                    cmap_cluster[node.clustering, :])
                c_treat = matplotlib.colors.rgb2hex(
                    cmap_treated[node.treated, :])
                node.img_style["fgcolor"] = c_treat
                node.img_style["bgcolor"] = c_cluster

            if 'is_cluster_root' in node.features:
                c_cluster = matplotlib.colors.rgb2hex(
                    cmap_cluster[node.is_cluster_root, :])
                node.img_style["bgcolor"] = c_cluster
                node.img_style["draw_descendants"] = False
                node.add_face(TextFace(f"#data:{node.n_datapoints}"),
                              column=0,
                              position="branch-right")

        ts = TreeStyle()
        ts.mode = "r"
        ts.show_leaf_name = False
        ts.arc_start = -180  # 0 degrees = 3 o'clock
        ts.arc_span = 270
        ts.layout_fn = _custom_layout
        self.tree.show(tree_style=ts)
default='EN', help='language chosen. FR for french, EN (default) for english', choices=['EN', 'FR']) args = parser.parse_args() sys.stdout.write("\nLoading tree... \r") sys.stdout.flush() t = Tree(args.filename) #read the input tree. nbsp = len(t) ## get nb of tips sys.stdout.write("Loading tree... DONE [the tree has %d tips] \n" % nbsp) sys.stdout.flush() ## sys.stdout.write("Storing tree nodes for faster lookup... \r") sys.stdout.flush() node2leaves = t.get_cached_content() sys.stdout.write("Storing tree nodes for faster lookup... DONE\n") sys.stdout.flush() t.x = 6.0 t.y = 9.660254 - 10.0 t.alpha = 30.0 t.ray = 30.0 t.zoomview = np.ceil(np.log2(30 / t.ray)) maxZoomView = 0 ## ##FUNCTIONS #getattr(t,n) def rad(deg):
#incomplete line print("Incomplete processing of ancestor lineages!") print(ancestors_d) try: ancestor = t.get_common_ancestor(ancestors_d[filename]) #print(ancestor) t.set_outgroup(ancestor) except KeyError: print("Root not selected!") print(t.get_tree_root()) quit() t.ladderize(direction=1) #select scale 0-1.0 or 0-100 for support values supportscache = t.get_cached_content(store_attr="support") supportslist = [x.support for x in supportscache] if max(supportslist) == 1: minsupport = 0.85 else: minsupport = 85 find_supported(t, support=minsupport) #find non-terminal nodes with high support ################## ### MAIN ### ################## #create a dictionary of taxon data and a list of all localities locdata = {} localization = set()
def get_species_tree(biodb):
    """Build the reference phylogeny of ``biodb`` and a species-level version of it.

    The stored reference tree is loaded, rooted at its midpoint outgroup and
    its leaves are renamed from taxon ids to species ids. Unnamed nodes get
    the single name shared by their descendants, or a generated ``<n>bb``
    label. Clades whose leaves all carry the same name are then collapsed
    into one leaf to obtain the species tree, whose leaves are finally
    labelled with genome counts (``c`` = complete, ``d`` = draft) and, for
    drafts, the completeness range.

    Args:
        biodb: name of the biodatabase; it is interpolated into the SQL
            statements and passed to ``manipulate_biosqldb.load_db``.

    Returns:
        tuple: ``(complete_tree, species_tree)``.
    """
    from ete3 import Tree

    server, db = manipulate_biosqldb.load_db(biodb)

    species2n_complete_genomes, species2n_draft_genomes, species2completeness = get_species_data(server, biodb)

    # NOTE(review): `biodb` is interpolated straight into SQL (here and in the
    # table names below); make sure it never comes from untrusted input.
    sql_tree = 'select tree from reference_phylogeny t1 inner join biodatabase t2 on t1.biodatabase_id=t2.biodatabase_id ' \
               ' where t2.name="%s";' % biodb

    # NOTE(review): this second load_db call looks redundant with the one
    # above; kept because its side effects are not visible from here.
    server, db = manipulate_biosqldb.load_db(biodb)
    complete_tree = Tree(server.adaptor.execute_and_fetchall(sql_tree,)[0][0])
    R = complete_tree.get_midpoint_outgroup()
    complete_tree.set_outgroup(R)

    sql = 'select distinct taxon_id,species from taxid2species_%s t1 ' \
          ' inner join species_curated_taxonomy_%s t2 on t1.species_id=t2.species_id;' % (biodb, biodb)

    taxon_id2species_id = manipulate_biosqldb.to_dict(server.adaptor.execute_and_fetchall(sql,))

    # changing taxon id to species id
    for leaf in complete_tree.iter_leaves():
        leaf.name = str(taxon_id2species_id[str(leaf.name)])

    # attributing unique id to each node
    # if all node descendant have the same name, use that name as node name
    n = 0
    for node in complete_tree.traverse():
        if node.name == '':
            desc_names = {i.name for i in node.iter_descendants()}
            desc_names.discard('')
            if len(desc_names) != 1:
                node.name = '%sbb' % n
            else:
                node.name = desc_names.pop()
            n += 1

    # Collapsing nodes while traversing
    # http://etetoolkit.org/docs/latest/tutorial/tutorial_trees.html#collapsing-nodes-while-traversing-custom-is-leaf-definition
    node2labels = complete_tree.get_cached_content(store_attr="name")

    def collapsed_leaf(node):
        # a node counts as a leaf as soon as only one distinct name hangs below it
        return len(node2labels[node]) == 1

    species_tree = Tree(complete_tree.write(is_leaf_fn=collapsed_leaf))

    for lf in species_tree.iter_leaves():
        # a species missing from a count table simply has no genome of that kind
        try:
            n_complete_genomes = species2n_complete_genomes[lf.name]
        except KeyError:
            n_complete_genomes = False
        try:
            n_draft_genomes = species2n_draft_genomes[lf.name]
        except KeyError:
            n_draft_genomes = False

        if n_draft_genomes:
            c1 = round(species2completeness[lf.name][0])
            c2 = round(species2completeness[lf.name][1])
            if c1 == c2:
                completeness = "%s%%" % c1
            else:
                completeness = "%s-%s%%" % (c1, c2)

        if n_complete_genomes and n_draft_genomes:
            lf.name = "%s (%sc/%sd, %s)" % (lf.name,
                                            n_complete_genomes,
                                            n_draft_genomes,
                                            completeness)
        elif n_complete_genomes:
            lf.name = "%s (%sc)" % (lf.name, n_complete_genomes)
        elif n_draft_genomes:
            lf.name = "%s (%sd, %s)" % (lf.name, n_draft_genomes, completeness)

    return complete_tree, species_tree
def get_inconsistent_trees(tree, ali, outgroups, all_families, sfile, octr, otr, oal, stats=None,
                           discard_sp=None):
    """
    For a given ensembl tree, check whether synteny-derived constrained topologies are consistent
    with it. If not, the corresponding constrained trees, ensembl subtrees and ensembl
    sub-alignments will be saved to file.

    Args:
        tree (str): ensembl tree in newick format
        ali (str): ensembl ali in fasta format
        outgroups (list): list of outgroup species used in the synteny-analysis
        all_families (dict of OrthologyFamily instances): for each outgroup genes (key) an
                      OrthologyFamily instance (synteny-derived orthogroups and constrained tree
                      topology)
        sfile (file object): open handle receiving one "<outgroup gene>\\t<status>" line per family
        octr (str): directory where constrained tree topologies are written ('C_<gene>.nh')
        otr (str): directory where the original ensembl subtrees are written ('<gene>.nh')
        oal (str): directory where the corresponding sub-alignments are written ('<gene>.fa')
        stats (dict, optional): dict to count the number of consistent and inconsistent trees,
                      updated in place; a throw-away dict is used when omitted
        discard_sp (optional): species whose genes are pruned from both trees before comparison
    """
    # `stats` is documented as optional: count into a local dict instead of
    # failing on `None.get(...)` when the caller does not provide one.
    if stats is None:
        stats = {}

    whole_tree = Tree(tree, format=1)

    # make leaf names unique per species, but remember the original gene name
    for leaf in whole_tree.get_leaves():
        leaf.prev_name = leaf.name
        leaf.name = leaf.name + '_' + leaf.S

    # each cached leaf is a (name, prev_name, S) tuple
    cached_whole_tree = whole_tree.get_cached_content(store_attr=['name', 'prev_name', 'S'])

    outgr_leaves = [i for i in cached_whole_tree[whole_tree] if i[2] in outgroups]

    #for each gene of the outgroup present in tree
    for outgr_leaf in outgr_leaves:
        outgr_gene = outgr_leaf[1]

        #if we have a corresponding constrained tree topology
        if outgr_gene not in all_families:
            continue

        family = all_families[outgr_gene]
        ctree = family.ctree
        cached_ctree = ctree.get_cached_content(store_attr=['name'])

        #fast way to flatten list of tuple
        ctree_leaves = set(sum(cached_ctree[ctree], ()))
        lca = whole_tree.get_common_ancestor(ctree_leaves)
        leaves = cached_whole_tree[lca]

        leaves_in_fam = [i for i in leaves
                         if i[1] in family.genes_in_orthotable or i[0] in ctree_leaves]
        leavesnames_in_fam = {i[0] for i in leaves_in_fam}

        #keep all genes present in the family
        if len(ctree_leaves) < len(leaves):
            to_replace_inside = leavesnames_in_fam.difference(ctree_leaves)
            family.update_constrained_tree(to_replace_inside, lca)

        if discard_sp:
            # the species is the last '_'-separated field of the leaf name
            keep = [i for i in ctree_leaves if i.split('_')[-1] not in discard_sp]
            ctree.prune(keep)
            # prune a copy so that the whole tree itself is left untouched
            lca = lca.copy()
            lca.prune(keep)
            leavesnames_in_fam = set(keep)

            if len(leavesnames_in_fam) <= 2:
                sfile.write(outgr_gene + "\t" + "Too few genes" + '\n')
                stats['Too few genes'] = stats.get('Too few genes', 0) + 1
                continue

        comparison = ctree.compare(lca)

        #check if the constraint is present in the tree
        if comparison['source_edges_in_ref'] == 1:
            sfile.write(outgr_gene + "\t" + "Consistent" + '\n')
            stats['Consistent'] = stats.get('Consistent', 0) + 1
            continue

        #check if family is not too multigenic, in which case correction is difficult
        if family.is_multigenic():
            sfile.write(outgr_gene + "\t" + "Inconsistent_multigenic" + '\n')
            stats['Inconsistent_multigenic'] = stats.get('Inconsistent_multigenic', 0) + 1
            continue

        #we make a copy in case more than 1 subtree is inconsistent
        #"newick-extended" copy is iterative (based on ete3 load/write)
        #This way we do not risk to hit recursion limit
        ori_tree = lca.copy("newick-extended")
        ori_tree.prune(leavesnames_in_fam)

        #write original subtrees
        ori_tree.write(outfile=otr + '/' + outgr_gene + '.nh', format=9, features=["D"])

        #write constrained tree topology
        ctree.write(outfile=octr + '/C_' + outgr_gene + '.nh', format=9, features=["D"])

        #write corresponding sub-alignment
        gene_species_mapping = dict((name, sp) for namesp, name, sp in leaves_in_fam)
        seq = ut.get_subali(ali, gene_species_mapping, gene_species_mapping)
        ut.write_fasta(seq, oal + '/' + outgr_gene + '.fa')

        sfile.write(outgr_gene + "\t" + "Inconsistent" + '\n')
        stats['Inconsistent'] = stats.get('Inconsistent', 0) + 1
# Give every internal (non-tip) node of the test tree a running identifier:
# named nodes become '<name>:N<i>', unnamed ones 'N<i>'. The relabelled tree
# is then written next to the input file.
tree = TreeNode.read('/home/meike/tests/Files/tree/test.nwk')
i = 1
for node in tree.levelorder():
    if not node.is_tip():
        if node.name is not None and node.name != '':
            node.name = '%s:N%d' % (node.name, i)
        else:
            node.name = 'N%d' % i
        i += 1
t2 = tree.write('/home/meike/tests/Files/tree/test_with_ids.nwk')
# We create a cache with every node content
# NOTE(review): `t` and `collapsed_leaf` are not defined in this snippet;
# presumably they come from an earlier part of the script -- confirm that `t`
# is the tree intended here rather than `tree` above.
node2labels = t.get_cached_content(store_attr="name")
# Collect the names of all nodes that collapsed_leaf() flags.
collapsed_nodes = []
for node in t.traverse():
    if collapsed_leaf(node) == 1:
        collapsed_nodes.append(node.name)
# Write the flagged node names, one per line, below a 'COLLAPSE' / 'DATA'
# header.
with open('/home/meike/tests/Files/tree/collapsing.txt', 'w') as f:
    f.write('COLLAPSE\nDATA\n')
    for name in collapsed_nodes:
        f.write(name + '\n')
#t.show()
#t.write(is_leaf_fn=collapsed_leaf)
# We can even load the collapsed version as a new tree
values = col[col.nonzero()].data[0] counter = Counter(values) if counter: most_common = float(counter.most_common()[0][1]) con.append(most_common / len(values)) con = np.array(con) return con.mean(), gappyness # loads alg, tree alg, alg_index = load_alg(sys.argv[2]) name2algindex = {name: i for i, name in enumerate(alg_index)} tree = Tree(sys.argv[1]) tree.set_outgroup(tree.get_midpoint_outgroup()) print(tree) tree.show() # Creates a node to sequence cache for leaf in tree: leaf.seqindex = name2algindex[leaf.name] n2seqs = tree.get_cached_content(store_attr="seqindex") # Iters each internal node in the tree and calculate sub-alg quality for n in tree.traverse("level_order"): if n.children: con = alg_conservation(alg, list(n2seqs[n])) print(n, con) input()
for r in SeqIO.parse(fname, format="fasta"): index.append(r.id) alg.append(seq2vector(r.seq)) named_index = {name: i for i, name in enumerate(index)} return np.array(alg), named_index, index tree_file = sys.argv[1] alg_file = sys.argv[2] thr = float(sys.argv[3]) alg, index, i2name = load_alg(alg_file) tree = Tree(tree_file) tree.set_outgroup(tree.get_midpoint_outgroup()) node2content = tree.get_cached_content(store_attr="name") for n in tree.traverse("levelorder"): if n.children: ch1 = n.children[0] ch2 = n.children[1] leaves_left = [index[name] for name in node2content[ch1]] leaves_right = [index[name] for name in node2content[ch2]] if len(leaves_left) < 3 or len(leaves_right) < 3: continue rows, cols = alg[tuple(leaves_left), :].nonzero() colres_left = Counter(cols) cols_left = set([c for c, count in colres_left.items() if count >= 2]) #>= 0.1 * len(leaves_left) ])
def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes, out):
    """
    Browses a gene tree forest and searches for orthologs with the outgroup. Writes genes without
    phylogenetic orthologs to a file. Also writes files with high-confidence orthologs and
    paralogs to use to optimize the synteny support threshold to call orthology.

    Args:
        forest (str): Name of the gene trees forest file
        duplicated_sp (list of str): List of all duplicated species for the considered WGD
        outgroup (str): Non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): All gene positions for each species
        out (str): Output file to write genes without phylogenetic orthologs

    Returns:
        dict: Orthologs of outgroup genes in each duplicated species

    Note (FIXME):
        Written to work within scorpios as orthologs and paralogs file names are derived from
        output file patterns, assuming it contains an '_'. The part of the file name before the
        first '_' is swapped for 'orthologs' / 'paralogs'; directories are left untouched.
    """
    ortho = {e: {} for e in duplicated_sp}

    # Derive the sibling file names from the base name only. A plain
    # str.replace on the whole path would also rewrite any directory
    # component (or later part of the name) that contains the prefix.
    out_dir, sep, out_base = out.rpartition('/')
    suffix = out_base[len(out_base.split('_')[0]):]
    orthofile = out_dir + sep + "orthologs" + suffix
    parafile = out_dir + sep + "paralogs" + suffix

    with open(out, 'w') as outfile, open(forest, 'r') as infile, \
         open(parafile, 'w') as out_para, open(orthofile, 'w') as out_ortho:

        sys.stderr.write("Browsing gene trees for orthologies with the outgroup...\n")

        for tree in ut.read_multiple_objects(infile):

            #load tree
            tree = Tree(tree.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = [i for i in tree.get_leaves()]

            #add a tag to genes of duplicated species
            tag_duplicated_species(leaves, duplicated_sp)

            #find all clades with only genes of duplicated species
            subtrees = tree.get_monophyletic(values=["Y"], target_attr="duplicated")

            #find all outgroup genes
            outgroup_genes = [i for i in leaves if i.S == outgroup]

            #search for an ortholog gene in the outgroup for all clades of teleost genes
            for subtree in subtrees:

                # (topological distance, branch distance, outgroup gene) candidates
                candidates = []
                subtree_leaves = subtree.get_leaves()
                found = False

                #browse all outgroup genes
                for j in outgroup_genes:

                    #find the node that splits the outgroup gene and duplicated species genes
                    lca = tree.get_common_ancestor(subtree, j)
                    topo_distance = len(node2leaves[lca])

                    # if it is a speciation or dubious duplication node --> speciation
                    if org.is_speciation(lca):
                        branch_distance = tree.get_distance(subtree, j)
                        candidates.append((topo_distance, branch_distance, j))
                        found = True

                # if no 'true' ortholog
                # check if all descendants include only outgroup + duplicated species
                if not found:
                    for j in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, j)
                        for gene in lca.get_leaves():
                            if gene.duplicated != "Y" and gene.S != outgroup:
                                break

                        #if no break, it means all descendants are outgroup or dup.
                        else:
                            topo_distance = len(node2leaves[lca])
                            branch_distance = tree.get_distance(subtree, j)
                            candidates.append((topo_distance, branch_distance, j))

                # if an ortholog was found, add it to the orthology dict
                if candidates:
                    content = []

                    # closest candidate first: topology, then branch length
                    candidates.sort(key=lambda x: (x[0], x[1]))
                    outgroup_gene = candidates[0][2].name

                    for species in duplicated_sp:
                        genes = [i.name for i in subtree_leaves if i.S == species]
                        genes = get_genes_positions(genes, species, dict_genes)
                        ortho[species][outgroup_gene] = ortho[species].get(outgroup_gene, [])
                        ortho[species][outgroup_gene] += genes
                        content += [g.name+'_'+species.replace(' ', '.')+\
                                    '|'+str(g.chromosome)+\
                                    '|'+str(g.index) for g in genes]

                    all_ortho = [i[2].name for i in candidates]
                    paralogs = [i.name for i in outgroup_genes if i.name not in all_ortho]

                    if paralogs:
                        # one randomly chosen outgroup gene that is not a
                        # candidate ortholog serves as the paralog example
                        paralog = random.choice(paralogs)
                        if paralog in dict_genes[outgroup]\
                           and outgroup_gene in dict_genes[outgroup]:

                            tmp_dict = dict_genes[outgroup]
                            out_ortho.write(' '.join(content)+'\t')
                            out_ortho.write(str(outgroup_gene)+'|'+\
                                            str(tmp_dict[outgroup_gene].chromosome)+'|'+\
                                            str(tmp_dict[outgroup_gene].index)+'|'+str(0)+'|'+\
                                            str(0)+'\n')

                            out_para.write(' '.join(content)+'\t')
                            out_para.write(str(paralog)+'|'+\
                                           str(tmp_dict[paralog].chromosome)+'|'+\
                                           str(tmp_dict[paralog].index)+'|'+\
                                           str(0)+'|'+str(0)+'\n')

                # if no ortholog found
                # write genes without ortholog along with all outgroup genes in tree
                # (potential candidate for orthology)
                elif any(i.name in dict_genes[outgroup] for i in outgroup_genes):

                    #genes without orthologs
                    missed_genes = []
                    for species in duplicated_sp:
                        genes = [i.name for i in subtree_leaves if i.S == species]
                        genes = get_genes_positions(genes, species, dict_genes)
                        missed_genes += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    if missed_genes:
                        outfile.write(' '.join(missed_genes)+'\t')

                        #candidate orthologs in the outgroup
                        outgr_genes = [i.name for i in outgroup_genes]

                        # pairs of outgroup genes at topological distance 1
                        in_paralogs = []
                        for pair in itertools.combinations(outgr_genes, 2):
                            if tree.get_distance(pair[0], pair[1], topology_only=True) == 1:
                                in_paralogs.append(pair[0]+'|'+pair[1])

                        outgr_write = []
                        genome = dict_genes[outgroup]
                        for gene in outgr_genes:
                            if gene in genome:
                                lca = tree.get_common_ancestor(subtree, gene)
                                branch_distance = tree.get_distance(subtree, gene)
                                topo_distance = len(node2leaves[lca])
                                outgr_write.append(str(gene)+'|'+str(genome[gene].chromosome)+'|'+\
                                                   str(genome[gene].index)+'|'+str(topo_distance)+\
                                                   '|'+str(branch_distance))

                        outfile.write(' '.join(outgr_write)+'\t'+' '.join(in_paralogs)+'\n')

        sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")

    return ortho
def Tree_analysis(tree,tabla,out,analysis_type,out2):
    """Search a phylogenetic tree for "core" nodes using an OTU table.

    Every tree tip gets the abundance vector of the OTU with the same name
    (``None`` when the OTU is absent from the table) and every internal node
    the element-wise sum of the vectors below it. Depending on
    ``analysis_type`` the nodes are then evaluated in postorder; every node
    accepted as a core is reported and removed from the tree before the
    traversal continues.

    Args:
        tree: tree handed to ``Tree(..., quoted_node_names=True, format=1)``.
        tabla: path of the tab-separated OTU table. Lines starting with '#'
            are copied to ``out2``; in the other lines the first field is the
            OTU name, the last field its taxonomy and the fields in between
            the per-subject values.
        out: path of the main report.
        analysis_type: 1 = core when every value of the node vector is
            non-zero, 2 = binomial test, 3 = minimum prevalence,
            4 = only dump the tree and the node vectors.
        out2: path of the per-core relative-abundance table.

    Raises:
        ValueError: if ``tabla`` contains no data line.
    """
    ###All subsequent variables could be modified
    binomial_value = float(0.05)  #Default value for the option 2 of the core evaluation method for the tree
    p_value = float(0.05)  #p-value threshold for the binomial method (2 method)
    percentage = float(0.9)  #Minimum percentage threshold of subjects required to define a core
    taxo_p = float(0.9)  #Minimum percentage of the same taxonomic group within all OTUs contained into the same Node

    def binomial_pvalue(successes, trials):
        # stats.binom_test was removed from recent SciPy releases; use its
        # replacement when available and fall back on older installations.
        if hasattr(stats, 'binomtest'):
            return stats.binomtest(successes, n=trials, p=binomial_value,
                                   alternative='greater').pvalue
        return stats.binom_test(successes, n=trials, p=binomial_value,
                                alternative='greater')

    # The context managers guarantee that both reports are flushed and closed,
    # also when the analysis fails half-way.
    with open(out, 'w') as output_file, open(out2, 'w') as output_file_2:
        tree = Tree(tree, quoted_node_names=True, format=1)  #Here we load the 97_otus tree

        table = {}  #OTU name -> per-subject values
        table2 = {}  #OTU name -> [taxonomy]
        n_subjects = None
        with open(tabla) as table_handle:
            for line in table_handle:
                if line.startswith('#'):
                    output_file_2.write(str(line))
                    continue
                fields = list(map(str.strip, line.split('\t')))
                table[fields[0]] = list(map(float, fields[1:-1]))
                table2[fields[0]] = fields[-1:]
                #as before, the width of the last data line sizes the result vector
                n_subjects = len(fields[1:-1])
        if n_subjects is None:
            raise ValueError('no data line found in OTU table %s' % tabla)

        table_final_res = [0.0] * n_subjects
        sum_abun_rela = 0
        cores = 0

        #Create value vectors for each of the tips of the tree with the values of the OTU table
        for leaf in tree:
            leaf.vector = table.get(leaf.name)

        node2content = tree.get_cached_content()
        save_node1 = None
        total_saved_leaves = None
        for node in tree.traverse():  #This loop adds the summed vectors to the internal nodes
            if not node.is_leaf():
                leaf_vectors = np.array([leaf.vector for leaf in node2content[node] if leaf.vector is not None])
                node.vector = leaf_vectors.sum(axis=0)
                if save_node1 is None:
                    #the first internal node visited is kept as reference for relative abundances
                    save_node1 = node.vector
                    total_saved_leaves = np.array([leaf.name for leaf in node2content[node]])

        if analysis_type == 4:  #This method only prints the information of the tree
            print(tree.get_ascii(show_internal=True))
            output_file.write(tree.get_ascii(show_internal=True) + '\n' + '\n')
            for node in tree.traverse("preorder"):
                print (node.name, node.vector)
                output_file.write(node.name + '\t' + str(node.vector) + '\n')

        if analysis_type != 4:
            output_file.write("Core" + '\t' + "Prevalence" + '\t' + "Abundance" + '\t' + "Relative abundances" + '\t' + "Min" + '\t' + "Max" + '\t' + "Average" + '\t' + "SD" + '\t' + "Leaves" + '\t' + "Taxonomy" + '\t' + "Leaves number" + '\n')

        if analysis_type == 1 or analysis_type == 2 or analysis_type == 3:
            #Here we evaluate the tree using one of the chosen methods: 100% core, binomial or percentage
            for node in tree.traverse("postorder"):
                tot_cont = np.count_nonzero(node.vector)  #Number of subjects with one or more occurrence in the vector of this node
                tot_cont2 = np.asarray(node.vector).size  #Total vector size
                a = binomial_pvalue(tot_cont, tot_cont2)
                rela = (tot_cont/tot_cont2)

                is_core = (analysis_type == 1 and np.all(node.vector)
                           or (analysis_type == 2 and a <= p_value)
                           or (analysis_type == 3 and rela >= percentage))
                if not is_core:
                    continue

                node.vector = [float(i) for i in node.vector]
                #Relative abundance of each subject in the node over the reference node
                abundance = [float(i) for i in node.vector/save_node1]
                mean_abun = np.mean(abundance)  #Mean abundance of the node
                std_abun = np.std(abundance)  #Standard deviation of the node
                abundance_rela = sum(node.vector)/sum(save_node1)  #Global relative abundance of the node over the reference node
                table_final_res = list(map(sum, zip(table_final_res, abundance)))  #Accumulate the results of each core
                sum_abun_rela = sum_abun_rela + abundance_rela  #The sum of all global relative abundances
                cores = cores + 1  #Total number of cores

                output_file_2.write(str(node.name) + '\t')
                for value in abundance:
                    output_file_2.write(str(value) + '\t')
                output_file_2.write('\n')

                output_file.write(node.name + '\t' + str(rela) + '\t' + str(node.vector) + '\t' + str(abundance) + '\t' + str(min(abundance)) + '\t' + str(max(abundance)) + '\t' + str(mean_abun) + '\t' + str(std_abun) + '\t')
                #Assign a taxonomy to the node based on the taxonomy of its OTUs and the minimum taxonomy percentage
                conteo_hojas = nodes_eval(node,tree,output_file,table2,taxo_p,total_saved_leaves)
                output_file.write(str(conteo_hojas) + '\n')  #Print the total number of leaves of this node

                #Once a node has been evaluated it is erased from the tree to simplify the evaluation of the next nodes
                tree = erase_node(node,tree)
                G = tree.search_nodes(name=node.name)[0]
                removed_node = G.detach()

            output_file.write(str(cores) + '\t' + '\t' + '\t' + str(table_final_res) + '\t' + str(min(table_final_res)) + '\t' + str(max(table_final_res)) + '\t' + str(np.mean([float(i) for i in table_final_res])) + '\t' + str(np.std([float(i) for i in table_final_res])) + '\n')