def pruning(inputtree, inputfasta, tree_outfilename):
    """Prune a large tree down to the sequences present in a FASTA file.

    Args:
        inputtree: path of the full tree file, loaded with ``Tree``.
        inputfasta: path of the FASTA file whose record IDs are kept.
        tree_outfilename: path the pruned tree is written to (format 0).
    """
    # Fail early with the same IOError as before when the tree file cannot
    # be opened, but release the handle immediately instead of leaking it.
    with open(inputtree, "r"):
        pass

    # Full initial tree - to be pruned (ete3 Tree format)
    f = Tree(inputtree)

    # List of IDs to be picked from the full FASTA. The path is handed to
    # SeqIO.parse directly so that no file handle is left open here.
    record_dict = SeqIO.to_dict(SeqIO.parse(inputfasta, "fasta"))
    IDlist = []
    for recordID in record_dict.keys():
        print(recordID)
        IDlist.append(recordID)
    print(IDlist)

    print("pruning... %s" % inputfasta)
    f.prune(IDlist, preserve_branch_length=True)
    # Tree.write opens tree_outfilename itself; the output is therefore only
    # touched once pruning has succeeded.
    f.write(format=0, outfile=tree_outfilename)
    print("pruned %s" % inputfasta)
def migrate(db_path):
    """Re-serialise every newick order of a profile database and bump its version.

    Each item order and layer order whose type is 'newick' is parsed with
    ``format=1`` and written back with ``format=2``; afterwards the stored
    version is replaced by ``next_version``.

    Raises:
        ConfigError: if ``db_path`` is None or the database version is not
            ``current_version``.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is accurate
    profile_db = db.DB(db_path, None, ignore_version = True)
    found_version = str(profile_db.get_version())
    if found_version != current_version:
        raise ConfigError("Version of this profile database is not %s (hence, this script cannot really do anything)." % current_version)

    def _reformat(newick_text):
        # read as format 1, emit as format 2
        return Tree(newick_text, format=1).write(format=2)

    # migrate item orders
    item_orders = profile_db.get_table_as_dict(item_orders_table_name)
    for order_name, entry in item_orders.items():
        if entry['type'] != 'newick':
            continue
        profile_db._exec("""UPDATE %s SET "data" = ? WHERE "name" LIKE ?""" % item_orders_table_name,
                         (_reformat(entry['data']), order_name))

    # migrate layer orders
    layer_orders = profile_db.get_table_as_dict(layer_orders_table_name)
    for order_name, entry in layer_orders.items():
        if entry['data_type'] != 'newick':
            continue
        profile_db._exec("""UPDATE %s SET "data_value" = ? WHERE "data_key" LIKE ?""" % layer_orders_table_name,
                         (_reformat(entry['data_value']), order_name))

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)

    # bye
    profile_db.disconnect()
    progress.end()
def sanitizeByType(container, sanitizeby='tsv', onlycolumns=False):
    '''Print a sanitized copy of an iterable of strings.

    `sanitizeby` selects what is passed through sanitizeString:
    'line' (each whole line), 'tsv' (every tab-separated field, or only the
    1-based columns listed in `onlycolumns`), 'newick' (leaf names of the
    tree formed by joining the strings) or 'fasta' (record descriptions,
    which become the record ids).
    '''
    assert sanitizeby in set(['line', 'tsv', 'newick', 'fasta'])

    if sanitizeby == 'line':
        for raw in container:
            print(sanitizeString(raw.strip("\r\n"), False))

    elif sanitizeby == 'tsv':
        for raw in container:
            if onlycolumns:
                fields = raw.strip("\r\n").split("\t")
                # onlycolumns holds 1-based column numbers
                for column in onlycolumns:
                    fields[column - 1] = sanitizeString(fields[column - 1], False)
            else:
                fields = [sanitizeString(cell.strip("\r\n"), False)
                          for cell in raw.split("\t")]
            print("\t".join(fields))

    elif sanitizeby == 'newick':
        from ete3 import Tree
        tree = Tree("".join(container))
        for leaf in tree:
            leaf.name = sanitizeString(leaf.name, False)
        print(tree.write())

    elif sanitizeby == 'fasta':
        from Bio import SeqIO
        from io import StringIO
        from sys import stdout
        records = SeqIO.parse(StringIO("".join(container)), "fasta")
        for record in records:
            # the sanitized full description becomes the id
            record.id = sanitizeString(record.description, False)
            record.description = ''
            SeqIO.write(record, stdout, "fasta")
def createImg(filename, thres=0, samples=1):
    """Render a tree with a heatmap profile per leaf to ``phylo.png``.

    The image is written into the directory that contains ``filename``
    (the current directory when ``filename`` has no directory part).

    Args:
        filename: input file handed to ``parseLineage``.
        thres: forwarded to ``getSuffixandMatrixandNewick``.
        samples: forwarded to ``getSuffixandMatrixandNewick``.
    """
    import os

    count = parseLineage(filename)
    suffix, matrix, taxo = getSuffixandMatrixandNewick(count, thres, samples)
    newick = convert(taxo, suffix) + ';'
    t = Tree(newick, format=1)
    ct = ClusterTree(t.write(), text_array=matrix)
    addColors(ct)

    # nodes are linked to the array table
    array = ct.arraytable

    # Calculates some stats on the matrix. Needed to establish the color
    # gradients. Non-finite entries are ignored.
    matrix_dist = [value for row in array.matrix for value in row
                   if np.isfinite(value)]
    matrix_max = np.max(matrix_dist)
    matrix_min = np.min(matrix_dist)
    matrix_avg = (matrix_max + matrix_min) / 2

    # Creates a profile face that will represent node's profile as a heatmap
    profileFace = ProfileFace(matrix_max, matrix_min, matrix_avg, 200, 14,
                              "heatmap", colorscheme=3)

    # Creates my own layout function that uses previous faces
    def mylayout(node):
        # Only leaves carry a profile
        if node.is_leaf():
            add_face_to_node(profileFace, node, 0, aligned=True)
            node.img_style["size"] = 2

    # Use my layout to visualize the tree
    ts = TreeStyle()
    ts.layout_fn = mylayout

    # os.path.dirname returns '' for a bare file name, so the image lands in
    # the working directory instead of the filesystem root.
    ct.render(os.path.join(os.path.dirname(filename), 'phylo.png'),
              tree_style=ts)
def get_example_tree():
    """Return a random 20-leaf tree with a random ``weight`` per node, plus its style.

    Returns:
        (tree, tree_style): the style is circular, uses ``layout`` and shows
        branch lengths and supports but not the default leaf names.
    """
    # Random tree
    tree = Tree()
    tree.populate(20, random_branches=True)

    # Some random features in all nodes
    for node in tree.traverse():
        node.add_features(weight=random.randint(0, 50))

    style = TreeStyle()
    style.layout_fn = layout            # custom layout function
    style.mode = "c"
    style.show_leaf_name = False        # node names are added manually
    style.show_branch_length = True     # show branch data
    style.show_branch_support = True
    return tree, style
def main():
    """Convert the support values of a tree between percent and decimal.

    With ``args.decimal`` set, supports >= 1 are multiplied by 0.01;
    otherwise supports <= 1 are multiplied by 100 and rounded to the nearest
    integer. Values outside the expected range are left untouched and
    reported on stderr. The tree is written back over the input when
    ``args.replace`` is set, else to ``args.tree`` plus the extension.
    """
    args = parse_args()

    # Use the extension specified by the user if present
    ext = args.extension if args.extension else '.mod.tre'

    # Load the tree
    t = Tree(args.tree)

    # Iterate over the nodes, convert to desired value
    for node in t.iter_search_nodes():
        if args.decimal:
            if node.support >= 1:
                node.support = float(node.support * 0.01)
            else:
                sys.stderr.write(
                    'bootstrap value in {} is < 1, ignoring.\n'.format(args.tree))
        else:
            if node.support <= 1:
                # round before truncating: 0.29 * 100 is 28.999... in
                # floating point and int() alone would yield 28
                node.support = int(round(node.support * 100))
            else:
                sys.stderr.write(
                    'bootstrap value in {} is > 1, ignoring.\n'.format(args.tree))

    # If the replace flag is set, replace the input file with the output file.
    # Otherwise create a new file with the chosen extension appended.
    out = args.tree if args.replace else args.tree + ext
    t.write(format=0, outfile=out)
def show_tree(experiment_folder):
    """Build a tree of image faces for a saved model and render it.

    The model is loaded from ``model.p`` in ``experiment_folder``; one node
    is created per (level, index) position, each decorated with the image
    ``images/<idx>_<paren>_<kid>.png``. The circular rendering is saved as
    ``images/tree_plot.png``.

    Returns:
        The root ``Tree`` node.
    """
    model = MDPD.Hierachical_MDPD(1)
    model.load(os.path.join(experiment_folder, 'model.p'))
    width, depth = model.width, model.depth

    root = Tree()
    level = [(0, root)]
    for current_depth in range(depth + 1):
        next_level = []
        for idx, node in level:
            paren = int((idx - 1) / width)
            kid = idx - paren * width
            image_name = '{}_{}_{}.png'.format(idx, paren, kid)
            image_path = os.path.join(experiment_folder, 'images', image_name)
            node.add_face(faces.ImgFace(image_path), 0)
            # the deepest level gets no children
            if current_depth < depth:
                next_level.extend(
                    (idx * width + k + 1, node.add_child())
                    for k in range(width))
        level = next_level

    ts = TreeStyle()
    ts.mode = "c"
    plot_path = os.path.join(experiment_folder, 'images', 'tree_plot.png')
    root.render(plot_path, tree_style=ts)
    return root
def ete_draw(self, fname=None):
    """
    Draw the tree with ETE, rotated by 90 degrees, names on the branch.

    Does nothing when `Cfg.USE_ETE3` is false.

    Args:
        fname: file to render to; when falsy (default None) the tree is
            shown interactively instead of being saved.
    """
    if not Cfg.USE_ETE3:
        # TODO maybe throw an error?
        return

    def layout(node):
        name_face = AttrFace("name")
        faces.add_face_to_node(name_face, node, column=0,
                               position="branch-right")

    style = TreeStyle()
    style.show_leaf_name = False
    style.layout_fn = layout
    style.rotation = 90

    ete_tree = EteTree(self.ete_str(), format=8)
    if not fname:
        ete_tree.show(tree_style=style)
        return
    ete_tree.render(fname, tree_style=style)
def main(treefile, to, metric):
    """Print every tree of ``treefile`` after renaming and normalising it.

    Each line of the file is parsed as one tree, passed through
    ``alphbetise_names`` and ``normalise_tree(tree, to, metric)``, and
    printed in format 5.
    """
    with open(treefile) as handle:
        for newick_line in handle:
            renamed = alphbetise_names(Tree(newick_line))
            normalised = normalise_tree(renamed, to, metric)
            print(normalised.write(format=5))
def tree_distances(file):
    """Write the leaf-to-leaf distance matrix of a tree as a TSV file.

    The matrix is written to ``file + ".patristic-dist.tsv"``: a header row
    with one tab-prefixed leaf name per column, then one row per leaf with
    its name followed by the distance to every leaf. Negative distances are
    clamped to 0.

    Args:
        file: path of the tree file, loaded with ``Tree``.
    """
    t = Tree(file)
    all_leaves = t.get_leaves()

    # 'with' guarantees the output is closed even if a distance lookup fails
    with open(file + ".patristic-dist.tsv", "w") as branch_len_out:
        header = "".join("\t" + leaf.name for leaf in all_leaves)
        branch_len_out.write(header + "\n")

        # Computing patristic distance matrix
        for leaf1 in all_leaves:
            cells = [str(leaf1.name)]
            for leaf2 in all_leaves:
                distance = max(leaf1.get_distance(leaf2), 0.0)
                cells.append("%f" % distance)
            branch_len_out.write("\t".join(cells) + "\n")
def get_example_tree():
    """Return a random 10-leaf tree and a rectangular style using ``layout``."""
    tree = Tree()

    style = TreeStyle()
    style.layout_fn = layout
    style.mode = "r"
    style.show_leaf_name = False

    tree.populate(10)
    return tree, style
def parseTree(root):
    """Recursively convert a nested mapping into a ``Tree``.

    Args:
        root: mapping with the keys 'Name' (node name), 'Split' (text shown
            in red below the branch) and 'Children' (a falsy value or an
            iterable of mappings of the same shape).

    Returns:
        The ``Tree`` node built for ``root``, with all descendants attached.
    """
    tree = Tree()
    tree.name = root['Name']
    tree.add_face(TextFace(root['Split'], fgcolor="red"),
                  column=0, position="branch-bottom")
    for child in root['Children'] or ():
        # add_child (rather than appending to .children) also links the
        # child back to its parent, so upward navigation works.
        tree.add_child(parseTree(child))
    return tree
def run(args):
    """Generate ``args.number`` random trees of ``args.size`` leaves and dump each."""
    import random
    from ete3 import Tree

    for _ in range(args.number):
        random_tree = Tree()
        random_tree.populate(args.size,
                             random_branches=args.random_branches)
        dump(random_tree)
def __init__(self, *args, **kargs):
    """Load the tree with ``format=1`` and derive a ``constraint`` per node.

    A node named "NoName" gets ``constraint = None``; any other node gets
    its name with "{" -> "(", "}" -> ")", "@" -> "__target" and "|" -> ","
    substituted, in that order.
    """
    kargs["format"] = 1
    Tree.__init__(self, *args, **kargs)

    substitutions = (("{", "("), ("}", ")"), ("@", "__target"), ("|", ","))
    for node in self.traverse():
        if node.name == "NoName":
            node.constraint = None
            continue
        text = node.name
        for old, new in substitutions:
            text = text.replace(old, new)
        node.constraint = text
def nhx2key(nhxtree):
    """Map each node of an nhx tree to its ``ND`` value.

    The key of a node is the "|"-joined, sorted list of its leaf names.
    """
    tree = Tree(nhxtree)
    return {
        "|".join(sorted(node.get_leaf_names())): node.ND
        for node in tree.traverse()
    }
def revBayesTree2key(file):
    """Parse a revBayes node index tree file and create a key for each node.

    The key of a node is the "|"-joined, sorted list of its leaf names with
    any ``[&index=N]`` annotation removed; the value is
    ``nodeIndexFromString(node.name)``.
    """
    t = Tree(file, format=1)
    # Raw string: '\[' and '\d' are regex escapes, not string escapes.
    index_tag = re.compile(r'\[&index=\d+\]')
    keyD = {}
    for node in t.traverse():
        k = "|".join(sorted(index_tag.sub('', n)
                            for n in node.get_leaf_names()))
        keyD[k] = nodeIndexFromString(node.name)
    return keyD
def get_example_tree():
    """Return a random 8-leaf tree and a titled style that uses ``master_ly``."""
    tree = Tree()
    tree.populate(8, reuse_names=False)

    style = TreeStyle()
    style.layout_fn = master_ly
    title_face = faces.TextFace("Drawing your own Qt Faces", fsize=15)
    style.title.add_face(title_face, 0)
    return tree, style
def get_example_tree():
    """Return a random 10-leaf tree and a style rotated by 45 degrees."""
    tree = Tree()
    tree.populate(10)

    style = TreeStyle()
    style.rotation = 45
    style.show_leaf_name = False
    style.layout_fn = rotation_layout
    return tree, style
def builtTree(phylo_tree_pic, paralogs_file):
    """Align the sequences with ClustalW and render the resulting guide tree.

    Args:
        phylo_tree_pic: path of the image the tree is rendered to.
        paralogs_file: sequence file given to ClustalW as ``infile``.
    """
    import os

    print("Aligning top 50 genes for phylogenetic tree...")
    # create the alignment
    clustalw_cline = ClustalwCommandline("clustalw", infile=paralogs_file)
    clustalw_cline()
    # load the tree file: same path as the input with the extension swapped
    # for ".dnd" (works for any extension length, not only ".fasta")
    tree = Tree(os.path.splitext(paralogs_file)[0] + ".dnd")
    # build and write out the tree image
    tree.render(phylo_tree_pic)
def get_example_tree():
    """Return a random 100-leaf tree and a circular style using ``layout``."""
    tree = Tree()

    style = TreeStyle()
    style.layout_fn = layout
    style.mode = "c"
    style.show_leaf_name = True
    style.min_leaf_separation = 15

    tree.populate(100)
    return tree, style
def ete_print(self):
    """
    Pretty print.

    With `Cfg.USE_ETE3` set, prints an ASCII rendering that includes
    internal nodes and returns None; otherwise returns `str(self)`
    without printing.
    TODO Debug and document better for case USE_ETE3 == False
    """
    if not Cfg.USE_ETE3:
        return str(self)

    ete_tree = EteTree(self.ete_str(), format=1)
    print(ete_tree.get_ascii(show_internal=True))
def balanceplot(balances, tree, layout=None, mode='c'):
    """ Plots balances on tree.

    Parameters
    ----------
    balances : np.array
        A vector of internal nodes and their associated real-valued balances.
        The order of the balances will be assumed to be in level order.
    tree : skbio.TreeNode
        A strictly bifurcating tree defining a hierarchical relationship
        between all of the features within `table`.
    layout : function, optional
        A layout for formatting the tree visualization. Must take a
        `ete.tree` as a parameter. Defaults to `default_layout`.
    mode : str
        Type of display to show the tree. ('c': circular, 'r': rectangular).

    Returns
    -------
    tuple
        The ete tree, whose internal nodes carry a `weight` feature, and the
        `TreeStyle` to draw it with.

    Note
    ----
    The `tree` is assumed to strictly bifurcating and whose tips match
    `balances`.

    See Also
    --------
    TreeNode.levelorder
    """
    # The names aren't preserved - let's pray that the topology is consistent.
    ete_tree = Tree(str(tree))

    internal_nodes = (n for n in ete_tree.traverse() if not n.is_leaf())
    for i, node in enumerate(internal_nodes):
        # NOTE(review): index -i gives balances[0] to the first internal
        # node and then walks backwards from the end - confirm this is the
        # intended order.
        node.add_features(weight=balances[-i])

    ts = TreeStyle()
    # custom layout function, falling back to the default one
    ts.layout_fn = default_layout if layout is None else layout
    ts.mode = mode
    # node names are added manually
    ts.show_leaf_name = False
    # show branch data
    ts.show_branch_length = True
    ts.show_branch_support = True
    return ete_tree, ts
def smart_reroot(treefile, outgroupfile, outfile, format=0):
    """
    Reroot a Newick tree on the first outgroup prefix that matches a leaf.

    Each line of `outgroupfile` is a name prefix; the first line matching at
    least one leaf name defines the outgroup (all leaves with that prefix).
    Returns `outfile` after writing the rerooted tree, or `treefile`
    unchanged when no prefix matches.

    Tree reading format options see here:
    http://packages.python.org/ete2/tutorial/tutorial_trees.html#reading-newick-trees
    """
    tree = Tree(treefile, format=format)
    leaves = [leaf.name for leaf in reversed(tree.get_leaves())]

    outgroup = []
    for prefix in must_open(outgroupfile):
        prefix = prefix.strip()
        outgroup.extend(name for name in leaves if name.startswith(prefix))
        if outgroup:
            break

    if not outgroup:
        print("Outgroup not found. Tree {0} cannot be rerooted.".format(treefile), file=sys.stderr)
        return treefile

    try:
        ancestor = tree.get_common_ancestor(*outgroup)
        tree.set_outgroup(ancestor)
    except ValueError:
        # fall back to rooting on the first matching leaf alone
        assert isinstance(outgroup, list)
        outgroup = outgroup[0]
        tree.set_outgroup(outgroup)

    tree.write(outfile=outfile, format=format)
    logging.debug("Rerooted tree printed to {0}".format(outfile))
    return outfile
def treeorder(treefile):
    """Return the leaf names of a tree in preorder.

    Args:
        treefile: tree file (or Newick string) loaded with ``Tree``.

    Returns:
        list of leaf names, in the order a preorder walk of the root's
        descendants meets them.
    """
    # Only Tree is needed; the drawing classes of ete3.treeview were
    # imported but never used, which made this function depend on the GUI
    # stack for no reason.
    from ete3 import Tree

    rt = Tree(treefile).get_tree_root()
    return [desc.name for desc in rt.iter_descendants("preorder")
            if desc.is_leaf()]
def parse_newick(self):
    """Parse ``self.nw_str`` into ``self.tree`` and initialise node ids.

    The default format is tried first, then ``format=1``.

    Returns:
        True on success, or an error message string when both attempts
        raise ``NewickError``.
    """
    try:
        parsed = Tree(self.nw_str)
    except NewickError:
        try:
            parsed = Tree(self.nw_str, format=1)
        except NewickError as e:
            return "Newick Parsing Error: "+str(e)

    self.tree = parsed
    self.init_nodeids()
    return True
def replace_names(tree_file, replacer):
    """Rename the tips of a tree and return it as a Newick string.

    Each tip name, stripped of surrounding single quotes, is looked up in
    ``replacer``. Tips without an entry keep their name and are reported on
    stderr; they do not abort the conversion.
    """
    tree = Tree(tree_file)
    for tip in tree.iter_leaves():
        try:
            tip.name = replacer[tip.name.strip("'")]
        except KeyError:
            print("ERROR: Tip is missing from replacement file: '{}'".format(
                tip.name), file=sys.stderr)
    return tree.write()
def update_newick(t, labels):
    """Prune the stored Newick of ``t`` down to the labels that belong to it.

    Returns:
        True when ``t.newick_string`` was rewritten; False when no label
        references ``t`` or a ``TreeError`` occurred.
    """
    langs_in_tree = {
        str(label.label) for label in labels
        if label.languageTree_id == t.id
    }
    if not langs_in_tree:
        return False

    try:
        tree = Tree(t.newick_string, format=1)
        prune(tree, langs_in_tree,
              const_depth=t.name.startswith('glottolog_'))
        t.newick_string = tree.write(format=1)
    except TreeError:
        return False
    return True
def load_ncbi_tree_from_dump(tar):
    """Build a taxonomy tree from the ``names.dmp`` / ``nodes.dmp`` members of a tar.

    Download: ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

    Returns:
        (t, synonyms): ``t`` is the node named "1"; ``synonyms`` is a set of
        ``(nodename, taxname)`` pairs.
    """
    from ete3 import Tree

    synonym_types = set(["synonym", "equivalent name",
                         "genbank equivalent name", "anamorph",
                         "genbank synonym", "genbank anamorph",
                         "teleomorph"])
    child2parent = {}
    name2node = {}
    node2taxname = {}
    node2common = {}
    synonyms = set()

    print("Loading node names...")
    for raw in tar.extractfile("names.dmp"):
        fields = [field.strip() for field in str(raw.decode()).split("|")]
        nodename, taxname = fields[0], fields[1]
        name_type = fields[3].lower()
        if name_type == "scientific name":
            node2taxname[nodename] = taxname
        elif name_type == "genbank common name":
            node2common[nodename] = taxname
        elif name_type in synonym_types:
            synonyms.add((nodename, taxname))
    print(len(node2taxname), "names loaded.")
    print(len(synonyms), "synonyms loaded.")

    print("Loading nodes...")
    for raw in tar.extractfile("nodes.dmp"):
        fields = str(raw.decode()).split("|")
        nodename = fields[0].strip()
        parentname = fields[1].strip()

        node = Tree()
        node.name = nodename
        node.taxname = node2taxname[nodename]
        if nodename in node2common:
            node.common_name = node2common[nodename]
        node.rank = fields[2].strip()

        child2parent[nodename] = parentname
        name2node[nodename] = node
    print(len(name2node), "nodes loaded.")

    print("Linking nodes...")
    for nodename, node in name2node.items():
        if nodename == "1":
            # the node named "1" is returned as the tree
            t = node
        else:
            name2node[child2parent[nodename]].add_child(node)
    print("Tree is loaded.")
    return t, synonyms
def _get_motif_tree(tree, data, circle=True, vmin=None, vmax=None):
    """Return an ete3 tree with branches styled by mean data value, and its style.

    Args:
        tree: Newick string or file passed to ``Tree``.
        data: object indexable by a list of leaf names whose ``.values``
            support ``mean()`` and ``max()`` (presumably a pandas Series -
            TODO confirm against callers).
        circle: when true, draw a half circle starting at 180 degrees.
        vmin, vmax: colour scale limits. A limit left as None is derived
            from the first percentile in 90..100 that is positive; an
            explicit 0 is honoured.

    Returns:
        (tree, tree_style)
    """
    try:
        from ete3 import Tree, NodeStyle, TreeStyle
    except ImportError:
        print("Please install ete3 to use this functionality")
        sys.exit(1)

    t = Tree(tree)

    # Determine cutoff for color scale. Compare against None explicitly so
    # that a caller-supplied limit of 0 is not silently replaced.
    if vmin is None or vmax is None:
        for i in range(90, 101):
            minmax = np.percentile(data.values, i)
            if minmax > 0:
                break
        if vmin is None:
            vmin = -minmax
        if vmax is None:
            vmax = minmax

    norm = Normalize(vmin=vmin, vmax=vmax, clip=True)
    mapper = cm.ScalarMappable(norm=norm, cmap="RdBu_r")

    # scale factor from data value to line width
    m = 25 / data.values.max()

    for node in t.traverse("levelorder"):
        val = data[[l.name for l in node.get_leaves()]].values.mean()
        color = to_hex(mapper.to_rgba(val))
        width = max(np.abs(m * val), 5)

        style = NodeStyle()
        style["size"] = 0
        style["hz_line_color"] = color
        style["vt_line_color"] = color
        style["vt_line_width"] = width
        style["hz_line_width"] = width
        node.set_style(style)

    ts = TreeStyle()
    ts.layout_fn = _tree_layout
    ts.show_leaf_name = False
    ts.show_scale = False
    ts.branch_vertical_margin = 10

    if circle:
        ts.mode = "c"
        ts.arc_start = 180  # 0 degrees = 3 o'clock
        ts.arc_span = 180

    return t, ts
def ReadTreeFromFile(filepath):
    """
    Read a newick tree file with ete3, resolve its polytomies recursively
    and convert it with RecTree2List.

    Returns the nested list and the members produced by RecTree2List.
    Exits the program with a message when the file cannot be parsed.
    """
    try:
        parsed = Tree(filepath)
    except NewickError as e:
        sys.exit("Corrupted or non-existing custom tree file? %s" % e)

    parsed.resolve_polytomy(recursive=True)
    nested, members = RecTree2List(parsed, Members=None)
    return nested, members
or row.str.contains('Smith').any() or row.str.contains('Branstetter').any() or row.str.contains('Crawford').any() or row.str.contains('Leache').any() or row.str.contains('uce').any()): return 'UCE' return 'other' if __name__ == '__main__': print( "In order to run this script all files must have the same name and extension and they should be saved in directories that have the datasets name. Please see an example below" ) diagram = Tree( "((----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset1Dir, (----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset2Dir, (----->alignmentFileName.nex, ----->IQtreeFileName.iqtree)----->dataset3Dir)rootDir;", format=1) print(diagram.get_ascii(show_internal=True)) proceed = input("do you want to proceed? Y/N\n") if proceed == 'Y': rootDir = '/data/Suha/GTR_parameters_dist/DNA/' #the rootDir name to the directories that contain the tree files IQtreeFileName = 'alignment.nex.iqtree' #the name of the iqtree file with .iqtree extension alignmentFileName = 'alignment.nex' #the name of the alignment file with extension parametersFile = 'GTRparam.csv' #the name of the GTR parameters output file with .csv extension df = pd.DataFrame() for DirName, subdirList, fileList in os.walk(rootDir): if IQtreeFileName in fileList: '''if you didn't allow different GTR models for each partition, please use parameters2 function instead of parametres function''' try:
for file in treefiles: if "newick" not in file: continue label = file.split(".")[0] patient = label.split("_")[0] if "all" not in file and hasSix(patient): #Skip 4-tip versions of the trees. continue if onlysomepatients and label not in somepatients: continue trees = [] for line in open(treedir + file, "r"): if "#" in line: continue tree = Tree(line.rstrip()) trees.append(tree) for index, tree in enumerate(trees): print(tree) for branch in tree.traverse(): branch.dist = round(branch.dist) line = tree.write(format=1) rootlen = round(tree.get_tree_root().dist) line = line[:-1] + ":" + str(rootlen) + ";" for branch in tree.traverse(): if branch.name != "": name_face = AttrFace("name", fsize=30) branch.add_face(name_face, column=3, position="branch-right") pngfile = sigpngs[label][tuple( branch.name.split("-")[0].split("_")[0:1])]
#!/homes/carlac/anaconda_ete/bin/python
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Reads the tree file named by the first command-line argument, unroots it
# and prints it to stdout in format 5.

import sys, os

from ete3 import Tree

if len(sys.argv) < 2:
    sys.stderr.write("Usage: %s <tree file>\n" % sys.argv[0])
    sys.exit(1)

infile = sys.argv[1]

if not os.path.isfile(infile):
    # write() accepts a single string, so the name is interpolated first
    sys.stderr.write("File %s not found\n" % infile)
    sys.exit(1)

t = Tree(infile)
root = t.get_tree_root()
root.unroot()
print(root.write(format=5))
def orthologies_with_outgroup(forest, duplicated_sp, outgroup, dict_genes, out):
    """
    Browses a gene tree forest and searches for orthologs with the outgroup.
    Writes genes without phylogenetic orthologs to a file.
    Also writes files with high-confidence orthologs and paralogs to use to optimize the synteny
    support threshold to call orthology.

    Args:
        forest (str): name of the gene trees forest file
        duplicated_sp (list of str): list of all duplicated species for the considered WGD
        outgroup (str): non-duplicated outgroup
        dict_genes (dict of GeneSpeciesPosition tuples): all gene positions for each species
        out (str): output file to write genes without phylogenetic orthologs

    Returns:
        dict: orthologs of outgroup genes in each duplicated species

    Note:
        #FIXME Written to work within scorpios as orthologs and paralogs file names are derived
        from output file patterns, assuming it contains an '_'.
    """
    ortho = {e: {} for e in duplicated_sp}

    # The orthologs/paralogs file names are `out` with the part of its base
    # name before the first '_' replaced.
    orthofile = out.replace(out.split("/")[-1].split('_')[0], "orthologs")
    parafile = out.replace(out.split("/")[-1].split('_')[0], "paralogs")

    with open(out, 'w') as outfile, open(forest, 'r') as infile, open(parafile, 'w') as out_para,\
         open(orthofile, 'w') as out_ortho:

        sys.stderr.write(
            "Browsing gene trees for orthologies with the outgroup...\n")

        for tree in ut.read_multiple_objects(infile):

            #load tree
            tree = Tree(tree.strip(), format=1)
            node2leaves = tree.get_cached_content()
            leaves = [i for i in tree.get_leaves()]

            #add a tag to genes of duplicated species
            tag_duplicated_species(leaves, duplicated_sp)

            #find all clades with only genes of duplicated species
            subtrees = tree.get_monophyletic(values=["Y"],
                                             target_attr="duplicated")

            #find all outgroup genes
            outgroup_genes = [i for i in leaves if i.S == outgroup]

            #search for an ortholog gene in the outgroup for all clades of teleost genes
            for subtree in subtrees:
                # candidates for this clade: (topo_distance, branch_distance, outgroup leaf)
                seen = {}
                subtree_leaves = subtree.get_leaves()
                found = False

                #browse all outgroup genes
                for j in outgroup_genes:

                    #find the node that splits the outgroup gene and duplicated species genes
                    lca = tree.get_common_ancestor(subtree, j)
                    # number of leaves under the common ancestor
                    topo_distance = len(node2leaves[lca])

                    # if it is a speciation or dubious duplication node --> speciation
                    if org.is_speciation(lca):
                        branch_distance = tree.get_distance(subtree, j)
                        if subtree not in seen:
                            seen[subtree] = []
                        seen[subtree].append(
                            (topo_distance, branch_distance, j))
                        found = True

                # if no 'true' ortholog
                # check if all descendants include only outgroup + duplicated species
                if not found:
                    for j in outgroup_genes:
                        lca = tree.get_common_ancestor(subtree, j)
                        for gene in lca.get_leaves():
                            if gene.duplicated != "Y" and gene.S != outgroup:
                                break

                        #if no break, it means all descendants are outgroup or dup.
                        else:
                            topo_distance = len(node2leaves[lca])
                            branch_distance = tree.get_distance(subtree, j)
                            seen[subtree] = seen.get(subtree, [])
                            seen[subtree].append(
                                (topo_distance, branch_distance, j))

                # if an ortholog was found, add it to the orthology dict
                if seen:
                    content = []

                    # closest candidate first: fewest leaves under the
                    # common ancestor, then shortest branch distance
                    seen[subtree].sort(key=lambda x: (x[0], x[1]))
                    outgroup_gene = seen[subtree][0]
                    outgroup_gene = outgroup_gene[2].name

                    for species in duplicated_sp:
                        genes = [
                            i.name for i in subtree_leaves if i.S == species
                        ]
                        genes = get_genes_positions(genes, species, dict_genes)
                        ortho[species][outgroup_gene] = ortho[species].get(
                            outgroup_gene, [])
                        ortho[species][outgroup_gene] += genes
                        content += [g.name+'_'+species.replace(' ', '.')+\
                                    '|'+str(g.chromosome)+\
                                    '|'+str(g.index) for g in genes]

                    # outgroup genes of this tree that are not candidates
                    # are used as paralogs
                    all_ortho = [i[2].name for i in seen[subtree]]
                    paralogs = [
                        i.name for i in outgroup_genes
                        if i.name not in all_ortho
                    ]

                    if paralogs:
                        # one paralog is drawn at random
                        paralog = random.choice(paralogs)

                        if paralog in dict_genes[outgroup]\
                           and outgroup_gene in dict_genes[outgroup]:
                            tmp_dict = dict_genes[outgroup]

                            out_ortho.write(' '.join(content) + '\t')
                            out_ortho.write(str(outgroup_gene)+'|'+\
                                            str(tmp_dict[outgroup_gene].chromosome)+'|'+\
                                            str(tmp_dict[outgroup_gene].index)+'|'+str(0)+'|'+\
                                            str(0)+'\n')

                            out_para.write(' '.join(content) + '\t')
                            out_para.write(str(paralog)+'|'+\
                                           str(tmp_dict[paralog].chromosome)+'|'+\
                                           str(tmp_dict[paralog].index)+'|'+\
                                           str(0)+'|'+str(0)+'\n')

                # if no ortholog found
                # write genes without ortholog along with all outgroup genes in tree
                # (potential candidate for orthology)
                elif any(i.name in dict_genes[outgroup] for i in outgroup_genes):

                    #genes without orthologs
                    missed_genes = []
                    for species in duplicated_sp:
                        genes = [
                            i.name for i in subtree_leaves if i.S == species
                        ]
                        genes = get_genes_positions(genes, species, dict_genes)
                        missed_genes += [g.name+'_'+species.replace(' ', '.')+\
                                         '|'+str(g.chromosome)+\
                                         '|'+str(g.index) for g in genes]

                    # NOTE(review): the candidate section below is nested
                    # under this test so each output line is complete -
                    # confirm the nesting against the project sources.
                    if missed_genes:
                        outfile.write(' '.join(missed_genes) + '\t')

                        #candidate orthologs in the outgroup
                        outgr_genes = [i.name for i in outgroup_genes]

                        # pairs of outgroup genes at topological distance 1
                        in_paralogs = []
                        for pair in itertools.combinations(outgr_genes, 2):
                            if tree.get_distance(pair[0], pair[1],
                                                 topology_only=True) == 1:
                                in_paralogs.append(pair[0] + '|' + pair[1])

                        outgr_write = []
                        genome = dict_genes[outgroup]
                        for gene in outgr_genes:
                            if gene in genome:
                                lca = tree.get_common_ancestor(subtree, gene)
                                branch_distance = tree.get_distance(
                                    subtree, gene)
                                topo_distance = len(node2leaves[lca])
                                outgr_write.append(str(gene)+'|'+str(genome[gene].chromosome)+'|'+\
                                                   str(genome[gene].index)+'|'+str(topo_distance)+\
                                                   '|'+str(branch_distance))

                        outfile.write(' '.join(outgr_write) + '\t' +
                                      ' '.join(in_paralogs) + '\n')

    sys.stderr.write("Phylogenetic orthologies with the outgroup OK\n")
    return ortho
#!/usr/bin/env python from ete3 import Tree, PhyloTree from random import * # GET THE 1st BOOTSRAP SAMPLE TREE filename = "for_isaac/RAxML_bootstrap.orfg1" file = open(filename, "r") first_tree = file.readline()[:-1] # [:-1] Gets ride of newline at the end of the line # MAKE IT INTO AN ETE TREE t = Tree(first_tree, format=1) print "ORIGINAL TREE\n" print t # GET A LIST OF THE LEAVES (by name or node class) print "\n LEAVES" # leaves = t.get_leaves() leaves = t.get_leaf_names() for index, leaf in enumerate(leaves): print (index, leaf) # GET 4 RANDOM INDICES TO PRUNE indices = sample(range(0, len(leaves)), 4) print "\nRANDOM 4 INDICES: " + ', '.join(str(x) for x in indices) # USE THOSE INDICES TO GET 4 RANDOM NODES to_prune = [] for index in indices: to_prune.append(leaves[index])
def Max_cut(taxa, trip_d):
    """Recursively build a tree over ``taxa`` from triplet information.

    Args:
        taxa: collection of taxon names. A single-element collection has its
            element popped and returned.
        trip_d: mapping from a comma-separated taxon string to a list of
            triplets ``(a, (b, c, ...))``; only entries whose taxa all lie
            in ``taxa`` are used.

    Returns:
        The taxon name (str) when ``taxa`` has one element, otherwise a
        ``Tree`` whose children are the recursively built parts.
    """
    if len(taxa) == 1:
        return taxa.pop()

    t = Tree()
    if len(taxa) == 2:
        pair = list(taxa)
        t.add_child(name=pair[0])
        t.add_child(name=pair[1])
        return t

    # position of every taxon in the square count matrices
    d = {name: position for position, name in enumerate(taxa)}
    size = len(taxa)
    good_mat = [[0 for _ in range(size)] for _ in range(size)]
    bad_mat = [[0 for _ in range(size)] for _ in range(size)]
    good = []
    bad = []

    taxa_set = set(taxa)
    for key in trip_d:
        if not set(key.split(',')).issubset(taxa_set):
            continue
        for tri in trip_d[key]:
            a = d[tri[0]]
            b = d[tri[1][0]]
            c = d[tri[1][1]]
            # an edge is listed only the first time its count is raised
            if bad_mat[b][c] < 1:
                bad.append(tri[1])
            if good_mat[a][b] < 1:
                good.append((tri[0], tri[1][0]))
            if good_mat[a][c] < 1:
                good.append((tri[0], tri[1][1]))
            bad_mat[b][c] += 1
            bad_mat[c][b] += 1
            good_mat[a][b] += 1
            good_mat[a][c] += 1
            good_mat[b][a] += 1
            good_mat[c][a] += 1

    taxa = set(taxa)
    g = Graph(good, bad, good_mat, bad_mat, d, directed=False)
    parts = clades_from_graph(set(taxa), g)
    if len(parts) <= 1:
        # a single component: split it with an explicit cut instead
        parts = findCut(set(taxa), g, d)

    for part in parts:
        child = Max_cut(part, trip_d)
        if isinstance(child, str):
            t.add_child(name=child)
        else:
            t.add_child(child)
    return t
from ete3 import Tree, NodeStyle, TreeStyle

#output_dir = "C:/Users/ItayM5/Google Drive/MSc/posters and presentations/presentations/supplementary_materials/"
output_dir = "/groups/itay_mayrose/halabikeren/graphics/"
output_name = output_dir + "tree.png"

# The Newick terminator ';' has to follow the last closing parenthesis, and
# format=1 is needed so that N1/N2 are read as internal node names.
tree_str = '((S1:1,S2:1)N1:1,(S3:1,S4:1)N2:1);'
tree = Tree(tree_str, format=1)

# Basic tree style
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_scale = True
ts.rotation = 90
ts.branch_vertical_margin = 5

# Style for internal nodes: solid silver lines, small silver circle
internals = NodeStyle()
internals["hz_line_type"] = 0
internals["vt_line_type"] = 0
internals["vt_line_width"] = 2
internals["hz_line_width"] = 2
internals["hz_line_color"] = "Silver" #454545" darker gray
internals["vt_line_color"] = "Silver"
internals["shape"] = "circle"
internals["size"] = 3
internals["fgcolor"] = "Silver"

# Style for clades
clade = NodeStyle()
clade["hz_line_type"] = 0
clade["vt_line_type"] = 0
clade["vt_line_width"] = 2
def generateGuestPoints(self, pipeWidth=75):
    """Insert loss nodes into the guest tree and give every guest node a ``coord``.

    Steps, in order: add "L_..." nodes to ``self.guest``, compute a ``level``
    feature per guest node, count the guest nodes per level of each host
    node, then derive an (x, y) ``coord`` for every guest node from the
    ``coord`` of its host node.

    Args:
        pipeWidth: horizontal half-width used when spreading the guest
            nodes that share a host node and level (default 75).

    NOTE(review): the nesting below was reconstructed from whitespace-mangled
    source - confirm it, in particular the placement of
    ``guest_parent = node`` and of the ``else`` that advances ``node``.
    """
    #Add in loss nodes
    for leaf in self.guest:
        node = leaf
        while node != self.guest:
            host_me = self.nodemap[node]
            host_parent = self.nodemap[node.up]

            # lchild / rchild: exactly one child maps to the same host node
            if len(node.children) == 2:
                lchild = self.nodemap[
                    node.children[0]] == host_me and self.nodemap[
                        node.children[1]] != host_me
                rchild = self.nodemap[
                    node.children[1]] == host_me and self.nodemap[
                        node.children[0]] != host_me

            if len(node.children) == 2 and (lchild or rchild):
                # the child mapped to a different host gets a loss node above it
                if self.nodemap[node.children[0]] == host_me:
                    tofix = node.children[1]
                else:
                    tofix = node.children[0]
                temp = Tree()
                temp.name = "L_" + node.name
                # NOTE(review): bare `nodemap`, not `self.nodemap` - raises
                # NameError unless a module-level `nodemap` exists; confirm.
                nodemap[temp] = host_me
                temp.up = tofix.up
                temp.children = [tofix]
                tofix.up = temp
                if tofix == node.children[0]:
                    node.children[0] = temp
                else:
                    node.children[1] = temp

            if host_me != host_parent and host_me.up != host_parent:
                #Add loss nodes in
                dist = host_parent.get_distance(host_me, topology_only=True)
                guest_parent = node.up
                curr = node
                for i in range(int(dist)):
                    temp = Tree()
                    temp.name = "L_" + str(i) + "_" + guest_parent.name
                    # NOTE(review): bare `nodemap` again, see above.
                    nodemap[temp] = nodemap[curr].up
                    temp.up = curr.up
                    temp.children = [curr]
                    curr.up = temp
                    if curr == guest_parent.children[0]:
                        guest_parent.children[0] = temp
                    else:
                        guest_parent.children[1] = temp
                    curr = temp
                guest_parent = node
            else:
                node = node.up

    #Add levels
    for node in self.guest.traverse():
        node.add_feature('level', -1)

    for leaf in self.guest:
        node = leaf
        node.level = 0
        currmap = self.nodemap[node]
        currlevel = 0
        node = node.up
        # walk up to the root: the level grows by one while the host node
        # stays the same and restarts at 0 when it changes
        while node != None:
            mymap = self.nodemap[node]
            if mymap == currmap:
                node.level = max(node.level, currlevel + 1)
            else:
                node.level = max(node.level, 0)
            currlevel = node.level
            currmap = mymap
            node = node.up

    #How many points at each level of a node in the host tree?
    rmap = {}  #map of host -> guest
    for key in self.nodemap:
        rkey = self.nodemap[key]
        if rkey in rmap:
            rmap[rkey].append(key)
        else:
            rmap[rkey] = [key]

    hostlevels = {}  # hostnode -> levelcounts
    usedlevels = {
    }  # same as hostlevels, but will count how many of each level have been used so far
    for key in rmap:
        nodes = rmap[key]
        maxlevel = 0
        for node in nodes:
            maxlevel = max(maxlevel, node.level)
        levelsizes = [0 for _ in range(maxlevel + 1)]
        for node in nodes:
            levelsizes[node.level] += 1
        hostlevels[key] = levelsizes
        usedlevels[key] = [0 for _ in range(maxlevel + 1)]

    #Generate Points - this only works for generateSpeciesTree2
    for node in self.guest.traverse():
        hostnode, level = self.nodemap[node], node.level
        used = usedlevels[hostnode][level]
        maxlevel = len(usedlevels[hostnode])
        usedlevels[hostnode][level] += 1

        # segment of the host branch: from the host node up to its parent
        # (100 units above the node itself for the host root)
        bottom = hostnode.coord
        if hostnode == self.host:
            top = list(self.host.coord)
            top[1] -= 100
        else:
            top = hostnode.up.coord

        # y: fraction level / maxlevel of the way up the segment
        ydiff = bottom[1] - top[1]
        yused = ydiff * level / maxlevel
        y = bottom[1] - yused

        # x: centre of the segment at that height, shifted so that nodes
        # sharing a level are spread evenly over 2 * pipeWidth
        xlow, xhigh = bottom[0], top[0]
        xmid = int(xlow + (xhigh - xlow) * (yused / float(ydiff)))
        xused = int(pipeWidth * 2 * (used + 1) /
                    (float(hostlevels[hostnode][level]) + 1))
        x = xmid + xused - pipeWidth

        node.add_feature('coord', (x, y))
def changeTree(inTree, outTree):
    """Write a copy of a tree in which every non-root branch has length 1.

    Args:
        inTree: tree file (or Newick string) loaded with ``Tree``.
        outTree: path the modified tree is written to.
    """
    tree = Tree(inTree)
    for descendant in tree.iter_descendants():
        descendant.dist = 1
    tree.write(outfile=outTree)
def write_resolved_tree(orthog_tree, outgr_gene_name, out):
    """
    Writes the solution tree for an orthogroup with only 2 genes: the
    orthogroup node and the outgroup gene joined under a new root.

    Args:
        orthog_tree (ete3.TreeNode): node with the 2 descendants of the orthogroup.
        outgr_gene_name (str): full outgroup gene name (with species tag).
        out (str): filename to write the tree (format 1).
    """
    resolved = Tree()
    resolved.add_child(orthog_tree)
    resolved.add_child(name=outgr_gene_name)

    # pruning to the full set of leaves keeps every leaf
    resolved.prune(resolved.get_leaves())
    resolved.write(outfile=out, format=1)
def main(arg1, arg2):
    """Compare the splits of two trees and return a split-distance count.

    Both trees are rooted on the node returned by ``getRandomNode(tree1)``.
    Leaves of tree1 are numbered in postorder and the same numbers are
    copied onto tree2. Internal nodes are then labelled "[min:max]" with
    the range of leaf numbers beneath them, and a tree2 node counts as
    shared when tree1 contains a node with the same label.

    :param arg1: first tree, in any form accepted by ``Tree(...)``.
    :param arg2: second tree, same form.
    :return: ``Num_splits1 + Num_splits2 - 2 * Num_shared`` (presumably a
        Robinson-Foulds distance, judging by the name ``rf_dist``).

    Side effects: prints each unshared tree1 split and the distance, and
    sets the module-level ``leaf_num``.

    NOTE(review): block nesting was reconstructed from a whitespace-
    collapsed source; confirm against the original layout.
    """
    list_splits1 = []
    list_splits2 = []
    t1 = Tree()
    tree1 = Tree(arg1)
    tree2 = Tree(arg2)
    # Root both trees on the same node (assumes getRandomNode returns
    # something set_outgroup accepts for both trees -- TODO confirm).
    node_midpoint = getRandomNode(tree1)
    tree1.set_outgroup(node_midpoint)
    tree2.set_outgroup(node_midpoint)
    # Keep only the second child of each root; the first is discarded in t1.
    t1, tree2 = tree2.get_tree_root().children
    t1, tree1 = tree1.get_tree_root().children
    count = 0
    for leaf in tree1.traverse("postorder"):
        if (leaf.name.strip()):
            # Named node: give it the next order number in both trees.
            count += 1
            leaf.add_features(order=count)
            CurrentNode2 = tree2 & leaf.name
            CurrentNode2.add_features(order=count)
        elif (leaf.name != node_midpoint):
            # Unnamed node: mark as internal.
            leaf.name = "int"
    for node in tree2.traverse("postorder"):
        if (node.name == ""):
            node.name = "int"
    Num_splits1 = 0
    Num_splits2 = 0
    Num_shared = 0
    for node in tree1.traverse("postorder"):
        list_leaves = []
        if ((node.name == "int")):
            Num_splits1 += 1
            cmin = float("+inf")
            cmax = 0
            d1, d2 = node.get_children()
            # Temporary root used only to iterate the leaves under `node`.
            subtree = Tree()
            subtree.add_child(d1)
            subtree.add_child(d2)
            for leaf in subtree:
                list_leaves.append(leaf.name)
                if ((leaf.name != "int")):
                    CurrentNode2 = tree1 & leaf.name
                    cmin = min(CurrentNode2.order, cmin)
                    cmax = max(CurrentNode2.order, cmax)
            if ((node.is_root() == False)):
                # Label the split by the range of leaf order numbers below it.
                node.name = "[" + str(cmin) + ":" + str(cmax) + "]"
                list_splits1.append(sorted(list_leaves))
    for node in tree2.traverse("postorder"):
        list_leaves = []
        if ((node.name == "int") and (node.is_root() == False)):
            Num_splits2 += 1
            cmin = float("+inf")
            cmax = 0
            size = 0
            d1, d2 = node.get_children()
            subtree2 = Tree()
            subtree2.add_child(d1)
            subtree2.add_child(d2)
            for leaf in subtree2:
                size += 1
                list_leaves.append(leaf.name)
                if ((leaf.name != "int") and (leaf.name != node_midpoint)):
                    CurrentNode2 = tree2 & leaf.name
                    cmin = min(CurrentNode2.order, cmin)
                    cmax = max(CurrentNode2.order, cmax)
            # Only a contiguous range of order numbers can match a tree1 label.
            if (size == (cmax - cmin + 1)):
                node.name = "[" + str(cmin) + ":" + str(cmax) + "]"
                if (tree1.search_nodes(name=node.name)):
                    Num_shared += 1
                    # Shared split: drop it from the tree1-only list.
                    list_splits1.remove(sorted(list_leaves))
                else:
                    list_splits2.append(sorted(list_leaves))
    global leaf_num
    # NOTE(review): ts and style1 are built but not used in this function.
    ts = TreeStyle()
    ts.show_leaf_name = True
    style1 = NodeStyle()
    style1["hz_line_color"] = "#ff0000"
    leaf_num = len(tree2.get_leaves())
    rf_dist = Num_splits1 + Num_splits2 - (2 * Num_shared)
    # Reload the first tree to list all leaf names for the report below.
    tree1 = Tree(arg1)
    L = []
    for leaf in tree1:
        L.append(leaf.name)
    L = sorted(L)
    for i in list_splits1:
        # Print each remaining tree1 split as "side || complement".
        rem = set(L) - set(i)
        print(i, "||", list(rem))
    #print(list_splits1)
    #print(list_splits2)
    print(rf_dist)
    return rf_dist
newick_file = os.path.join(args.outdir,args.prefix+'_newick.tre') fp = open(newick_file,'w') fp.write(newick) fp.close() tr = LoadTree(treestring=newick) #dendrogram = UnrootedDendrogram(tr) #print dendrogram #dendrogram.showFigure() print(tr.asciiArt()) ts = TreeStyle() ts.show_leaf_name = True ts.show_branch_length = True ts.show_branch_support = True print('NEWICK='+json.dumps(newick)) rooted_tree = Tree( newick ) #svgfile = os.path.join('/Users/avoorhis/programming/jupyter/VAMPS_API',args.prefix+'_dendrogram.svg') svgfile = os.path.join(args.outdir,args.prefix+'_dendrogram.svg') print(os.getcwd()) #print svgfile print('rendering0') rooted_tree.render(svgfile, tree_style=ts) # writes file to tmp if args.function == 'pcoa_3d': print('starting pcoa_3d') from skbio import DistanceMatrix dm = DistanceMatrix(dm1) print(dm) print('end')
def plot_phylo_tree(rdata, colname, name, workdir, outdir):
    """
    Generate the phylogenetic tree (dendrogram) for the PSC method.

    A dendrogram is generated using domain pairwise scores and written in
    the newick format to ``<outdir>/<name>_dendro.nw``; the intermediate
    distance matrix goes to ``<workdir>/dist.csv``. The tree is rendered to
    ``figures/<name>_ptree.png`` only when the dataset has at most 300
    domains.

    :param rdata: (dataframe) Pairwise similarity scores data
    :param colname: (string) Name of column to take similarity scores from
    :param name: (string) Name of PSC method
    :param workdir: (string) Path to output directory where intermediate
                    processing data files can be stored
    :param outdir: (string) Path to output directory where processed data
                    files can be found
    :return: ``True`` when the dendrogram was written, ``None`` when the
             pivot table or the distance matrix could not be built.
    """
    print('\t', colname)
    dist_file = '%s%sdist.csv' % (workdir, os.path.sep)
    dendro_path = '%s%s%s_dendro.nw' % (outdir, os.path.sep, name)
    tree_path = "figures%s%s_ptree.png" % (os.path.sep, name)

    # create pivot table for similarity scores of psc method
    try:
        p = rdata.pivot(index='dom1', columns='dom2', values=colname)
    except Exception:
        print('pivot not generated for %s' % colname)
        return

    # A domain is identical to itself. Use a single positional indexer:
    # chained `p.iloc[i][i] = 1` may write to a temporary copy.
    for i in range(min(p.shape)):
        p.iloc[i, i] = 1

    # (convert to distance matrix) and write it to file
    p = 1 - p
    p.to_csv(dist_file)

    # make name to class matrix (`.values` replaces the removed as_matrix())
    dom_classification = dict(rdata[['dom1', 'cath1']].values)
    classes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']  # SCOP
    cl = [
        'blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black',
        'black', 'black', 'black', 'black', 'black'
    ]
    class_color = dict(zip(classes, cl))

    # make SCOP Class domains
    # NOTE(review): scop_class_domains is built but not used further here.
    scop_class_domains = {}
    for k, v in dom_classification.items():
        scop_class_domains.setdefault(v[:1], []).append(k)

    # create domain dendrogram from pivot data and store to file
    try:
        with open(dist_file) as src:
            pdm1 = dendropy.PhylogeneticDistanceMatrix.from_csv(
                src=src, delimiter=",")
    except Exception:
        print('error reading file', dist_file)
        return
    nj_tree = pdm1.nj_tree()
    with open(dendro_path, 'w') as dendro_out:
        nj_tree.write(file=dendro_out, schema='newick')

    if len(p) > 300:
        return True

    # make the tree to visualize if the number of domains is at most 300
    t = Tree(str(nj_tree) + ';')

    # Creates an independent node style for each leaf, which is
    # initialized with a foreground color depending on node class.
    for n in t.traverse():
        if not n.is_leaf():
            continue
        dom_class = dom_classification[n.name.replace('\'', '')]
        nstyle = NodeStyle()
        nstyle["fgcolor"] = class_color[dom_class[0]]
        nstyle["size"] = 25
        n.set_style(nstyle)

    circular_style = TreeStyle()
    circular_style.mode = "c"
    t.render(tree_path, tree_style=circular_style)

    # calculate all-to-all distances
    #   same class                          different classes
    #   total distance, number of nodes,    total distance, number of nodes
    # NOTE(review): f_distances is accumulated but never returned or stored.
    f_distances = [
        [0, 0, 0, 0],  # a
        [0, 0, 0, 0],  # b
        [0, 0, 0, 0],  # c
        [0, 0, 0, 0],  # d
        [0, 0, 0, 0],  # e
    ]
    class_idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    for n_start in t.traverse():
        if not n_start.is_leaf():
            continue
        n_start_class = dom_classification[n_start.name.replace('\'', '')][0]
        f_distance = f_distances[class_idx[n_start_class]]
        for n_end in t.traverse():
            if not n_end.is_leaf():
                continue
            if n_start == n_end:
                continue
            d = n_start.get_distance(n_end, topology_only=True)
            n_end_class = dom_classification[n_end.name.replace('\'', '')][0]
            if n_start_class == n_end_class:
                f_distance[0] += d
                f_distance[1] += 1
            else:
                f_distance[2] += d
                f_distance[3] += 1
    return True
def main(arg1, arg2):
    """Build a supertree from the triplets of a collection of input trees.

    :param arg1: path to a file with one newick tree per line when
        ``arg2 == 2``; otherwise an iterable of newick strings.
    :param arg2: input mode; ``2`` means "read trees from file ``arg1``".
    :return: tuple ``(supertree, 0, 0, triplets_dict)`` where ``supertree``
        is the result of ``Max_cut`` and ``triplets_dict`` maps a
        comma-joined, sorted taxon triple to the distinct triplets seen
        for that triple.
    """
    if arg2 == 2:
        with open(arg1) as f:
            content = f.readlines()
    else:
        content = arg1

    triplets = []
    taxa = []
    for newick in content:
        t1 = Tree(newick)
        t1.resolve_polytomy()
        for leaf in t1:
            if leaf.is_leaf():
                taxa.append(leaf.name)
        # triplet_decompose extends and returns the running triplet list.
        _, triplets = tp.triplet_decompose(t1, triplets)
    taxa = set(taxa)

    # Group triplets by their (unordered) set of three taxa.
    triplets_dict = defaultdict(list)
    for t in triplets:
        trip_key = [t[1][0], t[1][1], t[0]]
        trip_key.sort()
        # Key format kept exactly as before: list repr without brackets,
        # quotes or spaces.
        trip_key = str(trip_key).strip('[]')
        trip_key = trip_key.replace("'", '')
        trip_key = trip_key.replace(" ", "")
        if t not in triplets_dict[trip_key]:
            triplets_dict[trip_key].append(t)

    supertree = Max_cut(taxa, triplets_dict)
    return supertree, 0, 0, triplets_dict
for encoded in encodeds: alignment = v.decodeSequenceAlignment(encoded) score=np.floor((100-alignment.percentIdentity())* len(np.array(alignment))/100) print(i,'-',j,':',score) Matrix[i,j]=score Matrix=Matrix.T #np.save('M.npy',Matrix) #%% #Matrix=np.load('M.npy',allow_pickle=True) #%% import upgma tree=upgma.UPGMA(Matrix, name) from ete3 import Tree unrooted_tree = Tree(tree+';') print (unrooted_tree) #%% To_plot=0 if To_plot==1: from ete3 import Tree, TreeStyle ts = TreeStyle() ts.show_leaf_name = True ts.branch_vertical_margin = 10 unrooted_tree.show(tree_style=ts)
class Species(object):
    """Represents a collection of genomes in `path`

    :param path: Path to the directory of related genomes you wish to analyze.
    :param max_unknowns: Number of allowable unknown bases, i.e. not [ATCG]
    :param contigs: Acceptable deviations from median number of contigs
    :param assembly_size: Acceptable deviations from median assembly size
    :param mash: Acceptable deviations from median MASH distances
    :param assembly_summary: a pandas DataFrame with assembly summary information

    NOTE(review): the ``attr.ib`` fields and ``__attrs_post_init__`` only
    take effect if the class is decorated with ``attr.s``; no decorator is
    visible at this definition -- confirm. Several methods also read
    ``self.genomes``, ``self.dmx`` and ``self.metadata_path``, which are
    not assigned anywhere in this class.
    """

    path = attr.ib(default=Path(), converter=Path)
    max_unknowns = attr.ib(default=200)
    # TODO These are really about attrib names
    contigs = attr.ib(default=3.0)
    assembly_size = attr.ib(default=3.0)
    mash = attr.ib(default=3.0)
    assembly_summary = attr.ib(default=None)
    metadata = attr.ib(default=None)

    def __attrs_post_init__(self):
        """Derive output paths and load any previously written QC results."""
        self.log = logbook.Logger(self.path.name)
        self.label = "-".join(
            map(str, [
                self.max_unknowns, self.contigs, self.assembly_size, self.mash
            ]))
        self.paths = config.Paths(
            root=self.path,
            subdirs=[
                "qc",
                ("results", f"qc/{self.label}"),
                ("passed", f"qc/{self.label}/passed"),
                ".logs",
            ],
        )
        self.stats_path = os.path.join(self.paths.qc, "stats.csv")
        self.nw_path = os.path.join(self.paths.qc, "tree.nw")
        self.dmx_path = os.path.join(self.paths.qc, "dmx.csv")
        self.failed_path = os.path.join(self.paths.qc, "failed.csv")
        self.summary_path = os.path.join(self.paths.qc, "qc_summary.txt")
        self.paste_file = os.path.join(self.paths.qc, "all.msh")
        # Figure out if defining these as None is necessary
        self.tree = None
        self.stats = None
        if os.path.isfile(self.stats_path):
            self.stats = pd.read_csv(self.stats_path, index_col=0)
        if os.path.isfile(self.nw_path):
            self.tree = Tree(self.nw_path, 1)
        if os.path.isfile(self.failed_path):
            self.failed_report = pd.read_csv(self.failed_path, index_col=0)
        self.tolerance = {
            "unknowns": self.max_unknowns,
            "contigs": self.contigs,
            "assembly_size": self.assembly_size,
            "distance": self.mash,
        }
        self.passed = self.stats
        self.failed = {}
        self.med_abs_devs = {}
        self.dev_refs = {}
        self.allowed = {"unknowns": self.max_unknowns}

    def __str__(self):
        self.message = [
            "Species: {}".format(self.path.name),
            "Maximum Unknown Bases: {}".format(self.max_unknowns),
            "Acceptable Deviations:",
            "Contigs, {}".format(self.contigs),
            "Assembly Size, {}".format(self.assembly_size),
            "MASH: {}".format(self.mash),
        ]
        return "\n".join(self.message)

    @property
    def genome_paths(self, ext="fasta"):
        """Return the paths of every file in `path` ending with `ext`.

        Because this is a property, `ext` can never be supplied by a
        caller and is always "fasta".

        :returns: paths of all matching files in the species directory
        :rtype: list
        """
        return [
            os.path.join(self.path, genome)
            for genome in os.listdir(self.path) if genome.endswith(ext)
        ]

    @property
    def sketches(self):
        """Iterator over the "GCA*msh" files in the qc directory."""
        return Path(self.paths.qc).glob("GCA*msh")

    @property
    def total_sketches(self):
        return len(list(self.sketches))

    @property
    def genome_names(self):
        ids = [i.name for i in self.genomes]
        return pd.Index(ids)

    @property
    def biosample_ids(self):
        ids = self.assembly_summary.df.loc[
            self.accession_ids].biosample.tolist()
        return ids

    # may be redundant. see genome_names attrib
    @property
    def accession_ids(self):
        ids = [
            i.accession_id for i in self.genomes if i.accession_id is not None
        ]
        return ids

    def get_tree(self):
        """Cluster the distance matrix, root the tree and write it to nw_path."""
        from ete3.coretype.tree import TreeError
        import numpy as np
        from skbio.tree import TreeNode
        from scipy.cluster.hierarchy import weighted

        ids = self.dmx.index.tolist()
        # `.values` replaces DataFrame.as_matrix(), removed in pandas 1.0.
        triu = np.triu(self.dmx.values)
        hclust = weighted(triu)
        t = TreeNode.from_linkage_matrix(hclust, ids)
        nw = str(t).replace("'", "")
        self.tree = Tree(nw)
        try:
            # midpoint root tree
            self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
        except TreeError:
            self.log.error("Unable to midpoint root tree")
        self.tree.write(outfile=self.nw_path)

    @property
    def stats_files(self):
        return Path(self.paths.qc).glob("GCA*csv")

    def MAD(self, df, col):
        """Get the median absolute deviation for col

        Computed as the mean of the absolute differences from the median.
        """
        MAD = abs(df[col] - df[col].median()).mean()
        return MAD

    # Declared static: the original signature has no `self`, so it could
    # not be called on an instance.
    @staticmethod
    def MAD_ref(MAD, tolerance):
        """Get the reference value for median absolute deviation"""
        dev_ref = MAD * tolerance
        return dev_ref

    @staticmethod
    def bound(df, col, dev_ref):
        """Return the (lower, upper) bounds `median -/+ dev_ref` for col."""
        lower = df[col].median() - dev_ref
        upper = df[col].median() + dev_ref
        return lower, upper

    def filter_unknown_bases(self):
        """Filter out genomes with too many unknown bases."""
        self.failed["unknowns"] = self.stats.index[
            self.stats["unknowns"] > self.tolerance["unknowns"]]
        self.passed = self.stats.drop(self.failed["unknowns"])

    # TODO Don't use decorator; perform this logic in self.filter
    def check_passed_count(f):
        """
        Count the number of genomes in self.passed.
        Commence with filtering only if self.passed has more than five genomes.
        Otherwise record empty-string placeholders in self.allowed and
        self.failed for the criteria (the first positional argument).
        """

        @functools.wraps(f)
        def wrapper(self, *args):
            if len(self.passed) > 5:
                f(self, *args)
            else:
                self.allowed[args[0]] = ""
                self.failed[args[0]] = ""
                self.log.info("Not filtering based on {}".format(f.__name__))

        return wrapper

    # todo remove unnecessary criteria parameter
    @check_passed_count
    def filter_contigs(self, criteria):
        """
        Only look at genomes with > 10 contigs to avoid throwing off the
        median absolute deviation.

        Median absolute deviation - Average absolute difference between
        number of contigs and the median for all genomes.

        Extract genomes with <= 10 contigs to add them back in later.
        """
        eligible_contigs = self.passed.contigs[self.passed.contigs > 10]
        not_enough_contigs = self.passed.contigs[self.passed.contigs <= 10]
        # TODO Define separate function for this
        med_abs_dev = abs(eligible_contigs - eligible_contigs.median()).mean()
        self.med_abs_devs["contigs"] = med_abs_dev
        # Define separate function for this
        # The "deviation reference"
        dev_ref = med_abs_dev * self.contigs
        self.dev_refs["contigs"] = dev_ref
        self.allowed["contigs"] = eligible_contigs.median() + dev_ref
        self.failed["contigs"] = eligible_contigs[
            abs(eligible_contigs - eligible_contigs.median()) > dev_ref].index
        eligible_contigs = eligible_contigs[
            abs(eligible_contigs - eligible_contigs.median()) <= dev_ref]
        eligible_contigs = pd.concat([eligible_contigs, not_enough_contigs])
        eligible_contigs = eligible_contigs.index
        self.passed = self.passed.loc[eligible_contigs]

    @check_passed_count
    def filter_MAD_range(self, criteria):
        """
        Filter based on median absolute deviation.
        Passing values fall within a lower and upper bound.
        """
        # Get the median absolute deviation
        med_abs_dev = abs(self.passed[criteria] -
                          self.passed[criteria].median()).mean()
        dev_ref = med_abs_dev * self.tolerance[criteria]
        lower = self.passed[criteria].median() - dev_ref
        upper = self.passed[criteria].median() + dev_ref
        allowed_range = (str(int(x)) for x in [lower, upper])
        allowed_range = "-".join(allowed_range)
        self.allowed[criteria] = allowed_range
        self.failed[criteria] = self.passed[
            abs(self.passed[criteria] -
                self.passed[criteria].median()) > dev_ref].index
        self.passed = self.passed[abs(
            self.passed[criteria] -
            self.passed[criteria].median()) <= dev_ref]

    @check_passed_count
    def filter_MAD_upper(self, criteria):
        """
        Filter based on median absolute deviation.
        Passing values fall under the upper bound.
        """
        # Get the median absolute deviation
        med_abs_dev = abs(self.passed[criteria] -
                          self.passed[criteria].median()).mean()
        dev_ref = med_abs_dev * self.tolerance[criteria]
        upper = self.passed[criteria].median() + dev_ref
        self.failed[criteria] = self.passed[
            self.passed[criteria] > upper].index
        self.passed = self.passed[self.passed[criteria] <= upper]
        upper = "{:.4f}".format(upper)
        self.allowed[criteria] = upper

    def base_node_style(self):
        """Apply a small black sphere style to every node of self.tree."""
        from ete3 import NodeStyle, AttrFace

        nstyle = NodeStyle()
        nstyle["shape"] = "sphere"
        nstyle["size"] = 2
        nstyle["fgcolor"] = "black"
        for n in self.tree.traverse():
            n.set_style(nstyle)
            # Only nodes whose name contains "fasta" get a name label.
            if re.match(".*fasta", n.name):
                nf = AttrFace("name", fsize=8)
                nf.margin_right = 150
                nf.margin_left = 3
                n.add_face(nf, column=0)

    # Might be better in a layout function
    def style_and_render_tree(self, file_types=("svg",)):
        """Add a title and legend to the tree and render one file per type.

        :param file_types: iterable of file extensions; each is rendered to
            ``<qc dir>/tree.<ext>``. Defaults to SVG only.
        """
        from ete3 import TreeStyle, TextFace, CircleFace

        ts = TreeStyle()
        title_face = TextFace(snakemake.config["species"].replace("_", " "),
                              fsize=20)
        title_face.margin_bottom = 10
        ts.title.add_face(title_face, column=0)
        ts.branch_vertical_margin = 10
        ts.show_leaf_name = True
        # Legend
        ts.legend.add_face(TextFace(""), column=1)
        for category in ["Allowed", "Deviations", "Filtered", "Color"]:
            category = TextFace(category, fsize=8, bold=True)
            category.margin_bottom = 2
            category.margin_right = 40
            ts.legend.add_face(category, column=1)
        for i, criteria in enumerate(CRITERIA, 2):
            title = criteria.replace("_", " ").title()
            title = TextFace(title, fsize=8, bold=True)
            title.margin_bottom = 2
            title.margin_right = 40
            cf = CircleFace(4, COLORS[criteria], style="sphere")
            cf.margin_bottom = 5
            filtered_count = len(
                list(filter(None, self.failed_report.criteria == criteria)))
            filtered = TextFace(filtered_count, fsize=8)
            filtered.margin_bottom = 5
            allowed = TextFace(self.allowed[criteria], fsize=8)
            allowed.margin_bottom = 5
            allowed.margin_right = 25
            # TODO Prevent tolerance from rendering as a float
            tolerance = TextFace(self.tolerance[criteria], fsize=8)
            tolerance.margin_bottom = 5
            ts.legend.add_face(title, column=i)
            ts.legend.add_face(allowed, column=i)
            ts.legend.add_face(tolerance, column=i)
            ts.legend.add_face(filtered, column=i)
            ts.legend.add_face(cf, column=i)
        for f in file_types:
            out_tree = os.path.join(self.paths.qc, "tree.{}".format(f))
            self.tree.render(out_tree, tree_style=ts)

    def color_tree(self):
        """Color the leaves of failed genomes by criteria, then render."""
        from ete3 import NodeStyle

        self.base_node_style()
        for failed_genome in self.failed_report.index:
            n = self.tree.get_leaves_by_name(failed_genome).pop()
            nstyle = NodeStyle()
            nstyle["fgcolor"] = COLORS[self.failed_report.loc[failed_genome,
                                                              "criteria"]]
            nstyle["size"] = 9
            n.set_style(nstyle)
        self.style_and_render_tree()

    def filter(self):
        """Run all filters in order, then write the summary and failed report."""
        self.filter_unknown_bases()
        self.filter_contigs("contigs")
        self.filter_MAD_range("assembly_size")
        self.filter_MAD_upper("distance")
        self.summary()
        self.write_failed_report()

    def write_failed_report(self):
        """Write failed.csv mapping each failed genome to its criteria."""
        if os.path.isfile(self.failed_path):
            os.remove(self.failed_path)
        ixs = list(chain.from_iterable(self.failed.values()))
        self.failed_report = pd.DataFrame(index=ixs, columns=["criteria"])
        for criteria, failed in self.failed.items():
            # Skipped filters store "" instead of an Index; isinstance also
            # accepts Index subclasses.
            if isinstance(failed, pd.Index):
                self.failed_report.loc[failed, "criteria"] = criteria
        self.failed_report.to_csv(self.failed_path)

    def summary(self):
        """Write the plain-text QC summary to summary_path and return it."""
        summary = [
            self.path.name,
            "Unknown Bases",
            f"Allowed: {self.allowed['unknowns']}",
            f"Tolerance: {self.tolerance['unknowns']}",
            f"Filtered: {len(self.failed['unknowns'])}",
            "\n",
            "Contigs",
            f"Allowed: {self.allowed['contigs']}",
            f"Tolerance: {self.tolerance['contigs']}",
            f"Filtered: {len(self.failed['contigs'])}",
            "\n",
            "Assembly Size",
            f"Allowed: {self.allowed['assembly_size']}",
            f"Tolerance: {self.tolerance['assembly_size']}",
            f"Filtered: {len(self.failed['assembly_size'])}",
            "\n",
            "MASH",
            f"Allowed: {self.allowed['distance']}",
            f"Tolerance: {self.tolerance['distance']}",
            f"Filtered: {len(self.failed['distance'])}",
            "\n",
        ]
        summary = "\n".join(summary)
        with open(os.path.join(self.summary_path), "w") as f:
            f.write(summary)
        return summary

    def link_genomes(self):
        """Symlink every passed genome into the qc directory under a new name."""
        for passed_genome in self.passed.index:
            src = next(self.path.glob(f"*/*/*/{passed_genome}")).absolute()
            # NOTE(review): `summary` is not defined in this method; it must
            # come from module scope (or was meant to be
            # self.assembly_summary) -- confirm.
            name = rename_genome(passed_genome, summary)
            dst = (self.paths.qc / name).absolute()
            try:
                dst.symlink_to(src)
            except FileExistsError:
                continue

    def qc(self):
        """Run the full QC pipeline: filter, link, build and color the tree."""
        self.filter()
        self.link_genomes()
        self.get_tree()
        self.color_tree()
        self.log.info("QC finished")

    def select_metadata(self, metadata):
        """Select the metadata rows for this species' biosamples and save them."""
        try:
            self.metadata = metadata.joined.loc[self.biosample_ids]
            self.metadata.to_csv(self.metadata_path)
        except KeyError:
            self.log.exception("Metadata failed")
def precision_matrix(tree, d, branch_length):
    """
    Build the precision matrix of the Gaussian vector induced by a tree,
    invert it, and marginalize out the internal nodes.

    :param tree: a tree object, or the path of an ete3 tree file. A path
        ending in ".txt" is read as text and its first line is parsed as
        newick; any other path is handed to ``Tree`` directly.
    :param d: dimension of latent space
    :param branch_length: constant branch length along the tree, or dict of
        branch lengths keyed by ``str(node index)`` of the child node, where
        indices are assigned in level order starting at 0 for the root
    :return: tuple ``(leaves_covariance, full_covariance)``: the covariance
        restricted to the leaves (via ``marginalize_internal``) and the
        full inverse of the precision matrix
    """
    # load tree
    if isinstance(tree, str):
        suffix = tree.split('.')[-1]
        if suffix == "txt":
            with open(tree, "r") as myfile:
                tree_string = myfile.readlines()
            tree = Tree(tree_string[0], 1)
        else:
            tree = Tree(tree, 1)

    # introduce an index for all the nodes
    N = 0
    for idx, node in enumerate(tree.traverse("levelorder")):
        N += 1
        # set node index
        node.add_features(index=idx)

    # ancestor indexing: child index -> parent index
    parents = {}
    for n in tree.traverse("levelorder"):
        if not n.is_root():
            parents[n.index] = n.up.index

    # Initialize precision matrix
    inverse_covariance = np.zeros((N * d, N * d))

    # the branch length is either constant along the tree, or a dictionary
    per_branch = isinstance(branch_length, dict)
    constant_t = None if per_branch else 1 / branch_length
    identity = np.identity(d)

    for i, pi_ind in parents.items():
        t = 1 / branch_length[str(i)] if per_branch else constant_t
        child = slice(i * d, (i + 1) * d)
        parent = slice(pi_ind * d, (pi_ind + 1) * d)
        # Each edge adds +t on both diagonal blocks and -t on the two
        # off-diagonal blocks linking child and parent.
        inverse_covariance[child, child] += identity * t
        inverse_covariance[parent, parent] += identity * t
        inverse_covariance[parent, child] += -identity * t
        inverse_covariance[child, parent] += -identity * t

    # Extra identity on the root block (index 0).
    # NOTE(review): applied once, after the edge loop -- confirm this
    # placement against the original layout.
    inverse_covariance[0:d, 0:d] += np.identity(d)

    # invert precision matrix
    full_covariance = np.linalg.inv(inverse_covariance)
    leaves_covariance = marginalize_internal(full_covariance, tree, d)
    return leaves_covariance, full_covariance
def Tree_analysis(tree,tabla,out,analysis_type,out2):
    """Find "core" nodes of an OTU tree from a per-subject abundance table.

    :param tree: newick tree (path or string) with quoted node names,
        loaded with ete3 format 1.
    :param tabla: path to a tab-separated OTU table. Lines starting with
        '#' are headers; on other lines the first field is the OTU id, the
        last field is the taxonomy and the fields in between are numeric.
    :param out: path of the main report written by this function.
    :param analysis_type: 1 = node non-zero for every subject, 2 = binomial
        test, 3 = minimum fraction of subjects, 4 = only dump the tree and
        the per-node vectors.
    :param out2: path of the second report (header lines plus one row of
        relative abundances per core node).

    NOTE(review): neither output file is closed explicitly, and block
    nesting was reconstructed from a whitespace-collapsed source -- confirm
    against the original layout.
    """
    ###All subsequent variables could be modified
    binomial_value = float(0.05) #Default value for option 2 of the core evaluation method for the tree
    p_value = float(0.05) #p-value threshold for the binomial method (method 2)
    percentage = float(0.9) #Minimum fraction of subjects required to define a core (method 3)
    taxo_p = float(0.9) #Minimum fraction of the same taxonomic group within all OTUs contained in the same node
    output_file=open(out, 'w')
    output_file_2=open(out2, 'w')
    tree = Tree(tree, quoted_node_names=True, format=1) #Here we load the 97_otus tree
    table = {}
    cont = 1
    for line in open(tabla):
        if (line.startswith('#')):
            # Header lines are copied verbatim to the second report.
            output_file_2.write(str(line))
        else:
            fields = list(map(str.strip, line.split('\t'))) #Dictionary of OTU id -> numeric values of the OTU table
            table[fields[0]] = list(map(float, fields[1:-1]))
    table2 = {}
    for line in open(tabla):
        if (line.startswith('#')):
            continue
        else:
            fields2 = list(map(str.strip, line.split('\t'))) #Dictionary of OTU id -> taxonomy (last field) of the picked OTUs
            table2[fields2[0]] = list(map(str, fields2[(len(fields2)-1):len(fields2)]))
    # Accumulator with one slot per numeric column (uses the last data line read).
    table_final_res = [0] * len(fields[1:-1])
    table_final_res = ([float(i) for i in table_final_res])
    sum_abun_rela = 0
    cores = 0
    for leaf in tree:
        if leaf.name not in table:
            leaf.vector = None
        else:
            leaf.vector = table[leaf.name] #Attach to each tree tip its vector of values from the OTU table
    node2content = tree.get_cached_content()
    flag=0
    for node in tree.traverse(): #Sum the leaf vectors into every internal node
        if not node.is_leaf():
            leaf_vectors = np.array([leaf.vector for leaf in node2content[node] if leaf.vector is not None])
            node.vector = leaf_vectors.sum(axis=0)
            if(flag == 0):
                # First internal node visited: keep its vector and leaf names
                # as the reference totals used below.
                save_node1=node.vector
                total_saved_leaves = np.array([leaf.name for leaf in node2content[node]])
                flag=1
    if(analysis_type==4): #This method only prints the tree and the per-node vectors, for information purposes
        print(tree.get_ascii(show_internal=True))
        output_file.write(tree.get_ascii(show_internal=True) + '\n' + '\n')
        for node in tree.traverse("preorder"):
            print (node.name, node.vector)
            output_file.write(node.name + '\t' + str(node.vector) + '\n')
    if(analysis_type!=4):
        output_file.write("Core" + '\t' + "Prevalence" + '\t' + "Abundance" + '\t' + "Relative abundances" + '\t' + "Min" + '\t' + "Max" + '\t' + "Average" + '\t' + "SD" + '\t' + "Leaves" + '\t' + "Taxonomy" + '\t' + "Leaves number" + '\n')
    if(analysis_type==1 or analysis_type==2 or analysis_type==3): #Evaluate the tree with the chosen method: 100% core, binomial or percentage
        for node in tree.traverse("postorder"):
            tot_cont=np.count_nonzero(node.vector) #Number of subjects with one or more occurrences in this node's vector
            tot_cont2=np.asarray(node.vector).size #Total size of the vector
            a=stats.binom_test(tot_cont, n=tot_cont2, p=binomial_value, alternative='greater') #Binomial test that uses binomial_value
            rela=(tot_cont/tot_cont2)
            if(analysis_type==1 and np.all(node.vector) or (analysis_type==2 and a <= p_value) or (analysis_type==3 and rela >= percentage)): #The criterion that decides whether the node counts as a core depends on the method
                node.vector=([float(i) for i in node.vector]) #Convert every value in node.vector to float
                abundance=node.vector/save_node1 #Relative abundance of each subject in the node over the reference node
                abundance =([float(i) for i in abundance])
                mean_abun=np.mean([float(i) for i in abundance]) #Mean abundance of the node
                std_abun=np.std([float(i) for i in abundance]) #Standard deviation of the node
                abundance_rela=sum(node.vector)/sum(save_node1) #Global relative abundance of the node over the reference node
                table_final_res=list(map(sum, zip(table_final_res, abundance))) #Accumulate this node's abundances into the final result table
                sum_abun_rela=sum_abun_rela+abundance_rela #Sum of all global relative abundances
                cores=cores+1 #Total number of cores
                node2content = tree.get_cached_content()
                output_file_2.write(str(node.name) + '\t')
                for x in range(len(abundance)):
                    output_file_2.write(str(abundance[x]) + '\t'),
                output_file_2.write('\n')
                output_file.write(node.name + '\t' + str(rela) + '\t' + str(node.vector) + '\t' + str(abundance) + '\t' + str(min(abundance)) + '\t' + str(max(abundance)) + '\t' + str(mean_abun) + '\t' + str(std_abun) + '\t')
                conteo_hojas=nodes_eval(node,tree,output_file,table2,taxo_p,total_saved_leaves) #Assign a taxonomy to the node from the taxonomy of its OTUs, subject to the taxo_p threshold
                output_file.write(str(conteo_hojas) + '\n') #Write the total number of leaves of this node
                tree=erase_node(node,tree) #Once a node has been evaluated, erase it from the tree to simplify later nodes
                G = tree.search_nodes(name=node.name)[0]
                removed_node = G.detach()
        # Totals row: number of cores and statistics of the summed abundances.
        output_file.write(str(cores) + '\t' + '\t' + '\t' + str(table_final_res) + '\t' + str(min(table_final_res)) + '\t' + str(max(table_final_res)) + '\t' + str(np.mean([float(i) for i in table_final_res])) + '\t' + str(np.std([float(i) for i in table_final_res])) + '\n')
def readTreeFileFirstLine(filename):
    """Parse the first line of ``filename`` as a newick tree (format 1).

    :param filename: path to a text file whose first line is a newick tree.
    :return: the ``Tree`` built from that line.
    """
    with open(filename, "r") as handle:
        # Strip only the line terminator. Slicing off the last character
        # would drop the closing ';' when the file has no trailing newline.
        first_tree = handle.readline().rstrip("\r\n")
    return Tree(first_tree, format=1)
result_trs = perform_SPR(in_tr=tmp_cpy, selection=tmp_lst[i]) if result_trs == None: print("This rearrangement resulted in the same tree") else: tree_list.extend(result_trs) # for Tr in tree_list: # print Tr # print len(tree_list) return tree_list if __name__ == "__main__": #### NOTE: each of the internal nodes must have a name for this rearrangement ### Toy example ### SPR must return 64 different topologies for this toy example t = Tree(name="root") Z = t.add_child(name="Z") Y = Z.add_child(name="Y") F = Z.add_child(name="F") X = Y.add_child(name="X") V = Y.add_child(name="V") A = V.add_child(name="A") B = V.add_child(name="B") E = X.add_child(name="E") W = X.add_child(name="W") D = W.add_child(name="D") C = W.add_child(name="C")
#!/usr/bin/env python
"""
cleanup_parsnp_newick.py

Takes a .nwk file produced by parsnp and cleans up the leaf labels
If [regex] is given, will also delete all [regex] matches from leaf labels

USAGE: python cleanup_parsnp_newick.py parsnp.nwk output.nwk [regex]
"""
import sys
import re
from ete3 import Tree

if len(sys.argv) < 3:
    # Function-call form works on both Python 2 and Python 3.
    print(__doc__)
    sys.exit(1)

# Optional user-supplied pattern, compiled once rather than per leaf.
extra_pattern = re.compile(sys.argv[3]) if len(sys.argv) >= 4 else None

t = Tree(sys.argv[1])
for leaf in t.get_leaves():
    # Drop surrounding quotes, then any trailing run of ".ext" suffixes.
    leaf.name = re.sub(r'(\.\w+)+$', '', leaf.name.strip("'"))
    if extra_pattern is not None:
        leaf.name = extra_pattern.sub('', leaf.name)
t.write(format=0, outfile=sys.argv[2])
import sys from ete3 import Tree import re venom = sys.argv[1] control = sys.argv[2] out = sys.argv[3] venomTree = Tree(venom, quoted_node_names=True) controlTree = Tree(control, quoted_node_names=True) outputTree = venomTree venomList = [] for node in venomTree.traverse(strategy="levelorder"): venomList.append(node) controlList = [] for node in controlTree.traverse(strategy="levelorder"): controlList.append(node) outputList = [] for node in outputTree.traverse(strategy="levelorder"): outputList.append(node) print(len(venomList)) print(len(controlList)) print(len(outputList)) for i in range(len(venomList)): #print(venomList[i].name) venomSupp = float(venomList[i].support)
params = {'sigma': sigma, 'beta': beta, 'd': d, 'gamma': gamma, 's': s, 'rho': rho, 'dt': dt, 'time_intervals': 0} "Provide dict with estimated params" est_params = {'sigma': False, 'random_branch_effects': True, 'site_effects': False, 'beta': False, 'd': False, 'gamma': False, 's': False, 'rho': False} "Import tree and sequences" path = './covid-analysis/' "For spike features" features_file = path + 'feature-files/hcov_oct2020_bestTree_byRegion_allFeatures.csv' pastml_path = path + 'pastml/collected_preSep1_dated_pastml/' tree_file = pastml_path + 'named.tree_phylogeny_mle_cleaned_collected_preSep1_dated_cleaned.nwk' absolute_time = 2020.67 # absolute time of last sample "Set up tree for run" tree = Tree(tree_file, format=1) tree, tree_times = TreeUtils.add_tree_times(tree) "Set up time intervals" final_time = max(tree_times) root_time = absolute_time - final_time date_time_intervals = ['2020-01-01', '2020-02-15', '2020-03-15', '2020-04-15', '2020-05-15', '2020-06-15', '2020-07-15', '2020-08-15'] time_intervals = date2FloatYear(date_time_intervals) time_intervals = np.array(time_intervals) - root_time
parse.add_argument( "--tree", type=str, help="name of tree file with branch lengths and internal node names", required=True) parse.add_argument("--outgroup", type=str, help="name of outgroup species name in tree", required=True) args = parse.parse_args() #Load tree t = Tree( args.tree, format=1) #Format 1 loads tree with branch lengths and internal node names print('Loading tree...' + '\n' + 'Make sure your tree has branch lengths and internal node names') #Traverse the tree to get a list of internal and tip node names nodes1 = [] for node in t.traverse("levelorder"): nodes1.append(node.name) #outgroup = str(args.outgroup) #For each node in the tree get the distance from that node to the outgroup tip node, create a dictionary of this information for each node in the tree node_age_dict = {} for node in t.traverse("levelorder"): node_name = node.name
if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('host', type=str, help='The input host tree in newick format') parser.add_argument('guest', type=str, help='The input guest tree in newick format') parser.add_argument('mapping', type=str, help='Path to txt file containing guest->host mapping') args = parser.parse_args() host = Tree(args.host, format=1) guest = Tree(args.guest, format=1) nodemap = {} mapfile = open(args.mapping) for line in mapfile: gname, hname = line.strip().split('\t') nodemap[guest & gname] = host & hname Main(host, guest, nodemap) """ host = Tree('genes.stree') guest = Tree('0.nt.raxml.treefix.tree') #Add names i = 0
# argparse is used below; importing it here keeps this script self-contained
# (a repeated import is harmless if it is already imported elsewhere).
import argparse
import os
import sys
from ete3 import Tree

parser = argparse.ArgumentParser()
# Without a tree path the isfile() check below would fail with a TypeError,
# so let argparse report the missing option instead.
parser.add_argument('-t', '--tree', required=True)
parser.add_argument('-bl', '--branch_lengths', action='store_true')
parser.add_argument('-v', '--verbose', action='store_true')
opts = parser.parse_args(sys.argv[1:])

if not os.path.isfile(opts.tree):
    sys.stderr.write("File {0} not found\n".format(opts.tree))
    sys.exit(1)

t = Tree(opts.tree)

if opts.verbose:
    orig_root = t.get_tree_root()
    sys.stderr.write("ORIGINAL TREE:\n" + orig_root.write(format=9) + "\n\n\n")

# initial unroot
t.unroot()

# reroot by midpoint to force unrooting later
midpoint = t.get_midpoint_outgroup()
t.set_outgroup(midpoint)

if opts.verbose:
    sys.stderr.write("MIDPOINT ROOTING:\n" + t.write(format=9) + "\n\n\n")

# final forced unrooting of tree to be absolutely sure
t.unroot()
from ete3 import Tree
# Load an unrooted tree. Note that three branches hang from the root
# node. This usually means that no information is available about
# which of nodes is more basal.
t = Tree('(A,(H,F),(B,(E,D)));')
# print is written in call form so the example runs on Python 2 and Python 3.
print("Unrooted tree")
print(t)
#          /-A
#         |
#         |          /-H
#---------|---------|
#         |          \-F
#         |
#         |          /-B
#          \--------|
#                   |          /-E
#                    \--------|
#                              \-D
#
# Let's define that the ancestor of E and D as the tree outgroup. Of
# course, the definition of an outgroup will depend on user criteria.
ancestor = t.get_common_ancestor("E","D")
t.set_outgroup(ancestor)
print("Tree rooted at E and D's ancestor is more basal that the others.")
print(t)
#
#                    /-B
#          /--------|
#         |         |          /-A
#         |          \--------|
#         |                   |          /-H
def check(self):
    """Run consistency checks and print the collected messages.

    Looks at every dataset (societies, variables, data rows and their
    references) and every phylogeny (source, taxa, newick tree), recording
    problems as errors or warnings. All warnings are printed first, then all
    errors.

    Returns:
        True when no error was recorded, False otherwise.
    """
    glottolog = {}
    for lng in self.read_csv('csv', 'glottolog.csv', namedtuples=True):
        glottolog[lng.id] = lng

    msgs = {'error': [], 'warning': []}

    def _record(level, text, obj=None):  # pragma: no cover
        # Prefix the message with the class name and id of the object, if any.
        if obj:
            prefix = '{0.__class__.__name__} {0.id}: '.format(obj)
        else:
            prefix = ''
        msgs[level].append('%s:%s%s' % (level.upper(), prefix, text))

    def error(msg, obj=None):  # pragma: no cover
        _record('error', msg, obj=obj)

    def warning(msg, obj=None):  # pragma: no cover
        _record('warning', msg, obj=obj)

    sources = set(entry.key for entry in self.sources.iterentries())
    socids = set()
    xdids = collections.defaultdict(set)
    gcs = collections.defaultdict(set)
    varids = {}
    coded_types = ['Categorical', 'Ordinal']

    for ds in self.datasets:
        for soc in ds.societies:
            if soc.id in socids:  # pragma: no cover
                error('duplicate society ID: {0}'.format(soc.id), ds)
            xdids[soc.xd_id].add(soc.glottocode)
            gcs[soc.glottocode].add(soc.xd_id)
            socids.add(soc.id)
            if soc.glottocode not in glottolog:  # pragma: no cover
                warning(
                    '{0} without valid glottocode {0.glottocode}'.format(soc),
                    ds)
            elif glottolog[soc.glottocode].family_name == 'Bookkeeping':  # pragma: no cover
                warning(
                    '{0} mapped to Bookkeeping language: {0.glottocode}'.format(soc),
                    ds)

        # are there duplicate variables?
        for var in ds.variables:
            if var.id in varids:  # pragma: no cover
                error('duplicate variable ID: {0}'.format(var.id), ds)
            if var.type in coded_types:
                varids[var.id] = [c.code for c in var.codes]
            else:
                varids[var.id] = []

        # are there undefined variables?
        undefined = set(
            row.var_id for row in ds.data if row.var_id not in varids)
        for var_id in undefined:  # pragma: no cover
            error('undefined variable ID: {0}'.format(var_id), ds)

        for datum in ds.data:
            if datum.var_id not in varids:  # pragma: no cover
                error('undefined variable ID: {0}'.format(datum.var_id), ds)
            elif len(varids[datum.var_id]) > 1 \
                    and datum.code not in varids[datum.var_id]:  # pragma: no cover
                error(
                    'undefined code for variable {0} and society {1}:{2}'.format(
                        datum.var_id, datum.soc_id, datum.code),
                    ds)
            for ref in datum.references:
                if ref.key not in sources:
                    error(
                        'undefined source key "{0}" referenced in {1}'.format(
                            ref.key, ds.id),
                        ds)

    for xdid, glottocodes in xdids.items():
        if len(glottocodes - {None}) > 1:  # pragma: no cover
            # No xd_id can be linked to more than one Glottocode!
            error('xd_id {0} mapped to multiple glottocodes {1}'.format(
                xdid, glottocodes))

    for p in self.phylogenies:
        if p.source_id and p.source_id not in sources:  # pragma: no cover
            error('{0}: invalid source_id {1}'.format(p.id, p.source_id), p)

        taxa = set()
        for taxon in p.taxa:
            taxa.add(taxon.taxon)
            if taxon.glottocode and taxon.glottocode not in glottolog:
                error(
                    '{0}: invalid glottocode {1}'.format(p.id, taxon.glottocode),
                    p)
            for socid in taxon.soc_ids:
                if socid not in socids:
                    error('{0}: invalid soc_id {1}'.format(p.id, socid), p)
            for xdid in taxon.xd_ids:
                if xdid not in xdids:
                    error('{0}: invalid xd_id {1}'.format(p.id, xdid), p)

        if not p.nexus:  # pragma: no cover
            error('{0}: unable to load summary.trees'.format(p.id), p)

        # Parsing is attempted only to find out whether the newick is valid;
        # the resulting tree object is discarded.
        try:
            Tree(p.newick, format=1)
        except NewickError as e:  # pragma: no cover
            error(
                '{0}: invalid newick tree from summary.trees: {1}'.format(p.id, e),
                p)

        if not p.is_glottolog:
            for node in p.newick_tree.walk():
                if node.name and node.is_leaf and node.name not in taxa:  # pragma: no cover
                    warning(
                        'Leaf label missing in taxa.csv: {0}'.format(node.name),
                        obj=p)

    for level in ('warning', 'error'):
        for line in msgs[level]:
            print(line)

    return not msgs['error']
def add_group_to_tree(group, treefile, outdir, to_compress=False):
    """Style every node of a tree by the presence of one homolog group.

    Reads ``homolog_matrix.txt`` from *outdir*: a tab-separated table whose
    header line starts with a tab and names the columns, and whose other
    lines start with a group id followed by integer values. Leaves are
    coloured by whether their value for *group* is positive; internal nodes
    are coloured by the fraction of their descendant leaves with a positive
    value and sized by the number of those leaves.

    Args:
        group: Group id; must equal the first column of a matrix row.
        treefile: Path of the tree file handed to ``Tree``.
        outdir: Directory containing ``homolog_matrix.txt``.
        to_compress: Optional comma-separated species tokens. An internal
            node whose leaves are more than 95% of one such species is
            renamed "<species> clade" and its descendants are not drawn.
            The species of a leaf is taken to be the second
            underscore-separated token of its name.

    Returns:
        The styled ``Tree``.

    Raises:
        ValueError: If the matrix has no header before the group's row, or
            no row for *group* at all.
    """
    compress = to_compress.split(",") if to_compress else []

    header = None
    groupdata = None
    matrix_path = os.path.join(outdir, "homolog_matrix.txt")
    with open(matrix_path, 'r') as matrix:
        for line in matrix:
            fields = line.rstrip().split("\t")
            if line.startswith("\t"):
                header = fields[1:]
                continue
            # Compare the whole first column: a prefix test would also accept
            # longer ids that merely start with `group` (e.g. OG1 vs OG10).
            if fields[0] != group:
                continue
            if header is None:
                raise ValueError(
                    "no header line before group {} in {}".format(
                        group, matrix_path))
            groupdata = dict(zip(header, [int(x) for x in fields[1:]]))
    if groupdata is None:
        raise ValueError(
            "group {} not found in {}".format(group, matrix_path))

    tree = Tree(os.path.abspath(treefile))
    pal = sns.cubehelix_palette(rot=-.4, n_colors=13)
    for node in tree.iter_descendants("preorder"):
        nstyle = NodeStyle()
        nstyle["shape"] = "circle"
        if node.is_leaf():
            # Leaves missing from the matrix are treated as absent.
            if groupdata.get(node.name, 0) > 0:
                nstyle["fgcolor"] = colors.rgb2hex(pal[12])
            else:
                nstyle["fgcolor"] = colors.rgb2hex(pal[0])
        else:
            this_node = [x.name for x in node.iter_descendants("preorder")
                         if x.is_leaf()]
            species = {}
            for leaf_name in this_node:
                parts = leaf_name.split("_")
                # Names without an underscore carry no species token.
                if len(parts) > 1:
                    species[parts[1]] = species.get(parts[1], 0) + 1
            for c in compress:
                if float(species.get(c, 0)) / float(len(this_node)) > 0.95:
                    nstyle["draw_descendants"] = False
                    node.name = "{} clade".format(c)
            count = sum(1 for t in this_node if groupdata.get(t, 0) > 0)
            # Map the fraction of positive leaves onto palette indices 0..12.
            v = int(round(float(count) / float(len(this_node)) * 12))
            nstyle["fgcolor"] = colors.rgb2hex(pal[v])
            nstyle["size"] = 3 * math.sqrt(len(this_node))
        node.set_style(nstyle)
    return tree
stack.append(dictionary[current][1]) stack.append(dictionary[current][2]) stack.append(dictionary[current][3]) result.append("(") else: result.append(current) current_prev = current result.pop() result.append(")") return result if __name__ == "__main__": matrix, length = readInput() dictionary = {} finalCluster = wpgma(matrix, length, dictionary) result = printCluster(dictionary, finalCluster) result = ''.join(result) result = result + ";" #ete3 is tool for pylogenetic tree construction from ete3 import Tree tree = Tree(result) print("WPGMA Resultant Clustering:") print("") print(result) print("") print(tree)