class _TaxonNames:
    """Names gathered for a single taxid while reading ``names.dmp``."""

    def __init__(self):
        self.sci_name = ""
        self.authority = ""
        self.synonym = ""
        self.common_name = []
        self.common_name_FR = []


def _read_french_names(path="taxo/TAXREFv11.txt"):
    """Map each scientific name (column 14) to its French names (column 19)."""
    french = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split("\t")
            sciname = fields[14]
            common_fr = fields[19]
            # Rows without a French name are ignored entirely.
            if common_fr != '':
                french.setdefault(sciname, []).append(common_fr)
    return french


def _read_rank_translations(path="taxo/ranks_FR.txt"):
    """Map English rank names to French ones from a two-column TSV file."""
    ranks = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split("\t")
            # rstrip() drops the trailing newline of the last column.
            ranks[fields[0]] = fields[1].rstrip()
    return ranks


def _join_names(current, value):
    """Append *value* to the comma-separated string *current*."""
    if current != "":
        return current + ", " + value
    return value


def _read_ncbi_names(french_names, path="taxo/names.dmp"):
    """Collect the names of every taxid listed in ``names.dmp``."""
    attrs = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split("|")
            taxid = fields[0].replace("\t", "")
            value = fields[1].replace("\t", "")
            name_class = fields[3].replace("\t", "")
            # TODO: maybe post-filter the names here (keep only the first
            # common name, escape single quotes, wrap the common name in
            # parentheses). To be decided.
            entry = attrs.get(taxid)
            if entry is None:
                entry = attrs[taxid] = _TaxonNames()
            if name_class == "scientific name":
                entry.sci_name = value
                # Attach the French translation, if any.
                if value in french_names:
                    entry.common_name_FR = french_names[value]
            elif name_class == "authority":
                entry.authority = _join_names(entry.authority, value)
            elif name_class == "synonym":
                entry.synonym = _join_names(entry.synonym, value)
            elif name_class == "common name":
                entry.common_name.append(value)
    return attrs


def _new_taxon_node(taxid, names):
    """Create a ``Tree`` node for *taxid* carrying the given names."""
    node = Tree()
    node.name = taxid
    node.taxid = taxid
    node.sci_name = names.sci_name
    node.common_name = names.common_name
    node.synonym = names.synonym
    node.authority = names.authority
    node.common_name_FR = names.common_name_FR
    return node


def getTheTrees():
    """Build the NCBI taxonomy as a dictionary of linked tree nodes.

    The following files are read relative to the current working directory:

    * ``taxo/TAXREFv11.txt``: French common names (TAXREF has to be
      downloaded by hand and placed in ``taxo/``);
    * ``taxo/ranks_FR.txt``: English rank name, tab, French rank name;
    * ``taxo/names.dmp`` and ``taxo/nodes.dmp``: the NCBI taxdump.

    Returns:
        dict: taxid (str) -> ``Tree`` node. Each node carries ``name``,
        ``taxid``, ``sci_name``, ``common_name``, ``synonym``, ``authority``
        and ``common_name_FR``; every node that appears as a child in
        ``nodes.dmp`` also carries ``rank`` and ``rank_FR``. Children are
        attached to their parent with ``add_child``.

    Raises:
        KeyError: if a taxid of ``nodes.dmp`` is absent from ``names.dmp`` or
            a rank is absent from ``ranks_FR.txt``.
        IndexError: if an input line has fewer columns than expected.
    """
    print("Getting french translations...")
    french_names = _read_french_names()

    print("Getting rank names in french...")
    ranks_fr = _read_rank_translations()

    print("Reading NCBI taxonomy...")
    attrs = _read_ncbi_names(french_names)

    nodes = {}
    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as handle:
        handle.readline()  # skip the first line: the "1 | 1" self edge
        for line in handle:
            fields = line.split("|")
            son = fields[0].replace("\t", "")
            dad = fields[1].replace("\t", "")
            rank = fields[2].replace("\t", "")

            if dad not in nodes:
                # A node first met as a parent gets its rank later, when it
                # shows up as a child on its own line.
                nodes[dad] = _new_taxon_node(dad, attrs[dad])

            child = nodes.get(son)
            if child is None:
                child = nodes[son] = _new_taxon_node(son, attrs[son])
                needs_rank = True
            else:
                needs_rank = not hasattr(child, 'rank')
            if needs_rank:
                child.rank = rank
                child.rank_FR = ranks_fr[rank]

            nodes[dad].add_child(child)
    return nodes
prop = sys.argv[ 8] # whether the probability of a rate change should be proportional to the branch length (y by default) try: f = open(treefile, 'r') except IOError: print("Unknown file: ", treefile) sys.exit() line = "" for l in f: line = line + l f.close() # Starting ultrametric tree ultra_tree = Tree(line, format=0) use_bl = True if prop == "n": use_bl = False rates = dict() rates[ultra_tree.get_tree_root()] = 1.0 number_of_small_changes_per_branch = dict() number_of_big_changes_per_branch = dict() average_dist = 0.0 n_branches = 0 for n in ultra_tree.traverse(strategy="preorder"): if n != ultra_tree.get_tree_root():
for line in f: token = line.rstrip().rsplit() d[token[0]] = token[1] return d number_of_sets = 0 number_of_sets_with_duplication = 0 number_of_sets_with_speciation = 0 number_of_sets_with_relocalisation = 0 number_of_sets_with_duplication_and_relocalisation = 0 number_of_sets_with_speciation_and_relocalisation = 0 # Load Species Tree os.chdir("/cellar/rona/Phytozome10/Phyldog") species_tree = Tree("Phytozome10_constrainedTree_rooted_labelled.tree", format=1) # Create dictionary for each location, with each node on the species tree as a key with a starting value of 0. This value will be updated if there is a loss/gain at that node. number_of_relocalisations_on_species_tree_node = {} for node in species_tree.traverse(): number_of_relocalisations_on_species_tree_node[node.name] = [ 0, 0, 0, 0 ] #format - node.name: relocalisations following dup, total_dups, relocalisations from spec, total_specs os.chdir(location_of_trees) for filename in glob.glob("OG0*.locus.tree"): orthogroup = (filename[:-11]) node_2_node_dict = make_node_2_node_dict( orthogroup) # ortho_node = species_node orthogroup_tree = Tree(filename, format=1) os.chdir(location_of_dup_reloc_files)
index = [] alg = [] for r in SeqIO.parse(fname, format="fasta"): index.append(r.id) alg.append(seq2vector(r.seq)) named_index = {name: i for i, name in enumerate(index)} return np.array(alg), named_index, index tree_file = sys.argv[1] alg_file = sys.argv[2] thr = float(sys.argv[3]) alg, index, i2name = load_alg(alg_file) tree = Tree(tree_file) tree.set_outgroup(tree.get_midpoint_outgroup()) node2content = tree.get_cached_content(store_attr="name") for n in tree.traverse("levelorder"): if n.children: ch1 = n.children[0] ch2 = n.children[1] leaves_left = [index[name] for name in node2content[ch1]] leaves_right = [index[name] for name in node2content[ch2]] if len(leaves_left) < 3 or len(leaves_right) < 3: continue rows, cols = alg[tuple(leaves_left), :].nonzero() colres_left = Counter(cols)
# Regex matching the branch lengths of a Newick string, taken from
# http://stackoverflow.com/questions/23172293/use-python-to-extract-branch-lengths-from-newick-format
pattern = re.compile(r"\b[0-9]+(?:\.[0-9]+)?\b")
logger.debug("trees:\n * %s", "\n * ".join(lnf))
nb_input_tree_before = len(lnf)
logger.debug("%s trees in %s", nb_input_tree_before, args.tree_dir)
# Loop over a snapshot of the list: removing entries from `lnf` while
# iterating over `lnf` itself makes the iterator skip the file that follows
# every removed one, so some invalid trees were never checked.
for treefilename in list(lnf):
    # Test whether the file parses as a tree.
    try:
        t = Tree(treefilename)
    except Exception:
        logger.warning("%s is not a newick tree, this tree will not be used", treefilename)
        lnf.remove(treefilename)
        t = ""
    if t:
        with open(treefilename, "r") as treefile:
            tree = treefile.read().strip()
        # Test whether the tree carries branch lengths.
        branch_lengths = pattern.findall(tree)
        if not branch_lengths:
            logger.warning("No branch length in %s, this tree will not be used", treefilename)
            lnf.remove(treefilename)
nb_input_tree_after = len(lnf)
t = readTreeFromFile(file) index = 0 for node in t.traverse("postorder"): if not node.is_leaf(): node.name=str(index) node.support=str(index) index += 1 id2Height = dict() nodeId2LeafListRef = dict() leafList2NodeIdRef = dict() idToDescendants = dict() # Now we want to get the calibrations according to the options that have been user-input. t_begin = Tree() # Balanced or not? if ('y' in balanced): # Getting calibrations from both sides of the root t_begin = t else: # Getting calibrations only from one side choices = [0,1] choice = random.choice(choices) print("Choosing calibrations from subtree: ", choice) t_begin = t.get_children()[choice] print("Number of nodes in sampled subtree: ", len(t_begin.get_descendants())) id2Height = getInternalNodeHeights( t_begin ) nodeId2LeafListRef, leafList2NodeIdRef, idToDescendants = getNameToLeavesAndIdToDescendantIdsLink( t_begin )
f = open(file, 'r') except IOError: print("Unknown file: ", file) sys.exit() #File where I store useful functions exec(open("/home/boussau/Programming/Notebooks/code/functions.py").read()) allTrees = list() for l in f: l2 = l.strip() # removing anything within square brackets if "[" in l2: l2 = re.sub('\[[^\]]+\]\s*', '', l2) print(l2) t = Tree(l2) createNameToLeavesLink(t) allTrees.append(t) f.close() id2Heights = list() for t in allTrees: node2Height, id2Height = getNodeHeights(t) print(id2Height) id2Heights.append(id2Height) #print(len(id2Heights)) # Creating a uniform weight vector weights = [1] * len(id2Heights) outputsWeightedChronogram(allTrees[0].copy(), id2Heights, out, weights)
def plot_heat_tree_V1(taxid2n,
                      tree_file,
                      genes,
                      taxid2st=False,
                      leaf_label_conversion_dico=False):
    '''
    Plot a heatmap next to a midpoint-rooted tree.

    One aligned column is drawn per entry of the leaf's row in *taxid2n*.
    Cells with a value > 0 are shaded with the matplotlib ``Blues`` colour
    map normalised between 0.8 and 1; other cells are white. The column
    headers (taken from *genes*) are added once, to the aligned header of
    the returned tree style.

    TODO: add an option to print the value inside each cell.

    Parameters
    ----------
    taxid2n: mapping
        ``str(leaf name)`` -> sequence of numeric values (one per column).
        Leaves missing from the mapping get the single-cell row ``[0]``.
    tree_file: str
        Path to the tree file in Newick format (anything accepted by
        ``Tree``).
    genes: sequence
        Column labels; ``genes[col]`` is the header of column *col*.
    taxid2st: mapping, optional
        leaf name -> string appended to the leaf label as ``" (st)"``.
    leaf_label_conversion_dico: mapping, optional
        leaf name -> replacement label. Leaves missing from the mapping
        keep their name.

    Returns
    -------
    tuple
        ``(tree, number_of_leaves, tree_style)``.
    '''
    # Local imports, as in the original: matplotlib is only needed here.
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import matplotlib as mpl

    t1 = Tree(tree_file)
    tss = TreeStyle()
    # Root the tree on its midpoint.
    t1.set_outgroup(t1.get_midpoint_outgroup())

    norm = mpl.colors.Normalize(vmin=0.8, vmax=1)
    m2 = cm.ScalarMappable(norm=norm, cmap=cm.Blues)

    leaf_number = 0
    for lf in t1.iter_leaves():
        leaf_number += 1
        lf.branch_vertical_margin = 0

        try:
            data = taxid2n[str(lf.name)]
        except (KeyError, TypeError):
            data = [0]
        # TypeError covers the default taxid2st=False, which is not
        # subscriptable.
        try:
            st = taxid2st[lf.name]
        except (KeyError, TypeError):
            st = False

        # The original body read an undefined name `accession2description`
        # here; the relabelling mapping is the `leaf_label_conversion_dico`
        # parameter.
        if leaf_label_conversion_dico:
            try:
                lf.name = leaf_label_conversion_dico[lf.name]
            except KeyError:
                pass
        if st:
            lf.name = lf.name + ' (' + st + ')'

        for col, value in enumerate(data):
            if leaf_number == 1:
                # First leaf only: add the rotated column header.
                header = TextFace('%s' % (genes[col]), fsize=6)
                header.vt_align = 2
                header.hz_align = 2
                header.rotation = 270
                header.margin_top = 0
                header.margin_right = 0
                header.margin_left = 4
                header.margin_bottom = 0
                header.inner_background.color = "white"
                header.opacity = 1.
                tss.aligned_header.add_face(header, col)

            cell = TextFace(' ')
            cell.margin_top = 0
            cell.margin_right = 0
            cell.margin_left = 0
            cell.margin_bottom = 0
            if value > 0:
                cell.inner_background.color = rgb2hex(
                    m2.to_rgba(float(value)))
            else:
                cell.inner_background.color = "white"
            cell.opacity = 1.
            lf.add_face(cell, col, position="aligned")

    return t1, leaf_number, tss
def deepbiome_draw_phylogenetic_tree(
        log,
        network_info,
        path_info,
        num_classes,
        file_name="%%inline",
        img_w=500,
        branch_vertical_margin=20,
        arc_start=0,
        arc_span=360,
        node_name_on=True,
        name_fsize=10,
        tree_weight_on=True,
        tree_weight=None,
        tree_level_list=['Genus', 'Family', 'Order', 'Class', 'Phylum'],
        weight_opacity=0.4,
        weight_max_radios=10,
        phylum_background_color_on=True,
        phylum_color=[],
        phylum_color_legend=False,
        show_covariates=True,
        verbose=True):
    """
    Draw the phylogenetic tree of a network as a circular figure.

    Parameters
    ----------
    log (logging instance) :
        python logging instance for logging
    network_info (dictionary) :
        python dictionary with network_information
    path_info (dictionary):
        python dictionary with path_information
    num_classes (int):
        number of classes for the network. 0 for regression, 1 for binary
        classification.
    file_name (str):
        name of the figure to save.
        - "*.png", "*.jpg"
        - "%%inline" for notebook inline output.
        default="%%inline"
    img_w (int):
        image width (pt)
        default=500
    branch_vertical_margin (int):
        vertical margin for branch
        default=20
    arc_start (int):
        angle at which the arc starts
        default=0
    arc_span (int):
        total angle spanned by the arc
        default=360
    node_name_on (boolean):
        show the name of the leaf nodes if True
        default=True
    name_fsize (int):
        font size for the leaf names
        default=10
    tree_weight_on (boolean):
        show the amount and the direction of the weight of each edge in the
        tree by circle size and color.
        default=True
    tree_weight (list of DataFrame):
        reference tree weights, one table per pair of adjacent levels.
        Required: the node names of every level are read from it even when
        `tree_weight_on` is False.
        default=None
    tree_level_list (list):
        name of each level of the given reference tree weights
        default=['Genus', 'Family', 'Order', 'Class', 'Phylum']
    weight_opacity (float):
        opacity of the weight circles
        default=0.4
    weight_max_radios (int):
        maximum radius of the weight circles
        default=10
    phylum_background_color_on (boolean):
        show the background color of each phylum based on `phylum_color`.
        default=True
    phylum_color (list):
        background colors for the phylum level. If empty, colors are drawn
        at random from the CSS4 color names.
        default=[]
    phylum_color_legend (boolean):
        show the legend of the phylum background colors
        default=False
    show_covariates (boolean):
        show the effect of the covariates
        default=True
    verbose (boolean):
        show the log if True
        default=True

    Returns
    -------
    The value returned by ``Tree.render`` for the built tree.

    Raises
    ------
    TypeError
        if `tree_weight` is None.

    Examples
    --------
    Draw phylogenetic tree

    deepbiome_draw_phylogenetic_tree(log, network_info, path_info,
                                     num_classes, file_name="%%inline",
                                     tree_weight=tree_weight)
    """
    # Fail early with a clear message instead of the opaque
    # "'NoneType' object is not subscriptable" raised after the network has
    # already been built.
    if tree_weight is None:
        raise TypeError(
            "tree_weight is required: it provides the node names of every "
            "level of the tree")
    # Private copy, so the shared default list cannot be altered by the
    # network class it is handed to.
    tree_level_list = list(tree_level_list)

    # for tree figure (https://github.com/etetoolkit/ete/issues/381)
    os.environ['QT_QPA_PLATFORM'] = 'offscreen'
    reader_class = getattr(readers,
                           network_info['model_info']['reader_class'].strip())
    reader = reader_class(log, path_info, verbose=verbose)

    data_path = path_info['data_info']['data_path']
    try:
        count_path = path_info['data_info']['count_path']
        x_list = np.array(
            pd.read_csv(path_info['data_info']['count_list_path'],
                        header=None).iloc[:, 0])
        x_path = np.array([
            '%s/%s' % (count_path, x_list[fold])
            for fold in range(x_list.shape[0]) if '.csv' in x_list[fold]
        ])
    except Exception:
        # Best-effort fallback to the single x_path entry; unlike a bare
        # except this no longer swallows KeyboardInterrupt / SystemExit.
        x_path = np.array(
            ['%s/%s' % (data_path, path_info['data_info']['x_path'])])

    reader.read_dataset(x_path[0], None, 0)

    network_class = getattr(
        build_network, network_info['model_info']['network_class'].strip())
    network = network_class(network_info,
                            path_info,
                            log,
                            fold=0,
                            num_classes=num_classes,
                            tree_level_list=tree_level_list,
                            is_covariates=reader.is_covariates,
                            covariate_names=reader.covariate_names,
                            verbose=False)

    # len() rather than truthiness: phylum_color may be a numpy array.
    if len(phylum_color) == 0:
        colors_name = list(mcolors.CSS4_COLORS.keys())
        if reader.is_covariates and show_covariates:
            phylum_column = 'Phylum_with_covariates'
        else:
            phylum_column = 'Phylum'
        phylum_color = np.random.choice(
            colors_name,
            network.phylogenetic_tree_info[phylum_column].unique().shape[0])

    basic_st = NodeStyle()
    basic_st['size'] = weight_max_radios * 0.5
    basic_st['shape'] = 'circle'
    basic_st['fgcolor'] = 'black'

    t = Tree()
    root_st = NodeStyle()
    root_st["size"] = 0
    t.set_style(root_st)

    tree_node_dict = {'root': t}

    # First layer: the nodes of the last level hang directly off the root.
    upper_class = 'root'
    lower_class = tree_level_list[-1]
    lower_layer_names = tree_weight[-1].columns.to_list()

    layer_tree_node_dict = {}
    phylum_color_dict = {}
    for j, val in enumerate(lower_layer_names):
        t.add_child(name=val)
        leaf_t = t.get_leaves_by_name(name=val)[0]
        leaf_t.set_style(basic_st)
        layer_tree_node_dict[val] = leaf_t
        if lower_class == 'Phylum' and phylum_background_color_on:
            phylum_st = copy.deepcopy(basic_st)
            phylum_st["bgcolor"] = phylum_color[j]
            phylum_color_dict[val] = phylum_color[j]
            leaf_t.set_style(phylum_st)
    tree_node_dict[lower_class] = layer_tree_node_dict
    upper_class = lower_class
    upper_layer_names = lower_layer_names

    # Remaining layers, walking tree_level_list from its end to its start.
    for i in range(len(tree_level_list) - 1):
        lower_class = tree_level_list[-2 - i]
        if upper_class == 'Disease' and not show_covariates:
            lower_layer_names = network.phylogenetic_tree_info[
                lower_class].unique()
        else:
            lower_layer_names = tree_weight[-i - 1].index.to_list()

        if tree_weight_on:
            # The scaled weights only depend on the level, so they are
            # computed once per level rather than once per child node.
            # np.array() copies, so the caller's table is left untouched.
            edge_weights = np.array(tree_weight[-1 - i])
            edge_weights *= (weight_max_radios / np.max(edge_weights))

        layer_tree_node_dict = {}
        for j, val in enumerate(upper_layer_names):
            parient_t = tree_node_dict[upper_class][val]
            if upper_class == 'Disease':
                child_class = lower_layer_names
            else:
                child_class = network.phylogenetic_tree_info[lower_class][
                    network.phylogenetic_tree_info[upper_class] ==
                    val].unique()

            for k, child_val in enumerate(child_class):
                parient_t.add_child(name=child_val)
                leaf_t = parient_t.get_leaves_by_name(name=child_val)[0]
                if lower_class == 'Phylum' and phylum_background_color_on:
                    phylum_st = copy.deepcopy(basic_st)
                    phylum_st["bgcolor"] = phylum_color[k]
                    phylum_color_dict[child_val] = phylum_color[k]
                    leaf_t.set_style(phylum_st)
                else:
                    leaf_t.set_style(basic_st)

                if tree_weight_on:
                    if upper_class == 'Disease':
                        upper_num = 0
                    else:
                        upper_num = network.phylogenetic_tree_dict[
                            upper_class][val]
                    if (upper_class == 'Disease' and reader.is_covariates
                            and show_covariates):
                        lower_num = network.phylogenetic_tree_dict[
                            '%s_with_covariates' % lower_class][child_val]
                    else:
                        lower_num = network.phylogenetic_tree_dict[
                            lower_class][child_val]
                    leaf_t.add_features(
                        weight=edge_weights[lower_num, upper_num])
                layer_tree_node_dict[child_val] = leaf_t
        tree_node_dict[lower_class] = layer_tree_node_dict
        upper_class = lower_class
        upper_layer_names = lower_layer_names

    def layout(node):
        if "weight" in node.features:
            # Circle whose size follows the node's "weight" feature: blue
            # for positive weights, red otherwise.
            # NOTE(review): a negative weight is passed as a negative
            # radius -- confirm this renders as intended.
            color = {1: "RoyalBlue", 0: "Red"}[int(node.weight > 0)]
            C = CircleFace(radius=node.weight, color=color, style="circle")
            # Make the circle transparent
            C.opacity = weight_opacity
            # And place it as a float face over the tree
            faces.add_face_to_node(C, node, 0, position="float")

        # Logical `and`: the bitwise `&` gives wrong answers for truthy
        # non-bool values (e.g. 2 & True == 0).
        if node_name_on and node.is_leaf():
            # Add the node name to leaf nodes
            N = AttrFace("name", fsize=name_fsize, fgcolor="black")
            faces.add_face_to_node(N, node, 0)

    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.mode = "c"
    ts.arc_start = arc_start
    ts.arc_span = arc_span
    ts.layout_fn = layout
    ts.branch_vertical_margin = branch_vertical_margin
    ts.show_scale = False

    if phylum_color_legend:
        for phylum_name in np.sort(list(phylum_color_dict.keys())):
            color_name = phylum_color_dict[phylum_name]
            ts.legend.add_face(CircleFace(weight_max_radios * 1, color_name),
                               column=0)
            ts.legend.add_face(TextFace(" %s" % phylum_name,
                                        fsize=name_fsize),
                               column=1)
    return t.render(file_name=file_name, w=img_w, tree_style=ts)


# #########################################################################################################################
# if __name__ == "__main__":
#     argdict = argv_parse(sys.argv)
#     try: gpu_memory_fraction = float(argdict['gpu_memory_fraction'][0])
#     except: gpu_memory_fraction = None
#     try: max_queue_size=int(argdict['max_queue_size'][0])
#     except: max_queue_size=10
#     try: workers=int(argdict['workers'][0])
#     except: workers=1
#     try: use_multiprocessing=argdict['use_multiprocessing'][0]=='True'
#     except: use_multiprocessing=False
#     ### Logger ############################################################################################
#     logger = logging_daily.logging_daily(argdict['log_info'][0])
#     logger.reset_logging()
#     log = logger.get_logging()
#     log.setLevel(logging_daily.logging.INFO)
#     log.info('Argument input')
#     for argname, arg in argdict.items():
#         log.info('    {}:{}'.format(argname,arg))
#     ### Configuration #####################################################################################
#     config_data = configuration.Configurator(argdict['path_info'][0], log)
#     config_data.set_config_map(config_data.get_section_map())
#     config_data.print_config_map()
#     config_network = configuration.Configurator(argdict['network_info'][0], log)
#     config_network.set_config_map(config_network.get_section_map())
#     config_network.print_config_map()
#     path_info = config_data.get_config_map()
#     network_info = config_network.get_config_map()
#     test_evaluation, train_evaluation, network = deepbiome_train(log, network_info, path_info, number_of_fold=20)
def plot_heat_tree(biodb, taxid2n, tree_file):
    '''
    Plot a one-column heatmap next to a midpoint-rooted tree.

    Each leaf gets one aligned cell showing its value from *taxid2n*
    (0 when the leaf is missing from the mapping). Cells with a value > 0
    have a blue background, the others a white one. Leaves are then
    relabelled with the genome description looked up for *biodb*.

    Parameters
    ----------
    biodb: str
        Name of the database, passed to ``manipulate_biosqldb.load_db`` and
        ``manipulate_biosqldb.taxon_id2genome_description``.
    taxid2n: mapping
        ``str(leaf name)`` -> numeric value to display.
    tree_file: str
        Path to the tree file in Newick format (anything accepted by
        ``Tree``). Leaf names must be convertible to ``int``: the
        description is looked up with ``int(leaf name)``.

    Returns
    -------
    tuple
        ``(tree, number_of_leaves)``.
    '''
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)
    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)
    # Root the tree on its midpoint.
    t1.set_outgroup(t1.get_midpoint_outgroup())

    leaf_number = 0
    for lf in t1.iter_leaves():
        leaf_number += 1
        lf.branch_vertical_margin = 0

        try:
            data = [taxid2n[str(lf.name)]]
        except (KeyError, TypeError):
            data = [0]

        lf.name = taxid2organism[int(lf.name)]

        for col, value in enumerate(data):
            # Both branches of the original built the same face and only
            # differed in the background colour.
            cell = TextFace(' %s ' % str(value))
            cell.margin_top = 2
            cell.margin_right = 2
            cell.margin_left = 2
            cell.margin_bottom = 2
            cell.inner_background.color = "#81BEF7" if value > 0 else "white"
            cell.opacity = 1.
            lf.add_face(cell, col, position="aligned")

    return t1, leaf_number
def _rotated_header_face(title):
    """Return the tilted, white-backed face used as an aligned column header."""
    face = TextFace(title)
    face.margin_top = 1
    face.margin_right = 1
    face.margin_left = 20
    face.margin_bottom = 1
    face.inner_background.color = "white"
    face.opacity = 1.
    face.rotation = -25
    return face


def plot_heatmap_tree_locus(biodb,
                            tree_file,
                            taxid2count,
                            taxid2identity=False,
                            taxid2locus=False,
                            reference_taxon=False,
                            n_paralogs_barplot=False):
    '''
    Plot a midpoint-rooted tree with a column giving the count of homologs.

    Optional columns:
        - barplot of the count relative to the largest count
        - identity of the closest homolog
        - locus tag of the closest homolog

    Parameters
    ----------
    biodb: str
        Database name, passed to ``manipulate_biosqldb``.
    tree_file: str
        Tree in any form accepted by ``Tree``.
    taxid2count: dict
        ``str(leaf name)`` -> number of homologs. NOTE: leaves missing from
        the dictionary are added to it with a count of 0 (the caller's
        dictionary is modified, as before).
    taxid2identity: mapping, optional
        ``str(leaf name)`` -> identity (0-100) of the closest homolog.
    taxid2locus: mapping, optional
        ``str(leaf name)`` -> locus tag of the closest homolog.
    reference_taxon: optional
        Leaf highlighted as the reference in the optional columns.
    n_paralogs_barplot: bool, optional
        Add the barplot column.

    Returns
    -------
    tuple
        ``(tree, number_of_leaves, tree_style)``.
    '''
    from chlamdb.biosqldb import manipulate_biosqldb

    server, db = manipulate_biosqldb.load_db(biodb)
    taxid2organism = manipulate_biosqldb.taxon_id2genome_description(
        server, biodb, True)

    t1 = Tree(tree_file)
    ts = TreeStyle()
    ts.draw_guiding_lines = True
    ts.guiding_lines_color = "gray"
    # Root the tree on its midpoint.
    t1.set_outgroup(t1.get_midpoint_outgroup())

    for lf in t1.iter_leaves():
        taxid2count.setdefault(str(lf.name), 0)
    max_count = max(taxid2count[str(lf.name)] for lf in t1.iter_leaves())

    if taxid2identity:
        # Built once instead of once per leaf; matplotlib is still only
        # imported when the identity column is requested.
        import matplotlib.cm as cm
        from matplotlib.colors import rgb2hex
        import matplotlib as mpl
        norm = mpl.colors.Normalize(vmin=0, vmax=100)
        m = cm.ScalarMappable(norm=norm, cmap=cm.OrRd)

    reference = str(reference_taxon)
    leaf_number = 0
    for i, lf in enumerate(t1.iter_leaves()):
        taxon = str(lf.name)

        # Top leaf: add the column headers.
        if i == 0:
            ts.aligned_header.add_face(
                _rotated_header_face('Number of homologs'), 1)
            if taxid2identity:
                ts.aligned_header.add_face(
                    _rotated_header_face('Protein identity'), 2)
            if taxid2locus:
                ts.aligned_header.add_face(
                    _rotated_header_face('Locus tag'), 3)

        leaf_number += 1
        lf.branch_vertical_margin = 0

        # Column 0: number of homologs.
        count = taxid2count[taxon]
        col_index = 0
        n = TextFace(' %s ' % str(count))
        n.margin_top = 2
        n.margin_right = 2
        n.margin_left = 20
        n.margin_bottom = 2
        n.inner_background.color = "white"
        n.opacity = 1.
        lf.add_face(n, 0, position="aligned")

        # Optionally indicate the number of paralogs as a barplot.
        if n_paralogs_barplot:
            col_index += 1
            # Guard against a division by zero when every count is 0.
            percent = (float(count) / max_count) * 100 if max_count else 0.0
            n = StackedBarFace([percent, 100 - percent],
                               width=150,
                               height=18,
                               colors=['#6699ff', 'white'],
                               line_color='white')
            n.rotation = 0
            n.inner_border.color = "white"
            n.inner_border.width = 0
            n.margin_right = 15
            n.margin_left = 0
            lf.add_face(n, 1, position="aligned")

        # Optionally add a column with the identity.
        if taxid2identity:
            try:
                identity = round(taxid2identity[taxon], 2)
                if identity != 100:
                    value = "%.2f" % identity
                else:
                    value = "%.1f" % identity
            except (KeyError, TypeError):
                value = '-'
            if taxon == reference:
                value = ' '
            n = TextFace(' %s ' % value)
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 20
            n.margin_bottom = 2
            # `!=` instead of `is not`: identity comparison with a string
            # literal only worked through CPython constant interning.
            if not value.isspace() and value != '-':
                n.inner_background.color = rgb2hex(m.to_rgba(float(value)))
                if float(value) > 82:
                    # Keep the text readable on the darkest backgrounds.
                    n.fgcolor = 'white'
            n.opacity = 1.
            if taxon == reference:
                n.inner_background.color = '#800000'
            lf.add_face(n, col_index + 1, position="aligned")

        # Optionally add a column with the locus name.
        if taxid2locus:
            try:
                value = str(taxid2locus[taxon])
            except (KeyError, TypeError):
                value = '-'
            n = TextFace(' %s ' % value)
            n.margin_top = 2
            n.margin_right = 2
            n.margin_left = 2
            n.margin_bottom = 2
            if taxon == reference:
                n.fgcolor = '#ff0000'
            n.inner_background.color = "white"
            n.opacity = 1.
            lf.add_face(n, col_index + 2, position="aligned")

        lf.name = taxid2organism[taxon]

    return t1, leaf_number, ts
"python3 draw_tanglegram.py -newick1 ./all_1469_new.newick -newick2 ./nxrA.newick -cf1 ./gene_annotation.txt -cf2 ./phylum_annotate.txt -length 'max' -sep '_' -extra_set 'rename' " import sys from .tanglegram import * from bin.format_newick import sort_tree from os.path import dirname, join, exists from ete3 import Tree import io # raw gene 2 species tree example_dir = r"D:\Desktop\OneDrive - The Chinese University of Hong Kong\luo lab\项目\AOB\whole_tree\nxrA" desktop_NOB_tmp = "/d/Desktop/NOB_HGT" gene_tree = join(example_dir, 'nxrA.newick') species_tree = join(example_dir, '..', 'all', 'all_1469_new.newick') species_tree = Tree(species_tree, format=3) for l in species_tree.get_leaves(): l.name = l.name.split('_')[-1].replace('.', 'v') gene_tree_colors = join(example_dir, 'gene_annotation.txt') species_tree_colors = join(example_dir, '..', 'all', 'phylum_annotate.txt') Angst_reconciles = sort_tree( Tree("D:/Desktop/NOB_HGT/angst/AnGST.newick", format=3)) fig = main(gene_tree, species_tree, gene_tree_colors, species_tree_colors, l_legnth='max', sep='_',
def plot_tree(canopy, folder='results', seed=0):
    """Rebuild the merge tree logged for *canopy* and display it.

    Reads ``<folder>/log_<canopy>_<seed>.csv`` (three comma-separated
    columns: the two merged node ids and a third value that is not used
    here) and ``data/rexa/<canopy>/gtClusters.tsv`` (the label of each
    point is taken from its second column). Every point starts as a child
    of the root, coloured by its label; each log row then groups two
    subtrees under a new internal node. Leaf branch lengths are finally
    set so that all leaves line up, and the tree is shown with ``t.show``.

    Args:
        canopy: name of the canopy, used to build both input paths.
        folder: directory containing the log file.
        seed: seed suffix of the log file name.
    """
    t = Tree()
    root_style = NodeStyle()
    root_style['fgcolor'] = 'black'
    root_style['size'] = 0
    t.set_style(root_style)

    def random_channel():
        # Named function: the original lambda `r` was later shadowed by a
        # loop variable of the same name.
        return np.random.randint(0, 255)

    def get_color():
        return '#%02X%02X%02X' % (random_channel(), random_channel(),
                                  random_channel())

    log_path = folder + '/log_' + canopy + '_' + str(seed) + '.csv'
    # The np.float / np.int aliases were removed in NumPy 1.24; the builtins
    # are what they aliased. ndmin=2 keeps single-row files two-dimensional
    # so the column slicing below still works.
    log = np.loadtxt(log_path, delimiter=',', dtype=float, ndmin=2)
    labels = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(canopy),
                        delimiter='\t',
                        dtype=int,
                        ndmin=2)[:, 1]

    n = int(np.max(log[:, :2])) + 1
    # Fixed seed so the label colours are reproducible between runs.
    np.random.seed(4)
    colors = [get_color() for _ in range(n)]

    nodes = {}
    for i in range(n - 1, -1, -1):
        node = t.add_child(name=str(i))
        leaf_style = NodeStyle()
        leaf_style['fgcolor'] = colors[labels[i]]
        leaf_style['size'] = 5
        node.set_style(leaf_style)
        nodes[i] = node

    for i, j, _link in log:
        i, j = int(i), int(j)
        depth_i = depth(nodes[i])
        depth_j = depth(nodes[j])
        par_i = nup(nodes[i], depth_i - 1)
        par_j = nup(nodes[j], depth_j - 1)
        if depth_i == depth_j:
            left = nodes[i].detach()
            right = nodes[j].detach()
        else:
            left = par_i.detach()
            right = par_j.detach()

        new_node = t.add_child()
        inner_style = NodeStyle()
        inner_style['fgcolor'] = 'black'
        inner_style['size'] = 0
        new_node.set_style(inner_style)
        new_node.add_child(left, name=str(i))
        new_node.add_child(right, name=str(j))

    # Stretch the leaf branches so every leaf ends at the same level.
    max_depth = 0
    for node in t.get_leaves():
        max_depth = max(max_depth, depth(node))
    max_depth += 1
    for node in t.get_leaves():
        node.dist = max_depth - depth(node)

    ts = TreeStyle()
    ts.show_leaf_name = True
    ts.rotation = 90
    ts.show_scale = False
    t.show(tree_style=ts)
#!/usr/bin/env python3
"""Print, in Newick format, a primate tree built node by node.

The branch lengths were retrieved from http://www.timetree.org/
"""
from ete3 import Tree

root = Tree(name="ancestry4")

# Children are attached in the same order as before: at every level the
# ancestral node comes first, then the species that branches off there.
ancestry3 = root.add_child(name="ancestry3", dist=12.9)
root.add_child(name="rheMac", dist=28.1)

ancestry2 = ancestry3.add_child(name="ancestry2", dist=6.59)
ancestry3.add_child(name="ponAbe", dist=15.2)

ancestry1 = ancestry2.add_child(name="ancestry1", dist=2.21)
ancestry2.add_child(name="gorGor", dist=8.61)

for species in ("hg", "panTro"):
    ancestry1.add_child(name=species, dist=6.4)

newick = root.write(features=[])
print(newick)
} sequence_data_path = replicate_output_dir + "sequence_data.fas" character_data_path = replicate_output_dir + "character_data.fas" history_tree_path = replicate_output_dir + "unlabeled_trait_history.nwk" true_history_path = replicate_output_dir + "labeled_trait_history.nwk" write_simulation_parameters(tree_path, model_parameters, sequence_data_path, character_data_path, history_tree_path, true_history_path, aln_len, simulation_parameters_path) simulation_output_log = replicate_output_dir + "simulator_log.txt" res = os.system(simulator_path + " param=" + simulation_parameters_path + " > " + simulation_output_log) # re-write the history path without internal nodes names history_tree = Tree(history_tree_path, format=1) history_tree.write(outfile=history_tree_path, format=5) # extract the labeling of nodes in the trait history for relax parameters and traitrelax debugging label_to_nodes = {"0": [], "1": []} true_history = Tree(true_history_path, format=1) node_index = 0 label_regex = re.compile("{(.*?)}") for node in true_history.traverse("postorder"): if not node.is_root(): node_id = node_index node_index += 1 node_label = label_regex.search(node.name).group(1) label_to_nodes[node_label].append(node_id) labels_str = "model1.nodes_id=" for i in range(len(label_to_nodes["0"]) - 1):
from ete3 import Tree

t = Tree('((H:0.3,I:0.1):0.5, A:1, (B:0.4,(C:1,D:1):0.5):0.5);')


# Create a small function to filter your nodes
def conditional_function(node):
    """Return True when the distance stored on *node* is above 0.3."""
    return node.dist > 0.3


# Use the previous function to find matches. traverse() iterates over all
# nodes, so every node is assessed against our custom condition. The result
# is wrapped in list() because filter() returns a lazy iterator on Python 3,
# which has no len().
matches = list(filter(conditional_function, t.traverse()))
# A single formatted string prints the same text on Python 2 and Python 3.
print("%d nodes have ditance >0.3" % len(matches))

# Depending on the complexity of your conditions you can do the same in just
# one line, here with a list comprehension:
matches = [n for n in t.traverse() if n.dist > 0.3 and n.is_leaf()]
print("%d nodes have ditance >0.3 and are leaves" % len(matches))
def scale_tree(input_tree_path, output_tree_path, scaling_factor=1.0):
    """Rescale every branch length of a tree file by a constant factor.

    The tree is loaded from *input_tree_path* with ``Tree(..., format=1)``,
    the ``dist`` of each traversed node is multiplied by *scaling_factor*,
    and the result is written to *output_tree_path* with the same format
    code.

    Args:
        input_tree_path: path of the tree file to read.
        output_tree_path: path the rescaled tree is written to.
        scaling_factor: multiplier applied to each node's ``dist``
            (default 1.0, i.e. branch lengths are left unchanged).
    """
    rescaled = Tree(input_tree_path, format=1)
    for branch in rescaled.traverse():
        branch.dist *= scaling_factor
    rescaled.write(outfile=output_tree_path, format=1)
from ete3 import Tree

# Let's create a simple tree (format=1 so the internal names G and O are read)
t = Tree('((((H,K),(F,I)G),E),((L,(N,Q)O),(P,S)));', format=1)
# print() in call form with a single argument behaves the same on
# Python 2 and Python 3; the statement form is a syntax error on Python 3.
print("Original tree looks like this:")
print(t)
#
#                                        /-H
#                              /--------|
#                             |          \-K
#                    /--------|
#                   |         |          /-F
#          /--------|          \--------|
#         |         |                    \-I
#         |         |
#         |          \-E
#---------|
#         |                    /-L
#         |          /--------|
#         |         |         |          /-N
#         |         |          \--------|
#          \--------|                    \-Q
#                   |
#                   |          /-P
#                    \--------|
#                              \-S
# Prune the tree in order to keep only some leaf nodes.
t.prune(["H", "F", "E", "Q", "P"])
print("Pruned tree")
print(t)
#
#                              /-F
from ete3 import Tree
import sys

# Load the tree whose path is given as the first command-line argument.
t = Tree(sys.argv[1])

# Names of the nodes yielded by iterating over the tree.
tips = [tip.name for tip in t]
# Set copy of the names so each membership test below is O(1).
tip_names = set(tips)

# Keep only the genome ids (one per line, surrounding whitespace stripped)
# that occur in the tree. The ``with`` blocks close the files once they have
# been read; they were previously left open for the life of the process.
with open("archaeagenomes.csv") as Archaea:
    archaea = [
        this_id
        for this_id in (line.strip() for line in Archaea)
        if this_id in tip_names
    ]

with open("bacteriagenomes.csv") as Bacteria:
    bacteria = [
        this_id
        for this_id in (line.strip() for line in Bacteria)
        if this_id in tip_names
    ]

all_taxa = archaea + bacteria

# The common ancestors are looked up before the tree is unrooted.
ancestorA = t.get_common_ancestor(archaea)
ancestorB = t.get_common_ancestor(bacteria)
ancestorR = t.get_common_ancestor(all_taxa)

t.unroot()

# Distance between the archaeal and the bacterial ancestor nodes.
ab_dist = ancestorA.get_distance(ancestorB)
def renderingTreeImage(self): path = os.path.join('Input', 'ProteinInput') seq_records = SeqIO.parse(path, 'fasta') for record in seq_records: self.input_protein_accession_number.append(record.id) self.input_protein_sequence.append(record.seq) with open(os.path.join('execs', 'tmp', "rooted_tree.nwk")) as nwk_tree_handle: nwk_tree = nwk_tree_handle.read() t = Tree(nwk_tree) print(t) print '\n' ts = TreeStyle() ts.title.add_face(TextFace( 'PhyloEpsilon - Protein Ortholog Finding Tool by Bryan Dighera', fsize=16, ), column=0) ts.allow_face_overlap = True ts.show_leaf_name = True ts.show_branch_support = True leaf_names = [] for leaf in t.get_leaf_names(): np_xp_pattern = re.compile('N[P]|X[P]') digits_pattern = re.compile('\d+.\d') np_xp_search_obj = re.search(np_xp_pattern, leaf) digits_search_obj = re.search(digits_pattern, leaf) np_xp_name = np_xp_search_obj.group() digits_name = digits_search_obj.group() final_accession = str(np_xp_name + '_' + digits_name) print final_accession leaf_names.append(final_accession) #print 'leaf names: ' + '%s' % leaf_names P = Protein() protein_domains, domain_colors, unrepeated_domains = P.Domains() print domain_colors #Creates a dictionary that corresponds the protein accession number to its corresponding introns for i in range(len(leaf_names)): self.accession_dict_with_introns[ self.input_protein_accession_number[i]] = self.exon_lengths[i] i = 0 print 'protein accession number: ' + '%s' % self.input_protein_accession_number print 'Accession dict: ' + '%s' % self.accession_dict_with_introns + '\n' #Iterates through the accession numbers that correspond the the order of the leaves of the phylogenetic tree to retrieve introns and build fig for accession_number in leaf_names: intron_motifs = [[0, 0, "[]", None, 12, "White", "White", None]] #Checks the accession number against the dictionary and retrieves the corresponding introns, if no introns then doesn't append any if accession_number in self.accession_dict_with_introns: print 
accession_number, self.accession_dict_with_introns[ accession_number] exon_list = self.accession_dict_with_introns[accession_number] print exon_list for exon_length in exon_list: if str(exon_length) != 'NONE': for location in exon_length: split_exon_location = str(location).split('-') protein_seq_exon_location = int( math.floor(int(split_exon_location[1]) / 3)) #Calculates the intron phase and then checks the phase to append appropriate color indicating phase on diagram intron_phase = (int(split_exon_location[1]) - int(split_exon_location[0])) % 3 if intron_phase == 0: intron_motifs.append([ protein_seq_exon_location - 2, protein_seq_exon_location + 2, "[]", None, 5, "Grey", "Grey", None ]) elif intron_phase == 1: intron_motifs.append([ protein_seq_exon_location - 2, protein_seq_exon_location + 2, "[]", None, 5, "Black", "Black", None ]) elif intron_phase == 2: intron_motifs.append([ protein_seq_exon_location - 2, protein_seq_exon_location + 2, "[]", None, 5, "Blue", "Blue", None ]) else: print 'NO INTRONS FOUND FOR RECORD' print str(intron_motifs) + '\n' msa_protein_seq = self.msa_aligned_protein[i].strip('-') #ete3 module that adds the introns(motifs) to the phylogenetic tree seqFace = SeqMotifFace(str(msa_protein_seq), gapcolor="black", seq_format='line', scale_factor=1, motifs=intron_motifs) (t & t.get_leaf_names()[i]).add_face(seqFace, 0, "aligned") i += 1 n = 0 # Iterates through the accession numbers that correspond to the order of the leaves of the phylogenetic tree and compare to domain dict values # TODO: Add the legend and possibly give a number to each of the domains so they can be easily identified in the legend for accession_number in leaf_names: domain_motifs = [[0, 0, "[]", None, 12, "White", "White", None]] for domain in protein_domains: if accession_number in domain: print 'leaf accession #: ' + '%s' % accession_number print 'domains accession: ' + '%s' % domain.keys()[0] print domain.values()[0] for each_domain in domain.values()[0]: try: 
domain_motif_color = domain_colors[each_domain[0]] start_domain_loc = int( each_domain[1].split(':')[0]) end_domain_loc = int(each_domain[1].split(':')[1]) domain_name = str(each_domain[0]) domain_motifs.append([ start_domain_loc, end_domain_loc, "<>", 20, 20, 'Black', domain_motif_color, 'arial|8|black|' ]) except ValueError: domain_motif_color = domain_colors[each_domain[0]] start_pattern = re.compile('(?<!=\W)\d+') start_pattern_search = re.search( start_pattern, str(each_domain[1].split(':')[0])) start_domain_loc = int( start_pattern_search.group()) end_pattern = re.compile('(?<!=\W)\d+') end_pattern_search = re.search( end_pattern, str(each_domain[1].split(':')[1])) end_domain_loc = int(end_pattern_search.group()) domain_motifs.append([ start_domain_loc, end_domain_loc, "<>", 20, 20, 'Black', domain_motif_color, 'arial|8|black|' ]) print domain_motifs msa_protein_seq = self.msa_aligned_protein[n].strip('-') print msa_protein_seq print len(msa_protein_seq) print '*' * 100 domainFace = SeqMotifFace(str(msa_protein_seq), gapcolor="black", seq_format='line', scale_factor=1, motifs=domain_motifs) (t & t.get_leaf_names()[n]).add_face(domainFace, 0, "aligned") n += 1 #Creating the legend print protein_domains for single_unrepeat, colors in domain_colors.iteritems(): ts.legend.add_face(TextFace(single_unrepeat), column=0) ts.legend.add_face(SeqMotifFace( "A" * 45, [[0, 80, "[]", None, 8, "Black", colors, None]]), column=1) ts.legend_position = 1 #name_of_run = nameOfRun() file_name = self.run_name t.show(tree_style=ts) t.render(os.path.join('CompletedTrees', file_name + '.pdf'), tree_style=ts)
breakdown[info[tax_level]] = 1.0 / float(total_size) return breakdown #compute the most frequent sister group of each (monophyletic?) group on the tree, to identify trends in gene transfers, "unstable" taxa, etc. labels = {} name_to_tax_info = defaultdict(dict) taxa_names = [] summary = defaultdict(dict) groups = [] clades_per_group = defaultdict(list) target_label = 'cluster' #edit this to make the comparisons at a desired taxonomic level #read the ML tree, set up the taxonomy stuff, and calculate the number of clades per label, and the sizes of those clades (to report at the end) ml_tree = Tree(sys.argv[1]) for leaf in ml_tree: taxonomy = parse_taxonomy(leaf.name) name_to_tax_info[leaf.name] = taxonomy taxa_names.append(leaf.name) leaf.add_feature("tax", taxonomy[target_label]) labels[taxonomy[target_label]] = 1 groups = labels.keys() #compute the number of clades per label in the ML tree, and their sizes ML_groups = defaultdict( list ) #the list is the size of each clade, len(list) is the number of clades for that label in the ML tree for label in groups: for node in ml_tree.get_monophyletic(values=[label], target_attr="tax"): size_clade = 0
leaf_style["hz_line_color"] = "black" leaf_style["hz_line_width"] = 5 leaf_style["vt_line_color"] = "black" leaf_style["vt_line_width"] = 5 bg_color = color_dict[phylum] leaf_style["bgcolor"] = bg_color leaf.set_style(leaf_style) leaf_face = TextFace(leaf_name.strip("'"), fsize=20) leaf.add_face(leaf_face, 0, 'aligned') break except TreeError: leaf_name = "'%s'" % leaf_name time_redo -= 1 continue if __name__ == '__main__': params = read_params(sys.argv) mkdir(params['outdir']) t = Tree(params['newick'], format=1) # t.show(tree_style=ts) color_dict, genus_in_phylum = get_dict(params['tax_ass']) set_default_node_style(t) set_leaf_style(genus_in_phylum, t) ts = get_default_tree_style(color_dict) pdf_file = '%s/phylo_tree.pdf' % params['outdir'] png_file = '%s/phylo_tree.png' % params['outdir'] t.render(pdf_file, tree_style=ts, dpi=300) image_trans(pdf_file, png_file)
if "--all" in sys.argv: tree_file_list = [ t for t in glob("trees/HIV1_FLT_2018_genome_DNA_mask*.fa.treefile") ] plot_prefix = "whole_v_allmasks" if "--mask-as-ref" in sys.argv: tree_file_list = [ref_tree_file] ref_tree_file = "trees/HIV1_FLT_2018_genome_DNA_mask100.fa.treefile" plot_prefix = "masked_v_whole" ref_bs_label = "Masked Alignment Bootstrap" tree_bs_label = "Whole Alignment Bootstrap" pct_masks = [100] tree_orientation = 1 ref_tree = Tree(ref_tree_file, format=1) ref_tree.set_outgroup(outgroup) add_support_and_subtypes(ref_tree) mask_regex = r"mask(\d+)" shared_edge_support_values = [] ref_only_edge_support_values = [] for tree_file in tree_file_list: pct_mask_match = re.search(r"mask(\d+)", tree_file) if pct_mask_match is not None: pct_mask = int(pct_mask_match.groups()[0]) else: pct_mask = 0 print("Adding {} bootstrap values to tree from {}...".format( tree_file, ref_tree_file))
stack.append(dictionary[current][3]) result.append("(") else: result.append(current) current_prev = current result.pop() result.append(")") return result if __name__ == "__main__": matrix, length = readInput() dictionary = {} finalCluster = upgma(matrix, length, dictionary) result = printCluster(dictionary, finalCluster) result = ''.join(result) result = result + ";" #ete3 is tool for pylogenetic tree construction from ete3 import Tree tree = Tree(result) print("UPGMA Resultant Clustering:") print("") print(result) print("") print(tree)
if line.startswith('LTR'): # record name, divergence, and scaled divergence (IGC) (rt_name, classification, I, clust, clustSize, model, div, divc, IGC) = line.strip().split('\t') divergences[rt_name] = div divergencesCorrected[rt_name] = divc IGCdct[rt_name] = IGC classifDct[rt_name] = classification # generate color gradient for representing divergence values as colored # circles at tips of leaves in the rendered tree num_colors = 20000 # yellow, red, blue, black color_gradient = polylinear_gradient( ['#FAFF00', '#FF1800', '#001EFF', '#000000'], num_colors) # load the newick tree t = Tree(tree_flpath) # for marking which elements did not have information in the divergences # file for coloring the circles white NOLTRDIVERGENCES = False # record the greatest divergence value for automatically setting the # outgroup as the element with the most divergent LTRs (estimating the # branch containing the oldest element as the first split in the tree) greatest_div = {'element': None, 'value': 0} # scale bootstrap values to percentages for node in t.traverse(): node.support = node.support * 100 # assign the colors for the node circles based on divergence for node in t: node_name = str(node).split('-')[-1] rt_name = 'LTR_retrotransposon{0}'.format(node_name.split('_')[0]) if rt_name in divergences:
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    build neighbor joining tree of DNA seqs with PHYLIP in EMBOSS

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    Pipeline: write the alignment in PHYLIP format, resample it with
    fseqboot (100 replicates, fixed seed), compute distances with fdnadist,
    build trees with fneighbor, summarise them with fconsense, then fit
    branch lengths on the consensus topology from the distances of the
    original (non-resampled) alignment via ``run_ffitch``.

    Args:
        alignment: alignment object handed to ``AlignIO.write``.
        outfile: path the final tree is written to. When rerooting yields
            the requested ``.rooted`` file, ``".unrooted"`` is removed from
            this path before writing.
        outgroup: passed to ``smart_reroot`` when truthy; when falsy the
            tree is not rerooted.
        work_dir: directory whose ``work`` subdirectory receives all
            intermediate files (this function does not create it).

    Returns:
        ``(outfile, phy_file)`` when a non-empty tree file was produced,
        otherwise ``None`` (also returned when the alignment cannot be
        written because of repeated sequence names).
    """
    phy_file = op.join(work_dir, "work", "aln.phy")
    try:
        # ``open`` inside ``with`` replaces the Python 2-only ``file()``
        # builtin and guarantees the handle is flushed and closed before the
        # external PHYLIP programs read the file.
        with open(phy_file, "w") as phy_handle:
            AlignIO.write(alignment, phy_handle, "phylip")
    except ValueError:
        print(
            "Repeated seq name, possibly due to truncation. NJ tree not built.",
            file=sys.stderr,
        )
        return None

    # Every intermediate file shares the alignment path minus its extension.
    prefix = phy_file.rsplit(".", 1)[0]

    seqboot_out = prefix + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(
        FPHYLIP_BIN("fseqboot"),
        sequence=phy_file,
        outfile=seqboot_out,
        seqtype="d",
        reps=100,
        seed=12345,
    )
    seqboot_cl()
    logging.debug("Resampling alignment: %s" % seqboot_cl)

    dnadist_out = prefix + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                     sequence=seqboot_out,
                                     outfile=dnadist_out,
                                     method="f")
    dnadist_cl()
    logging.debug("Calculating distance for bootstrapped alignments: %s" %
                  dnadist_cl)

    neighbor_out = prefix + ".njtree"
    neighbor_log = prefix + ".fneighbor"
    neighbor_cl = FNeighborCommandline(
        FPHYLIP_BIN("fneighbor"),
        datafile=dnadist_out,
        outfile=neighbor_log,
        outtreefile=neighbor_out,
    )
    neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s" % neighbor_cl)

    consense_out = prefix + ".consensustree.nodesupport"
    consense_log = prefix + ".fconsense"
    consense_cl = FConsenseCommandline(
        FPHYLIP_BIN("fconsense"),
        intreefile=neighbor_out,
        outfile=consense_log,
        outtreefile=consense_out,
    )
    consense_cl()
    logging.debug("Building consensus tree: %s" % consense_cl)

    # distance without bootstrapping
    dnadist_out0 = prefix + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
                                      sequence=phy_file,
                                      outfile=dnadist_out0,
                                      method="f")
    dnadist_cl0()
    logging.debug("Calculating distance for original alignment: %s" %
                  dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = prefix + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0,
               outtreefile=consensustree1,
               intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        # NOTE(review): smart_reroot presumably returns the path it actually
        # wrote; only when that is the requested rooted file is the
        # ".unrooted" marker dropped from the output name -- confirm.
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # Copy node support from the consensus tree onto the branch-length tree,
    # matching internal nodes by their sorted set of leaf names.
    # NOTE(review): assumes the consensus tree carries replicate counts
    # (out of 100) in its branch lengths -- confirm against fconsense output.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted(leaf.name for leaf in node))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist / 100.0

    for leaf_names, support in nodesupport.items():
        ct_b.get_common_ancestor(*leaf_names).support = support
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    # A missing or empty output file both count as failure.
    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s" % outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
from ete3 import Tree import os import re treefilename = input("Enter master tree name: ") dirname = input("Enter tree file directory: ") supertree = Tree(treefilename) exists = False checkname = '' newcheckname = '' mastertreenamelist = [] for node in supertree.traverse("postorder"): mastertreenamelist.append(node.name) dirlist = os.listdir(dirname) for filename in dirlist: currenttree = Tree(dirname+filename) for node in currenttree.traverse("postorder"): #extracts species name from node for l in range(len(node.name)): if node.name[l] == "_": newcheckname += "_" break else: newcheckname += node.name[l] #finds species in mastertree if (newcheckname in mastertreenamelist): exists = True
print("reading input reconciled trees.") spTree = None isUndated = False IN = open(params["-g"], "r") l = IN.readline() while l != "": if l != "\n": if l.startswith("("): ##special ignore white lines ALEtree = Tree(l, format=1) RT = ALEtreeToReconciledTree(ALEtree, isUndated=isUndated) if isUndated: refineReconciledTreeWithTransferBack(RT) ConvertRTtoLossIndepVersion(RT, speciesTree=spTree, keptChildNameSuffix=".c") XMLlines = RT.getTreeRecPhyloXMLLines() for xmlline in XMLlines: OUT.write(indentLevel * indentChar + xmlline + "\n")
from ete3 import Tree
# Minimal ete3 demo: build a tree, print it, then display it.
# Start from an empty tree (no Newick string supplied).
t = Tree()
# Grow the tree; per the ete3 API, populate(15) generates a random topology
# with 15 leaves under this node.
t.populate(15)
# Print the tree to stdout.
print(t)
# Display the tree (ete3 opens its interactive viewer, which needs a GUI).
t.show()
def getTheTrees():
    """Build the NCBI taxonomy as a dict of linked ete3 nodes keyed by taxid.

    Prerequisites (files are read relative to the working directory):
    download the NCBI taxdump and store it in the ``taxo`` folder, and
    download TAXREF by hand and put it in ``taxo/`` as well.

    Files read:
        * ``taxo/TAXREFv11.txt`` -- tab-separated; column 14 is used as the
          scientific name and column 19 as a French common name.
        * ``taxo/ranks.txt`` -- tab-separated: English rank, French rank.
        * ``taxo/names.dmp`` -- ``|``-separated: taxid, name, (unused),
          name type.
        * ``taxo/nodes.dmp`` -- ``|``-separated: taxid, parent taxid, rank.
          Its first line is skipped.

    Returns:
        dict mapping taxid (str) to an ete3 ``Tree`` node carrying the
        attributes ``name``, ``taxid``, ``sci_name``, ``common_name`` (list),
        ``synonym``, ``authority``, ``common_name_FR`` (list) and, once the
        node has been seen as a child, ``rank`` (plus ``rank_FR`` when the
        node was first created as a child). Each child is attached to its
        parent with ``add_child``.

    Raises:
        KeyError: a taxid of ``nodes.dmp`` is absent from ``names.dmp``, or a
            rank is absent from ``taxo/ranks.txt``.
        IndexError: a line has fewer columns than the ones read above.
    """

    class Trans:
        """French common names collected for one scientific name."""

        def __init__(self):
            self.common_name_FR = []

    class Taxid:
        """Name attributes gathered from names.dmp for one taxid."""

        def __init__(self):
            self.sci_name = ""
            self.authority = ""
            self.synonym = ""
            self.common_name = []
            self.common_name_FR = []

    # ``print(...)`` with a single argument and the ``in`` operator behave
    # the same on Python 2 and Python 3; the print statement and
    # ``dict.has_key`` only exist on Python 2.
    print("Getting french translations...")
    TRANS = {}  ##translations in french
    with open("taxo/TAXREFv11.txt") as f:
        for line in f:
            cols = line.split("\t")
            sciname = cols[14]
            comnameFR = cols[19]
            # Rows without a French name never create or extend an entry.
            if comnameFR != '':
                if sciname not in TRANS:
                    TRANS[sciname] = Trans()
                TRANS[sciname].common_name_FR.append(comnameFR)

    #get translation of ranks
    print("\nGetting rank names in french...")
    RANKS = {}
    with open("taxo/ranks.txt") as f:
        for line in f:
            cols = line.split("\t")
            rank_en = cols[0]
            rank_fr = cols[1].rstrip()  ##to remove \n
            RANKS[rank_en] = rank_fr

    ATTR = {}  ##here we will list attribute of each species per taxid
    print("Reading NCBI taxonomy...")
    with open("taxo/names.dmp") as f:
        for line in f:
            cols = line.split("|")
            taxid = cols[0].replace("\t", "")
            tid_val = cols[1].replace("\t", "")
            tid_type = cols[3].replace("\t", "")
            if taxid not in ATTR:
                ATTR[taxid] = Taxid()
            entry = ATTR[taxid]
            if tid_type == "scientific name":
                entry.sci_name = tid_val
                #and get translation in french (if any)
                if tid_val in TRANS:
                    # The list object is shared with TRANS, not copied.
                    entry.common_name_FR = TRANS[tid_val].common_name_FR
            elif tid_type == "authority":
                # Several authorities are joined into one ", "-separated string.
                if entry.authority != "":
                    entry.authority = entry.authority + ", " + tid_val
                else:
                    entry.authority = tid_val
            elif tid_type == "synonym":
                if entry.synonym != "":
                    entry.synonym = entry.synonym + ", " + tid_val
                else:
                    entry.synonym = tid_val
            elif tid_type in ("common name", "genbank common name"):
                entry.common_name.append(tid_val)

    T = {}
    ###New gettrees
    from ete3 import Tree

    def _new_node(taxid):
        """Create a node for *taxid* and copy its names.dmp attributes."""
        node = Tree()
        node.name = taxid
        node.taxid = taxid
        attrs = ATTR[taxid]
        node.sci_name = attrs.sci_name
        node.common_name = attrs.common_name
        node.synonym = attrs.synonym
        node.authority = attrs.authority
        node.common_name_FR = attrs.common_name_FR
        return node

    print("Building the NCBI taxonomy tree...")
    with open('taxo/nodes.dmp') as fp:
        fp.readline()  ## remove the 1 | 1 edge
        for line in fp:
            cols = line.split("|")
            dad = cols[1].replace("\t", "")
            son = cols[0].replace("\t", "")
            rank = cols[2].replace("\t", "")
            # A node first met as a parent has no rank yet: the rank on this
            # line belongs to the child.
            if dad not in T:
                T[dad] = _new_node(dad)
            if son not in T:
                child = _new_node(son)
                child.rank = rank
                child.rank_FR = RANKS[rank]
                T[son] = child
            elif not hasattr(T[son], 'rank'):
                # The node was created earlier as somebody's parent.
                # NOTE(review): rank_FR is not set on this path (it was
                # already disabled in the original code) -- confirm intended.
                T[son].rank = rank
            T[dad].add_child(T[son])
    return T