def populate_leaves(new_clade, taxid, lineages):
    """Attach one leaf to ``new_clade`` for every lineage that contains ``taxid``.

    Args:
        new_clade: Node that receives the new leaves through ``add_child``.
        taxid: Identifier searched for in each lineage with ``in``.
        lineages: Mapping of leaf name -> lineage (a container of taxids).

    Returns:
        The ``new_clade`` object that was passed in, after the leaves were
        added to it.
    """
    for leaf_name, leaf_lineage in lineages.items():
        if taxid in leaf_lineage:
            leaf = ete3.PhyloNode(name=leaf_name)
            # The lineage travels with the leaf as its ``ancestors`` attribute.
            leaf.ancestors = leaf_lineage
            new_clade.add_child(leaf)
    return new_clade
def read_treefile(g):
    """Load the rooted tree referenced by ``g`` and store it back into ``g``.

    Reads ``g['rooted_tree_file']`` and writes ``g['rooted_tree']`` (after
    passing it through ``tree.standardize_node_names`` and
    ``tree.add_numerical_node_labels``) and ``g['num_node']`` (the number of
    nodes visited by ``traverse()``).

    Args:
        g: Parameter dictionary; must contain ``'rooted_tree_file'``.

    Returns:
        The same dictionary ``g`` with the keys above filled in.

    Raises:
        AssertionError: If the root of the loaded tree does not have exactly
            two children.
    """
    rooted_tree_path = g['rooted_tree_file']
    g['rooted_tree'] = ete3.PhyloNode(rooted_tree_path, format=1)

    root_children = g['rooted_tree'].get_children()
    assert len(root_children) == 2, \
        'The input tree may be unrooted: {}'.format(g['rooted_tree_file'])

    g['rooted_tree'] = tree.standardize_node_names(g['rooted_tree'])
    g['rooted_tree'] = tree.add_numerical_node_labels(g['rooted_tree'])

    all_nodes = list(g['rooted_tree'].traverse())
    g['num_node'] = len(all_nodes)

    message = ('Using internal node names and branch lengths in --iqtree_treefile '
               'and the root position in --rooted_tree_file.')
    print(message)
    return g
def get_misc_node_statistics(tree_file, tax_annot=False):
    """Build a per-node table of topology and species-overlap statistics.

    The tree is loaded with ``ete3.PhyloNode(tree_file, format=1)`` and
    labelled with ``add_numerical_node_labels``. One row is produced per node,
    in ``tree.traverse()`` order.

    Columns:
        numerical_label, taxon, taxid: copied from the node attributes
            ``numerical_label``, ``sci_name`` and ``taxid``.
        num_sp: number of distinct ``sci_name`` values among the node's leaves.
        num_leaf: number of leaves below the node.
        so_event: ``"L"`` for leaves; for internal nodes ``"D"`` when
            ``dup_conf_score > 0`` and ``"S"`` when it equals 0
            (presumably duplication / speciation / leaf -- confirm with caller).
        dup_conf_score: for internal nodes, the number of species names shared
            by the first two child subtrees divided by the number of species
            names in their union; 0 for leaves.
        parent, sister, child1, child2: numerical labels of the related nodes,
            ``-999`` when not set.
        so_event_parent: ``"D"`` when the parent's ``dup_conf_score > 0``,
            otherwise ``"S"``.

    Args:
        tree_file: Newick input accepted by ``ete3.PhyloNode``.
        tax_annot: If true, node ``taxid``/``sci_name`` come from
            ``taxonomic_annotation``; otherwise ``taxid`` is ``-999`` and
            ``sci_name`` is derived from the leaf name (empty for internal
            nodes).

    Returns:
        pandas.DataFrame with the columns above and a 0..n-1 index.
    """
    tree = ete3.PhyloNode(tree_file, format=1)
    tree = add_numerical_node_labels(tree)
    columns = [
        "numerical_label", "taxon", "taxid", "num_sp", "num_leaf",
        "so_event", "dup_conf_score",
        "parent", "sister", "child1", "child2", "so_event_parent",
    ]

    if tax_annot:
        tree = taxonomic_annotation(tree)
    else:
        for node in tree.traverse():
            node.taxid = -999
            if node.is_leaf():
                # First "_" becomes a space, everything from the next "_" on
                # is dropped: "Genus_species_gene" -> "Genus species".
                node.sci_name = re.sub('_.*', '', node.name.replace('_', ' ', 1))
            else:
                node.sci_name = ''

    # Rows are collected as plain dicts and turned into a DataFrame once.
    # Writing strings/floats cell by cell into an int-initialised frame relies
    # on implicit upcasting, which pandas has deprecated.
    records = []
    for node in tree.traverse():
        record = {
            "numerical_label": node.numerical_label,
            "taxon": node.sci_name,
            "taxid": node.taxid,
            "num_sp": len({leaf.sci_name for leaf in node.iter_leaves()}),
            "num_leaf": len(node.get_leaves()),
            "so_event": "L",
            "dup_conf_score": 0.0,
            "parent": -999,
            "sister": -999,
            "child1": -999,
            "child2": -999,
            "so_event_parent": "S",
        }

        parent = node.up
        if hasattr(parent, "numerical_label"):
            record["parent"] = parent.numerical_label

        sisters = node.get_sisters()
        if len(sisters) == 1:
            record["sister"] = sisters[0].numerical_label

        if not node.is_leaf():
            # Only the first two children are considered.
            child1, child2 = node.children[0], node.children[1]
            record["child1"] = child1.numerical_label
            record["child2"] = child2.numerical_label
            species1 = {leaf.sci_name for leaf in child1.iter_leaves()}
            species2 = {leaf.sci_name for leaf in child2.iter_leaves()}
            node.dup_conf_score = len(species1 & species2) / len(species1 | species2)
            record["dup_conf_score"] = node.dup_conf_score
            if node.dup_conf_score > 0:
                record["so_event"] = "D"
            elif node.dup_conf_score == 0:
                record["so_event"] = "S"

        # Reads the score stored on the parent earlier in this same loop, so
        # it relies on traverse() yielding a parent before its children
        # (NOTE(review): ete3's default traversal is level-order -- confirm).
        if parent is not None and parent.dup_conf_score > 0:
            record["so_event_parent"] = "D"

        records.append(record)

    return pandas.DataFrame(records, columns=columns)
def make_sisters(tree, clade1, clade2):
    """Move ``clade1`` so that it shares a new parent node with ``clade2``.

    Both clades are located with ``get_clade``. The tree is modified in place
    and nothing is returned.

    Args:
        tree: Tree that contains both clades.
        clade1: Clade specification (as understood by ``get_clade``) of the
            subtree to move.
        clade2: Clade specification of the subtree that stays in place and
            receives ``clade1`` as its sister.
    """
    # Cut out clade1, then call delete() on the parent it was attached to.
    moved = get_clade(tree, clade1)
    former_parent = moved.up
    moved.detach()
    former_parent.delete()

    # Cut out clade2, remembering the node it was hanging from.
    target = get_clade(tree, clade2)
    attachment_point = target.up
    target.detach()

    # A fresh node takes clade2's old place and holds both subtrees,
    # clade2 first and clade1 second.
    joint = ete3.PhyloNode()
    attachment_point.add_child(joint)
    joint.add_child(target)
    joint.add_child(moved)
def taxid2tree(lineages, taxid_counts):
    """Assemble a single tree from lineages, one clade per multiply-counted taxid.

    Args:
        lineages: Mapping of leaf name -> lineage, forwarded to
            ``populate_leaves``.
        taxid_counts: 2-D array indexed as ``[row, 0]`` = taxid and
            ``[row, 1]`` = count. Only rows with a count greater than 1 are
            used.

    Returns:
        The single clade left after every new clade has been passed through
        ``add_new_clade``.

    Raises:
        AssertionError: If the merged clade list does not contain exactly one
            element.
    """
    ncbi = ete3.NCBITaxa()
    shared_rows = taxid_counts[taxid_counts[:, 1] > 1, :]

    clades = []
    for taxid in shared_rows[:, 0]:
        lineage_of_taxid = ncbi.get_lineage(taxid)
        clade = ete3.PhyloNode()
        clade.ancestors = lineage_of_taxid
        clade = populate_leaves(clade, taxid, lineages)
        clades = add_new_clade(clades, clade)

    assert len(clades) == 1, 'Failed to merge clades into a single tree.'
    return clades[0]
def _set_codon_information(g):
    """Fill ``g`` with the codon bookkeeping derived from ``g['input_state']``.

    Writes ``state_columns``, ``codon_orders``, ``amino_acid_orders``,
    ``matrix_groups``, ``synonymous_indices`` and ``max_synonymous_size``.
    Entries of ``g['codon_table']`` are read as ``entry[0]`` = amino acid and
    ``entry[1]`` = codon; entries whose amino acid is ``'*'`` are skipped.
    """
    states = g['input_state']
    g['state_columns'] = list(itertools.product(numpy.arange(len(states)), repeat=3))
    g['codon_orders'] = [c[0] + c[1] + c[2] for c in itertools.product(states, repeat=3)]

    amino_acids = sorted({entry[0] for entry in g['codon_table'] if entry[0] != '*'})
    g['amino_acid_orders'] = amino_acids

    # Keys follow the sorted amino-acid order so that dict iteration order is
    # reproducible between runs (iterating a set of strings is not).
    matrix_groups = {aa: [] for aa in amino_acids}
    for entry in g['codon_table']:
        if entry[0] != '*':
            matrix_groups[entry[0]].append(entry[1])
    g['matrix_groups'] = matrix_groups

    # Codon -> amino acid lookup; if a codon were listed under several amino
    # acids the first one in key order wins.
    codon_to_aa = {}
    for aa, codons in matrix_groups.items():
        for codon in codons:
            codon_to_aa.setdefault(codon, aa)

    # For every amino acid, the positions of its codons within codon_orders.
    synonymous_indices = {aa: [] for aa in matrix_groups}
    for position, codon in enumerate(g['codon_orders']):
        aa = codon_to_aa.get(codon)
        if aa is not None:
            synonymous_indices[aa].append(position)
    g['synonymous_indices'] = synonymous_indices
    g['max_synonymous_size'] = max(len(indices) for indices in synonymous_indices.values())


def get_input_information(g):
    """Read the tree and state table found in ``g['phylobayes_dir']`` into ``g``.

    The directory listing is searched for the first file whose name contains
    ``"_sample.labels"`` (loaded as the tree) and for files ending in
    ``".ancstatepostprob"`` (the first one is read as a tab-separated table).

    Keys written to ``g``:
        tree, num_node: labelled tree and its node count.
        num_input_site: number of rows of the state table.
        num_input_state, input_state: number and names of the columns after
            the first two.
        num_ancstatepostprob_file: number of ``.ancstatepostprob`` files.
        input_data_type: ``'nuc'`` for 4 states, ``'pep'`` for 20, ``'cdn'``
            for more than 20; left untouched for any other count.
        When the data type is ``'nuc'`` and ``g['calc_omega']`` is true, the
        codon-related keys documented in ``_set_codon_information``.

    Args:
        g: Parameter dictionary; must contain ``'phylobayes_dir'`` and, for
            nucleotide input, ``'calc_omega'`` (plus ``'codon_table'`` when
            that flag is true).

    Returns:
        The same dictionary ``g``.

    Raises:
        IndexError: If the directory holds no ``_sample.labels`` or no
            ``.ancstatepostprob`` file.
    """
    input_dir = g['phylobayes_dir']
    files = os.listdir(input_dir)

    # os.path.join keeps working when the directory is given without a
    # trailing separator; plain string concatenation did not.
    sample_labels = [name for name in files if "_sample.labels" in name][0]
    g['tree'] = ete3.PhyloNode(os.path.join(input_dir, sample_labels), format=1)
    g['tree'] = tree.add_numerical_node_labels(g['tree'])
    g['num_node'] = len(list(g['tree'].traverse()))

    state_files = [name for name in files if name.endswith('.ancstatepostprob')]
    state_table = pandas.read_csv(os.path.join(input_dir, state_files[0]),
                                  sep="\t", index_col=False, header=0)
    g['num_input_site'] = state_table.shape[0]
    # The first two columns are not states.
    g['num_input_state'] = state_table.shape[1] - 2
    g['input_state'] = state_table.columns[2:].tolist()
    g['num_ancstatepostprob_file'] = len(state_files)

    if g['num_input_state'] == 4:
        g['input_data_type'] = 'nuc'
    elif g['num_input_state'] == 20:
        g['input_data_type'] = 'pep'
    elif g['num_input_state'] > 20:
        g['input_data_type'] = 'cdn'

    # Logical ``and`` rather than bitwise ``&``: the flag is a truth value, and
    # ``True & 2`` would evaluate to 0.
    if g['input_data_type'] == 'nuc' and g['calc_omega']:
        _set_codon_information(g)
    return g
def annotate_tree(g):
    """Load the node-labelled tree and re-root it like ``g['rooted_tree']``.

    Keys written to ``g``:
        node_label_tree_file: copy of ``g['iqtree_treefile']``.
        node_label_tree: tree parsed from the first line of that file, passed
            through ``tree.standardize_node_names``.
        tree: result of ``tree.transfer_root`` from ``g['rooted_tree']`` onto
            ``node_label_tree``, passed through
            ``tree.add_numerical_node_labels``.

    The summed ``dist`` of all nodes is printed for both input trees.

    Args:
        g: Parameter dictionary; must contain ``'iqtree_treefile'`` and
            ``'rooted_tree'``.

    Returns:
        The same dictionary ``g``.
    """
    g['node_label_tree_file'] = g['iqtree_treefile']
    # ``with`` guarantees the handle is closed even if reading raises.
    # Only the first line of the file is used.
    with open(g['node_label_tree_file']) as f:
        tree_string = f.readline()
    g['node_label_tree'] = ete3.PhyloNode(tree_string, format=1)
    g['node_label_tree'] = tree.standardize_node_names(g['node_label_tree'])
    g['tree'] = tree.transfer_root(tree_to=g['node_label_tree'],
                                   tree_from=g['rooted_tree'], verbose=False)
    g['tree'] = tree.add_numerical_node_labels(g['tree'])
    print('Total branch length of --rooted_tree_file:',
          sum([n.dist for n in g['rooted_tree'].traverse()]))
    print('Total branch length of --iqtree_treefile:',
          sum([n.dist for n in g['node_label_tree'].traverse()]))
    print('')
    return g
def nwk2table(tree, mode=['branch_length', 'branch_support', 'node_name'], age=False, parent=False, sister=False):
    """Tabulate one per-node attribute of a tree, one row per node.

    Args:
        tree: Newick input (``str``, parsed with ``ete3.PhyloNode``) or an
            already parsed tree object. The tree is passed through
            ``add_numerical_node_labels``.
        mode: Which attribute to tabulate:
            ``'branch_length'`` (node ``dist``, newick format 1),
            ``'branch_support'`` (node ``support``, newick format 0) or
            ``'node_name'`` (node ``name``, newick format 1).
            The signature default is the list of choices; when a list or
            tuple is received its first element is used. The default list is
            never mutated.
        age: With ``mode='branch_length'``, add an ``age`` column holding the
            distance from each node to its first leaf. The tree must pass
            ``check_ultrametric``.
        parent: Add a ``parent`` column with the parent's numerical label
            (``-1`` for the root).
        sister: Add a ``sister`` column with the first sister's numerical
            label (``-1`` for the root or a node without sisters).

    Returns:
        pandas.DataFrame with columns ``numerical_label``, ``<mode>`` and the
        requested extras, sorted by ``numerical_label``. In
        ``'branch_support'`` mode a support of exactly 1 is reported as NaN.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
        AssertionError: If ``age`` is requested and ``check_ultrametric``
            returns a false value.
    """
    # mode -> (newick format used when parsing a string, node attribute read)
    mode_settings = {
        'branch_length': (1, 'dist'),
        'branch_support': (0, 'support'),
        'node_name': (1, 'name'),
    }
    if isinstance(mode, (list, tuple)):
        mode = mode[0] if mode else None
    if mode not in mode_settings:
        raise ValueError(
            'mode must be one of {}, got {!r}'.format(sorted(mode_settings), mode))
    tree_format, attr = mode_settings[mode]

    if isinstance(tree, str):
        tree = ete3.PhyloNode(tree, format=tree_format)
    tree = add_numerical_node_labels(tree)

    # Columns are built from lists in traversal order and assigned whole,
    # instead of writing cell by cell into an int-initialised frame (which
    # depends on deprecated implicit upcasting) or matching rows by label.
    nodes = list(tree.traverse())
    df = pandas.DataFrame({
        'numerical_label': [node.numerical_label for node in nodes],
        mode: [getattr(node, attr) for node in nodes],
    })

    if mode == 'branch_support':
        df['branch_support'] = df['branch_support'].mask(df['branch_support'] == 1)

    if mode == 'branch_length' and age:
        assert check_ultrametric(tree)
        df['age'] = [node.get_distance(target=node.get_leaves()[0]) for node in nodes]

    if parent:
        df['parent'] = [-1 if node.is_root() else node.up.numerical_label
                        for node in nodes]

    if sister:
        sister_labels = []
        for node in nodes:
            sisters = [] if node.is_root() else node.get_sisters()
            sister_labels.append(sisters[0].numerical_label if sisters else -1)
        df['sister'] = sister_labels

    df = df.sort_values(by='numerical_label', ascending=True)
    return df
def ou2table(regime_file, leaf_file, input_tree_file):
    """Combine regime and leaf tables with a tree into one per-node table.

    Args:
        regime_file: Tab-separated file; the columns ``node_name`` and
            ``regime`` are read.
        leaf_file: Tab-separated file; the columns ``param``, ``regime`` and
            every column from the fourth onward are read (``label`` is
            excluded when present).
        input_tree_file: Newick input for ``ete3.PhyloNode(..., format=1)``.

    Returns:
        pandas.DataFrame with one row per node (in ``tree.traverse()`` order)
        and the columns ``numerical_label``, ``regime``, ``is_shift``,
        ``num_child_shift``, ``tau``, ``delta_tau``, ``delta_maxmu``,
        ``mu_complementarity`` and one ``mu_<name>`` column per column of the
        leaf table from the fourth onward.
    """
    df_regime = pandas.read_csv(regime_file, sep="\t")
    df_leaf = pandas.read_csv(leaf_file, sep="\t")
    tree = ete3.PhyloNode(input_tree_file, format=1)
    tree = add_numerical_node_labels(tree)
    # Every column from the fourth onward is treated as one "tissue".
    tissues = df_leaf.columns[3:]
    # Rows labelled 'expectations' are handled as 'mu' rows from here on.
    if ('expectations' in df_leaf['param'].values):
        df_leaf.loc[(df_leaf['param'] == 'expectations'), 'param'] = 'mu'
    cn1 = ["numerical_label", "regime", "is_shift", "num_child_shift"]
    cn2 = ["tau", "delta_tau", "delta_maxmu", "mu_complementarity"]
    cn3 = ["mu_" + tissue for tissue in tissues]
    cn = cn1 + cn2 + cn3
    # NOTE(review): the frame starts as all-integer zeros and receives floats
    # through .loc below; this depends on pandas upcasting on assignment.
    df = pandas.DataFrame(0, index=range(0, len(list(tree.traverse()))), columns=cn)
    # Every node starts in regime 0.
    for node in tree.traverse():
        node.regime = 0
    # A node named in the regime table passes its regime to its whole subtree.
    # Nodes visited later overwrite what an earlier node assigned
    # (presumably descendants override ancestors -- depends on traversal order).
    for node in tree.traverse():
        if node.name in df_regime.loc[:, "node_name"].fillna(value="placeholder_text").values:
            regime_nos = df_regime.loc[df_regime.node_name.values == node.name, "regime"]
            regime_no = int(regime_nos.iloc[0])
            for sub_node in node.traverse():
                sub_node.regime = regime_no
    # One row of mean 'mu' values per regime ('label' and 'param' dropped).
    df_leaf_unique = df_leaf.loc[
        df_leaf['param'] == 'mu', df_leaf.columns[[c not in ['label', 'param'] for c in df_leaf.columns]]]
    df_leaf_unique = df_leaf_unique.drop_duplicates()
    df_leaf_unique = df_leaf_unique.groupby(by='regime').mean()
    df_leaf_unique = df_leaf_unique.reset_index()
    # Each node keeps the row(s) of its own regime as ``node.mu``.
    for node in tree.traverse():
        node.mu = df_leaf_unique.loc[(df_leaf_unique['regime'] == node.regime), :]
    # First pass: label, regime, mu values and the shift flag of every node.
    row = 0
    for node in tree.traverse():
        df.loc[row, "numerical_label"] = node.numerical_label
        df.loc[row, "regime"] = node.regime
        # Raises IndexError if no 'mu' row exists for this node's regime.
        df.loc[row, cn3] = node.mu.loc[:, tissues].values[0]
        # A node is a shift when its regime differs from its parent's.
        is_shift = 0
        if not node.is_root():
            if node.regime != node.up.regime:
                is_shift = 1
        df.loc[row, "is_shift"] = is_shift
        row += 1
    # calc_tau is defined elsewhere; its exact formula is not visible here.
    df["tau"] = calc_tau(df, cn3, unlog2=True, unPlus1=True)
    # Second pass: values that compare a node with its parent, sister or children.
    row = 0
    for node in tree.traverse():
        # Disabled code kept from the original (it filled a "highest_mu" column):
        # highest_value = df.loc[df.numerical_label==node.numerical_label,cn3].max(axis=1).values
        # is_highest = df.loc[df.numerical_label==node.numerical_label,cn3].values.reshape(-1) == numpy.float(highest_value)
        # highest_in = numpy.array(cn3)[is_highest][0].replace("mu_", "")
        # df.loc[df.numerical_label==node.numerical_label,"highest_mu"] = highest_in
        if not node.is_root():
            # delta_tau: this node's tau minus its parent's tau.
            tau_up = df.loc[df.numerical_label == node.up.numerical_label, "tau"].values
            tau_my = df.loc[df.numerical_label == node.numerical_label, "tau"].values
            df.loc[df.numerical_label == node.numerical_label, "delta_tau"] = tau_my - tau_up
            # Sister comparisons are only computed for shift nodes.
            if df.loc[df.numerical_label == node.numerical_label, "is_shift"].values:
                my_label = node.numerical_label
                sis_label = node.get_sisters()[0].numerical_label
                # delta_maxmu: largest mu of this node minus largest mu of its sister.
                my_maxmu = df.loc[df.numerical_label == my_label, cn3].max(axis=1).values
                sis_maxmu = df.loc[df.numerical_label == sis_label, cn3].max(axis=1).values
                delta_maxmu = my_maxmu - sis_maxmu
                df.loc[df.numerical_label == node.numerical_label, "delta_maxmu"] = delta_maxmu
                my_mu = df.loc[df.numerical_label == my_label, cn3]
                sis_mu = df.loc[df.numerical_label == sis_label, cn3]
                # 2**mu - 1, floored at 0 (presumably undoing a log2(x+1)
                # transform -- confirm against the data source).
                my_mu_unlog = (numpy.exp2(my_mu) - 1).clip(lower=0).values[0]
                sis_mu_unlog = (numpy.exp2(sis_mu) - 1).clip(lower=0).values[0]
                df.loc[df.numerical_label == node.numerical_label, "mu_complementarity"] = calc_complementarity(
                    my_mu_unlog, sis_mu_unlog)
        if not node.is_leaf():
            # Number of the first two children whose regime differs from this node's.
            is_child1_shift = (node.regime != node.get_children()[0].regime)
            is_child2_shift = (node.regime != node.get_children()[1].regime)
            num_child_shift = sum([is_child1_shift, is_child2_shift])
            df.loc[row, "num_child_shift"] = num_child_shift
        row += 1
    return (df)
def get_tree_height(tree_file):
    """Return the distance from the root of a tree to its first leaf.

    Args:
        tree_file: Newick input for ``ete3.PhyloNode(..., format=1)``.

    Returns:
        The value of ``get_distance`` between the root and the first element
        of ``get_leaves()``.
    """
    root = ete3.PhyloNode(tree_file, format=1)
    first_leaf = root.get_leaves()[0]
    return root.get_distance(target=first_leaf)