Example #1
0
def populate_leaves(new_clade, taxid, lineages):
    """Attach one leaf to ``new_clade`` per species whose lineage contains ``taxid``.

    Args:
        new_clade: Node that receives the new leaves through ``add_child``.
        taxid: Identifier searched for in every lineage with ``in``.
        lineages: Mapping from species name to that species' lineage
            (a container of identifiers).

    Returns:
        The same ``new_clade`` object, after the matching leaves were added.
    """
    for species in lineages.keys():
        species_lineage = lineages[species]
        if taxid in species_lineage:
            # The leaf is named after the species and keeps its whole lineage.
            leaf = ete3.PhyloNode(name=species)
            leaf.ancestors = species_lineage
            new_clade.add_child(leaf)
    return new_clade
Example #2
0
def read_treefile(g):
    """Load the rooted tree referenced by ``g`` and store it, labelled, in ``g``.

    Reads ``g['rooted_tree_file']`` and writes ``g['rooted_tree']`` and
    ``g['num_node']``.

    Args:
        g: Parameter dictionary; must contain ``'rooted_tree_file'``.

    Returns:
        The same dictionary ``g`` with the tree and its node count added.

    Raises:
        AssertionError: If the root of the parsed tree does not have exactly
            two children.
    """
    tree_path = g['rooted_tree_file']
    g['rooted_tree'] = ete3.PhyloNode(tree_path, format=1)
    num_root_children = len(g['rooted_tree'].get_children())
    assert num_root_children == 2, 'The input tree may be unrooted: {}'.format(tree_path)
    g['rooted_tree'] = tree.standardize_node_names(g['rooted_tree'])
    g['rooted_tree'] = tree.add_numerical_node_labels(g['rooted_tree'])
    # Count every node (internal and leaf) of the labelled tree.
    g['num_node'] = sum(1 for _ in g['rooted_tree'].traverse())
    message = ('Using internal node names and branch lengths in --iqtree_treefile '
               'and the root position in --rooted_tree_file.')
    print(message)
    return g
Example #3
0
def get_misc_node_statistics(tree_file, tax_annot=False):
    """Tabulate per-node topology and duplication/speciation calls for a tree.

    The tree is parsed with ``ete3.PhyloNode(tree_file, format=1)`` and
    labelled by ``add_numerical_node_labels``. One row is produced per node,
    in ``tree.traverse()`` order.

    Args:
        tree_file: Value passed to ``ete3.PhyloNode`` as the tree source.
        tax_annot: If true, ``taxonomic_annotation`` supplies ``sci_name`` and
            ``taxid`` for the nodes. Otherwise ``taxid`` is set to -999 and the
            leaf ``sci_name`` is derived from the leaf name (first underscore
            becomes a space, everything from the next underscore on is
            dropped); internal nodes get an empty ``sci_name``.

    Returns:
        pandas.DataFrame with the columns ``numerical_label``, ``taxon``,
        ``taxid``, ``num_sp``, ``num_leaf``, ``so_event``, ``dup_conf_score``,
        ``parent``, ``sister``, ``child1``, ``child2`` and
        ``so_event_parent``. Missing parent/sister/children are reported as
        -999. ``so_event`` is ``"L"`` for leaves and ``"D"``/``"S"`` for
        internal nodes whose ``dup_conf_score`` is positive/zero;
        ``so_event_parent`` is ``"D"`` when the parent's score is positive and
        ``"S"`` otherwise.
    """
    tree = ete3.PhyloNode(tree_file, format=1)
    tree = add_numerical_node_labels(tree)
    cn1 = ["numerical_label", "taxon", "taxid", "num_sp", "num_leaf", "so_event", "dup_conf_score"]
    cn2 = ["parent", "sister", "child1", "child2", "so_event_parent"]
    cn = cn1 + cn2
    if tax_annot:
        tree = taxonomic_annotation(tree)
    else:
        for node in tree.traverse():
            node.taxid = -999
            if node.is_leaf():
                node.sci_name = re.sub('_.*', '', node.name.replace('_', ' ', 1))
            else:
                node.sci_name = ''
    # Rows are collected as plain dicts and turned into a frame once, so each
    # column gets its natural dtype instead of strings/floats being written
    # into an integer-initialised frame cell by cell.
    records = []
    for node in tree.traverse():
        record = {
            "numerical_label": node.numerical_label,
            "taxon": node.sci_name,
            "taxid": node.taxid,
            "num_sp": len({leaf.sci_name for leaf in node.iter_leaves()}),
            "num_leaf": len(node.get_leaves()),
            "so_event": "L",
            "dup_conf_score": 0,
            "parent": -999,
            "sister": -999,
            "child1": -999,
            "child2": -999,
            "so_event_parent": "S",
        }
        if hasattr(node.up, "numerical_label"):
            record["parent"] = node.up.numerical_label
        sisters = node.get_sisters()
        if len(sisters) == 1:
            record["sister"] = sisters[0].numerical_label
        if not node.is_leaf():
            # Only the first two children are considered.
            child1 = node.children[0]
            child2 = node.children[1]
            record["child1"] = child1.numerical_label
            record["child2"] = child2.numerical_label
            sp_child1 = {leaf.sci_name for leaf in child1.iter_leaves()}
            sp_child2 = {leaf.sci_name for leaf in child2.iter_leaves()}
            # Species overlap between the two child clades:
            # |intersection| / |union| of their species sets.
            num_union = len(sp_child1.union(sp_child2))
            num_intersection = len(sp_child1.intersection(sp_child2))
            node.dup_conf_score = num_intersection / num_union
            record["dup_conf_score"] = node.dup_conf_score
            if node.dup_conf_score > 0:
                record["so_event"] = "D"
            elif node.dup_conf_score == 0:
                record["so_event"] = "S"
        if node.up is not None:
            # Relies on traverse() yielding a parent before its children, so
            # that the parent's dup_conf_score has already been stored.
            if node.up.dup_conf_score > 0:
                record["so_event_parent"] = "D"
        records.append(record)
    return pandas.DataFrame(records, columns=cn)
Example #4
0
def make_sisters(tree, clade1, clade2):
    """Move ``clade1`` so that it becomes the sister of ``clade2``.

    The tree is modified in place and nothing is returned.

    Args:
        tree: Tree that contains both clades; passed to ``get_clade``.
        clade1: Description of the clade to move, as understood by ``get_clade``.
        clade2: Description of the clade that receives the new sister.
    """
    # Prune the clade to move, then drop the parent it was hanging from.
    moved_clade = get_clade(tree, clade1)
    former_parent = moved_clade.up
    moved_clade.detach()
    former_parent.delete()

    # The target is looked up only after the pruning above has been applied.
    target_clade = get_clade(tree, clade2)
    target_parent = target_clade.up
    target_clade.detach()

    # A fresh node takes the target's old place and holds both clades.
    joint_node = ete3.PhyloNode()
    target_parent.add_child(joint_node)
    joint_node.add_child(target_clade)
    joint_node.add_child(moved_clade)
Example #5
0
def taxid2tree(lineages, taxid_counts):
    """Build a single tree from the taxids that occur more than once.

    Args:
        lineages: Mapping from species name to lineage, forwarded to
            ``populate_leaves``.
        taxid_counts: Two-dimensional array indexed as ``[row, 0]`` for the
            taxid and ``[row, 1]`` for its count.

    Returns:
        The one clade left after every per-taxid clade went through
        ``add_new_clade``.

    Raises:
        AssertionError: If the clades did not collapse into exactly one tree.
    """
    ncbi = ete3.NCBITaxa()
    # Keep only the rows whose count exceeds one.
    shared_rows = taxid_counts[taxid_counts[:, 1] > 1, :]
    clades = []
    for row in range(shared_rows.shape[0]):
        taxid = shared_rows[row, 0]
        taxid_lineage = ncbi.get_lineage(taxid)
        clade = ete3.PhyloNode()
        clade.ancestors = taxid_lineage
        clade = populate_leaves(clade, taxid, lineages)
        clades = add_new_clade(clades, clade)
    assert len(clades) == 1, 'Failed to merge clades into a single tree.'
    return clades[0]
Example #6
0
def get_input_information(g):
    """Collect tree and state-table metadata from a PhyloBayes output directory.

    Reads ``g['phylobayes_dir']`` and, for nucleotide input, ``g['calc_omega']``
    and ``g['codon_table']``. Writes ``tree``, ``num_node``,
    ``num_input_site``, ``num_input_state``, ``input_state``,
    ``num_ancstatepostprob_file`` and ``input_data_type`` into ``g``. When the
    input is nucleotide data and ``calc_omega`` is truthy it additionally
    writes ``state_columns``, ``codon_orders``, ``amino_acid_orders``,
    ``matrix_groups``, ``synonymous_indices`` and ``max_synonymous_size``.

    Args:
        g: Parameter dictionary. ``g['phylobayes_dir']`` may be given with or
            without a trailing path separator.

    Returns:
        The same dictionary ``g`` with the keys above filled in.

    Raises:
        IndexError: If the directory holds no ``*_sample.labels`` or no
            ``*.ancstatepostprob`` file.
    """
    phylobayes_dir = g['phylobayes_dir']
    files = os.listdir(phylobayes_dir)
    sample_labels = [name for name in files if "_sample.labels" in name][0]
    # os.path.join instead of string concatenation so that a directory given
    # without a trailing separator still yields a valid path.
    g['tree'] = ete3.PhyloNode(os.path.join(phylobayes_dir, sample_labels), format=1)
    g['tree'] = tree.add_numerical_node_labels(g['tree'])
    g['num_node'] = len(list(g['tree'].traverse()))
    state_files = [name for name in files if name.endswith('.ancstatepostprob')]
    state_table = pandas.read_csv(os.path.join(phylobayes_dir, state_files[0]),
                                  sep="\t",
                                  index_col=False,
                                  header=0)
    g['num_input_site'] = state_table.shape[0]
    # The first two columns are not states; every later column is one state.
    g['num_input_state'] = state_table.shape[1] - 2
    g['input_state'] = state_table.columns[2:].tolist()
    g['num_ancstatepostprob_file'] = len(state_files)
    # Any other state count leaves g['input_data_type'] untouched.
    if g['num_input_state'] == 4:
        g['input_data_type'] = 'nuc'
    elif g['num_input_state'] == 20:
        g['input_data_type'] = 'pep'
    elif g['num_input_state'] > 20:
        g['input_data_type'] = 'cdn'
    if g['input_data_type'] == 'nuc' and g['calc_omega']:
        g['state_columns'] = list(
            itertools.product(numpy.arange(len(g['input_state'])), repeat=3))
        # Every ordered triplet of input states, concatenated into one string.
        g['codon_orders'] = [
            c[0] + c[1] + c[2]
            for c in itertools.product(g['input_state'], repeat=3)
        ]
        # Codon-table entries are read as (amino acid, codon); '*' is skipped.
        amino_acids = sorted({c[0] for c in g['codon_table'] if c[0] != '*'})
        g['amino_acid_orders'] = amino_acids
        # Iterate the sorted list (not a set) so the key order of the
        # resulting dictionaries is the same on every run.
        matrix_groups = {
            aa: [c[1] for c in g['codon_table'] if c[0] == aa]
            for aa in amino_acids
        }
        g['matrix_groups'] = matrix_groups
        # Map each codon to the first amino acid group that lists it.
        codon_to_aa = {}
        for aa, codons in matrix_groups.items():
            for codon in codons:
                codon_to_aa.setdefault(codon, aa)
        synonymous_indices = {aa: [] for aa in matrix_groups}
        for i, codon in enumerate(g['codon_orders']):
            aa = codon_to_aa.get(codon)
            if aa is not None:
                synonymous_indices[aa].append(i)
        g['synonymous_indices'] = synonymous_indices
        g['max_synonymous_size'] = max(
            len(si) for si in synonymous_indices.values())
    return g
Example #7
0
def annotate_tree(g):
    """Combine the node labels of the IQ-TREE tree with the root of the rooted tree.

    Reads ``g['iqtree_treefile']`` and ``g['rooted_tree']``. Writes
    ``g['node_label_tree_file']``, ``g['node_label_tree']`` and ``g['tree']``
    and prints the summed branch lengths of both input trees.

    Args:
        g: Parameter dictionary holding the keys named above.

    Returns:
        The same dictionary ``g`` with the new keys added.
    """
    g['node_label_tree_file'] = g['iqtree_treefile']
    # Only the first line of the file is parsed. The context manager closes
    # the handle even when reading fails.
    with open(g['node_label_tree_file']) as f:
        tree_string = f.readline()
    g['node_label_tree'] = ete3.PhyloNode(tree_string, format=1)
    g['node_label_tree'] = tree.standardize_node_names(g['node_label_tree'])
    g['tree'] = tree.transfer_root(tree_to=g['node_label_tree'],
                                   tree_from=g['rooted_tree'],
                                   verbose=False)
    g['tree'] = tree.add_numerical_node_labels(g['tree'])
    print('Total branch length of --rooted_tree_file:',
          sum(n.dist for n in g['rooted_tree'].traverse()))
    print('Total branch length of --iqtree_treefile:',
          sum(n.dist for n in g['node_label_tree'].traverse()))
    print('')
    return g
Example #8
0
def nwk2table(tree, mode=['branch_length', 'branch_support', 'node_name'], age=False, parent=False, sister=False):
    """Convert a tree into a table with one row per node.

    The tree is passed through ``add_numerical_node_labels`` and every node
    contributes its ``numerical_label`` plus the attribute selected by
    ``mode``.

    Args:
        tree: Either a string handed to ``ete3.PhyloNode`` (parsed with
            ``format=0`` for ``'branch_support'`` and ``format=1`` otherwise)
            or an already-built tree object, which is used as is.
        mode: ``'branch_length'`` (reads ``node.dist``), ``'branch_support'``
            (reads ``node.support``) or ``'node_name'`` (reads ``node.name``).
            The list default only enumerates the choices; a single string has
            to be passed.
        age: With ``mode='branch_length'``, add an ``age`` column holding the
            distance from each node to its first leaf.
        parent: Add a ``parent`` column with the parent's numerical label
            (-1 for the root).
        sister: Add a ``sister`` column with the first sister's numerical
            label (-1 for the root).

    Returns:
        pandas.DataFrame sorted by ``numerical_label``. In ``'branch_support'``
        mode, support values equal to 1 are reported as NaN.

    Raises:
        ValueError: If ``mode`` is not one of the three supported strings.
        AssertionError: If ``age`` is requested and ``check_ultrametric``
            rejects the tree.
    """
    # mode -> (newick format used when parsing a string, node attribute read)
    mode_settings = {
        'branch_length': (1, 'dist'),
        'branch_support': (0, 'support'),
        'node_name': (1, 'name'),
    }
    # Fail early with a clear message; previously an unsupported mode only
    # surfaced later as an unbound-variable error.
    if not isinstance(mode, str) or mode not in mode_settings:
        raise ValueError(
            'mode must be one of {}, got {!r}'.format(sorted(mode_settings), mode))
    tree_format, attr = mode_settings[mode]
    if isinstance(tree, str):
        tree = ete3.PhyloNode(tree, format=tree_format)
    tree = add_numerical_node_labels(tree)
    # Row i of the table describes the i-th node of this traversal.
    nodes = list(tree.traverse())
    values = [getattr(node, attr) for node in nodes]
    if mode == 'branch_support':
        values = [numpy.nan if value == 1 else value for value in values]
    df = pandas.DataFrame(
        {
            'numerical_label': [node.numerical_label for node in nodes],
            mode: values,
        },
        columns=['numerical_label', mode],
    )
    if mode == 'branch_length' and age:
        assert check_ultrametric(tree)
        df['age'] = [node.get_distance(target=node.get_leaves()[0]) for node in nodes]
    if parent:
        df['parent'] = [
            -1 if node.is_root() else node.up.numerical_label
            for node in nodes
        ]
    if sister:
        df['sister'] = [
            -1 if node.is_root() else node.get_sisters()[0].numerical_label
            for node in nodes
        ]
    df = df.sort_values(by='numerical_label', ascending=True)
    return df
Example #9
0
def ou2table(regime_file, leaf_file, input_tree_file):
    """Summarise per-node regimes and regime means (mu) in one table.

    Args:
        regime_file: Tab-separated file with at least the columns
            ``node_name`` and ``regime``.
        leaf_file: Tab-separated file with the columns ``regime``, ``param``
            and one column per tissue from the fourth column on; a ``label``
            column, if present, is ignored for the means.
        input_tree_file: Tree source passed to ``ete3.PhyloNode`` with
            ``format=1``.

    Returns:
        pandas.DataFrame with one row per node (in ``tree.traverse()`` order)
        and the columns ``numerical_label``, ``regime``, ``is_shift``,
        ``num_child_shift``, ``tau``, ``delta_tau``, ``delta_maxmu``,
        ``mu_complementarity`` and one ``mu_<tissue>`` column per tissue.
    """
    regime_table = pandas.read_csv(regime_file, sep="\t")
    leaf_table = pandas.read_csv(leaf_file, sep="\t")
    tree = ete3.PhyloNode(input_tree_file, format=1)
    tree = add_numerical_node_labels(tree)
    tissues = leaf_table.columns[3:]
    # Rows labelled 'expectations' are treated as 'mu' rows.
    if 'expectations' in leaf_table['param'].values:
        leaf_table.loc[leaf_table['param'] == 'expectations', 'param'] = 'mu'
    mu_columns = ["mu_" + tissue for tissue in tissues]
    all_columns = (
        ["numerical_label", "regime", "is_shift", "num_child_shift"]
        + ["tau", "delta_tau", "delta_maxmu", "mu_complementarity"]
        + mu_columns
    )
    num_nodes = len(list(tree.traverse()))
    df = pandas.DataFrame(0, index=range(0, num_nodes), columns=all_columns)

    # Every node starts in regime 0; a node listed in the regime table then
    # passes its regime on to its whole subtree.
    for node in tree.traverse():
        node.regime = 0
    listed_names = regime_table.loc[:, "node_name"].fillna(value="placeholder_text").values
    for node in tree.traverse():
        if node.name in listed_names:
            matching_regimes = regime_table.loc[regime_table.node_name.values == node.name, "regime"]
            subtree_regime = int(matching_regimes.iloc[0])
            for descendant in node.traverse():
                descendant.regime = subtree_regime

    # Mean of the 'mu' rows per regime, leaving out 'label' and 'param'.
    value_columns = [c for c in leaf_table.columns if c not in ['label', 'param']]
    regime_mu = leaf_table.loc[leaf_table['param'] == 'mu', value_columns]
    regime_mu = regime_mu.drop_duplicates().groupby(by='regime').mean().reset_index()
    for node in tree.traverse():
        node.mu = regime_mu.loc[(regime_mu['regime'] == node.regime), :]

    for row, node in enumerate(tree.traverse()):
        df.loc[row, "numerical_label"] = node.numerical_label
        df.loc[row, "regime"] = node.regime
        df.loc[row, mu_columns] = node.mu.loc[:, tissues].values[0]
        # A shift is a non-root node whose regime differs from its parent's.
        differs_from_parent = (not node.is_root()) and (node.regime != node.up.regime)
        df.loc[row, "is_shift"] = 1 if differs_from_parent else 0

    df["tau"] = calc_tau(df, mu_columns, unlog2=True, unPlus1=True)

    for row, node in enumerate(tree.traverse()):
        if not node.is_root():
            own_rows = df.numerical_label == node.numerical_label
            parent_rows = df.numerical_label == node.up.numerical_label
            tau_up = df.loc[parent_rows, "tau"].values
            tau_my = df.loc[own_rows, "tau"].values
            df.loc[own_rows, "delta_tau"] = tau_my - tau_up
            if df.loc[own_rows, "is_shift"].values:
                # Shift nodes are additionally compared with their first sister.
                sister_rows = df.numerical_label == node.get_sisters()[0].numerical_label
                my_maxmu = df.loc[own_rows, mu_columns].max(axis=1).values
                sis_maxmu = df.loc[sister_rows, mu_columns].max(axis=1).values
                df.loc[own_rows, "delta_maxmu"] = my_maxmu - sis_maxmu
                my_mu = df.loc[own_rows, mu_columns]
                sis_mu = df.loc[sister_rows, mu_columns]
                # Undo the log2(x + 1) scale and floor the result at zero.
                my_mu_unlog = (numpy.exp2(my_mu) - 1).clip(lower=0).values[0]
                sis_mu_unlog = (numpy.exp2(sis_mu) - 1).clip(lower=0).values[0]
                df.loc[own_rows, "mu_complementarity"] = calc_complementarity(
                    my_mu_unlog, sis_mu_unlog)
        if not node.is_leaf():
            # Only the first two children are inspected.
            children = node.get_children()
            first_child_shifted = (node.regime != children[0].regime)
            second_child_shifted = (node.regime != children[1].regime)
            df.loc[row, "num_child_shift"] = sum((first_child_shifted, second_child_shifted))
    return df
Example #10
0
def get_tree_height(tree_file):
    """Return the distance from the root to the first leaf of a tree.

    Args:
        tree_file: Tree source passed to ``ete3.PhyloNode`` with ``format=1``.

    Returns:
        The value of ``get_distance`` between the root and the first entry of
        ``get_leaves()``.
    """
    root = ete3.PhyloNode(tree_file, format=1)
    first_leaf = root.get_leaves()[0]
    return root.get_distance(target=first_leaf)