Example #1
0
def _read_newick_tree(path, taxon_namespace):
    """Parse the Newick file at ``path`` into a DendroPy tree.

    Args:
        path: Location of a text file holding one Newick string.
        taxon_namespace: ``dendropy.TaxonNamespace`` the parsed tree is
            attached to. Trees that are compared with each other must
            share the same namespace.

    Returns:
        The ``dendropy.Tree`` built from the file contents.
    """
    with open(path, 'r') as input_file:
        newick_str = input_file.read()
    return dendropy.Tree.get(data=newick_str,
                             schema="newick",
                             taxon_namespace=taxon_namespace)


def _topology_to_dendropy_tree(topology, taxon_namespace):
    """Convert one inferred topology array into a DendroPy tree.

    The array is loaded into the project's ``Tree`` helper and the helper's
    ``newick`` attribute is then parsed by DendroPy.
    """
    helper = Tree()
    helper.load_tree_from_direct_arrays(topology)
    return dendropy.Tree.get(data=helper.newick,
                             schema="newick",
                             taxon_namespace=taxon_namespace)


def main(directory='/Users/filipbergentoft/Desktop/Github/DD2434/Assignment 2/2_4/'):
    """Demonstrate DendroPy tree generation, loading and RF comparison.

    The walkthrough prints everything to stdout:

    1. simulate two birth-death trees and print/plot them,
    2. print the Robinson-Foulds (RF) distance of a tree to itself and to
       the other simulated tree,
    3. load three reference trees from Newick files and compare them
       pairwise,
    4. load the inferred topologies, convert them to DendroPy trees and
       compare every reference tree with every inferred tree,
    5. print bipartition-level diagnostics for reference tree 0.

    Args:
        directory: Folder that contains the ``data/q2_4`` input files.
            Defaults to the location used by the original author, so
            calling ``main()`` without arguments behaves as before.
    """
    # Local imports keep this block self-contained; the rest of the file
    # does not need these names.
    import os
    from itertools import combinations

    rf_distance = dendropy.calculate.treecompare.symmetric_difference
    data_dir = os.path.join(directory, "data", "q2_4")
    num_trees = 3  # number of reference / inferred trees examined below

    print("Hello World!")
    print(
        "This file demonstrates the usage of the DendroPy module and its functions."
    )

    print("\n1. Tree Generations\n")
    print("\n1.1. Create two random birth-death trees and print them:\n")

    # If you want to compare two trees, make sure you specify the same
    # Taxon Namespace!
    tns = dendropy.TaxonNamespace()
    num_leaves = 5

    t1 = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0,
                                                    death_rate=0.5,
                                                    num_extant_tips=num_leaves,
                                                    taxon_namespace=tns)
    t2 = dendropy.simulate.treesim.birth_death_tree(birth_rate=1.0,
                                                    death_rate=0.2,
                                                    num_extant_tips=num_leaves,
                                                    taxon_namespace=tns)
    print("\tTree 1: ", t1.as_string("newick"))
    t1.print_plot()
    print("\tTree 2: ", t2.as_string("newick"))
    t2.print_plot()

    print("\n2. Compare Trees\n")
    print(
        "\n2.1. Compare tree with itself and print Robinson-Foulds (RF) distance:\n"
    )
    print("\tRF distance: \t", rf_distance(t1, t1))

    print(
        "\n2.2. Compare different trees and print Robinson-Foulds (RF) distance:\n"
    )
    print("\tRF distance: \t", rf_distance(t1, t2))

    print("\n3. Load Trees from Newick Files and Compare:\n")
    print("\n3.1 Load trees from Newick files:\n")

    # A fresh namespace shared by all reference and inferred trees below.
    tns = dendropy.TaxonNamespace()
    true_trees = []
    for idx in range(num_trees):
        path = os.path.join(
            data_dir,
            "q2_4_tree_mixture.pkl_tree_{}_newick.txt".format(idx))
        tree = _read_newick_tree(path, tns)
        print("\tTree {}: ".format(idx), tree.as_string("newick"))
        tree.print_plot()
        true_trees.append(tree)

    print("\n3.2 Compare trees and print Robinson-Foulds (RF) distance:\n")
    # combinations() yields (0, 1), (0, 2), (1, 2) - the original order.
    for left, right in combinations(true_trees, 2):
        print("\tRF distance: \t", rf_distance(left, right))

    print("\n4. Load Inferred Trees")
    # This is the result you have.
    topology_list = np.load(
        os.path.join(data_dir, "q2_4_own_results_em_topology.npy"))
    print(topology_list.shape)
    print(topology_list)

    inferred_trees = []
    for idx in range(num_trees):
        tree = _topology_to_dendropy_tree(topology_list[idx], tns)
        print("\tInferred Tree {}: ".format(idx), tree.as_string("newick"))
        tree.print_plot()
        inferred_trees.append(tree)

    print("\n4.2 Compare trees and print Robinson-Foulds (RF) distance:\n")
    for idx, reference in enumerate(true_trees):
        print("\tt{} vs inferred trees".format(idx))
        for inferred in inferred_trees:
            print("\tRF distance: \t", rf_distance(reference, inferred))

    print("\nInvestigate")
    treecompare = dendropy.calculate.treecompare
    t0 = true_trees[0]
    print("\tRF distance: \t", rf_distance(t0, inferred_trees[0]))
    # These two diagnostics are not RF distances, so each line is labelled
    # with what it actually shows (the original printed all of them as
    # "RF distance").
    for idx, inferred in enumerate(inferred_trees[:2]):
        print("\tMissing bipartitions (t0 vs rt{}): \t".format(idx),
              treecompare.find_missing_bipartitions(t0, inferred))
        print("\tFalse positives and negatives (t0 vs rt{}): \t".format(idx),
              treecompare.false_positives_and_negatives(t0, inferred))
def _load_newick_tree(path, taxon_namespace):
    """Read the Newick file at ``path`` and return it as a DendroPy tree.

    ``taxon_namespace`` is passed through to ``dendropy.Tree.get``; trees
    that are compared with each other must share the same namespace.
    """
    with open(path, 'r') as input_file:
        newick_str = input_file.read()
    return dendropy.Tree.get(data=newick_str,
                             schema="newick",
                             taxon_namespace=taxon_namespace)


def _load_inferred_tree(path, taxon_namespace):
    """Load a saved topology array from ``path`` as a DendroPy tree.

    The array is fed to the project's ``Tree`` helper and the helper's
    ``newick`` attribute is then parsed by DendroPy.
    """
    helper = Tree()
    helper.load_tree_from_direct_arrays(np.load(path))
    return dendropy.Tree.get(data=helper.newick,
                             schema="newick",
                             taxon_namespace=taxon_namespace)


def main():
    """Run EM on the 4-cluster sample file and compare with ground truth.

    Steps, all reported on stdout:

    1. load the samples from the tab-separated text file,
    2. run ``em_algorithm`` with a fixed seed,
    3. save the results (the full mixture and one file set per cluster),
       print them, and plot estimated vs. expected (log-)likelihood,
    4. run the DendroPy demo, load the true and inferred trees of every
       cluster and print their Robinson-Foulds (RF) distances.

    Differences from the earlier revision of this function:

    * the expected thetas are written to their own array instead of
      overwriting the ``theta_array`` returned by ``em_algorithm``,
    * the expected log-likelihood is evaluated once instead of once per
      EM iteration,
    * the RF tables cover all ``default_num_clusters`` trees; cluster 3
      used to be loaded but never compared.
    """
    from itertools import combinations

    rf_distance = dendropy.calculate.treecompare.symmetric_difference

    print("\n1. Load samples from txt file.\n")
    base_name = 'q_2_5_tm_10node_20sample_4clusters'
    default_sample_filename = base_name + '.pkl_samples.txt'
    default_output_filename = base_name + '_results'
    default_num_clusters = 4
    default_seed_val = 42
    default_real_values_filename = base_name + ".pkl_tree_0_topology.npy"

    samples = np.loadtxt(default_sample_filename,
                         delimiter="\t",
                         dtype=np.int32)
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")

    loglikelihood, topology_array, theta_array = em_algorithm(
        default_seed_val, samples, num_clusters=default_num_clusters)

    print("\n3. Save, print and plot the results.\n")
    print("top array 0 = ", topology_array[0, :])
    save_results(loglikelihood, topology_array, theta_array,
                 default_output_filename)
    # One additional result set per cluster; the inferred trees in step 4
    # are read back from these files.
    for k in range(default_num_clusters):
        save_results(loglikelihood, topology_array[k, :], theta_array[k, :],
                     "{}{}_results".format(base_name, k))

    for k in range(default_num_clusters):
        print("\n\tCluster: ", k)
        print("\tTopology: \t", topology_array[k])
        print("\tTheta: \t", theta_array[k])

    # Ground-truth parameters. The thetas go into a separate array with the
    # same shape and dtype as the estimate, so the EM result stays intact.
    pi_expected = np.load(base_name + ".pkl_pi.npy")
    theta_expected = np.empty_like(theta_array)
    topology_expected = np.zeros((default_num_clusters, num_nodes))
    for k in range(default_num_clusters):
        tree_prefix = "{}.pkl_tree_{}".format(base_name, k)
        theta_expected[k] = np.load(tree_prefix + "_theta.npy")
        topology_expected[k, :] = np.load(tree_prefix + "_topology.npy")

    # The arguments never change between iterations, so one evaluation is
    # enough (assumes log_likelihood is deterministic and free of side
    # effects - TODO confirm). The value is repeated to draw a flat
    # reference line next to the EM curve.
    expected_value = log_likelihood(default_num_clusters, samples,
                                    theta_expected, topology_expected,
                                    pi_expected)
    loglikelihood_expected = np.full(len(loglikelihood),
                                     expected_value,
                                     dtype=float)

    print("loglikelihood_expected = ", loglikelihood_expected[0])
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood_expected), label='Expected')
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood_expected, label='Expected')
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if default_real_values_filename != "":
        print("\tComparing the results with real values...")

        print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
        print("Hello World!")
        print(
            "This file demonstrates the usage of the DendroPy module and its functions."
        )

        print("\n1. Tree Generations\n")
        print("\n1.1. Create two random birth-death trees and print them:\n")

        tns = dendropy.TaxonNamespace()
        num_leaves = 5

        demo_tree_1 = dendropy.simulate.treesim.birth_death_tree(
            birth_rate=1.0,
            death_rate=0.5,
            num_extant_tips=num_leaves,
            taxon_namespace=tns)
        demo_tree_2 = dendropy.simulate.treesim.birth_death_tree(
            birth_rate=1.0,
            death_rate=0.2,
            num_extant_tips=num_leaves,
            taxon_namespace=tns)
        print("\tTree 1: ", demo_tree_1.as_string("newick"))
        demo_tree_1.print_plot()
        print("\tTree 2: ", demo_tree_2.as_string("newick"))
        demo_tree_2.print_plot()

        print("\n2. Compare Trees\n")
        print(
            "\n2.1. Compare tree with itself and print Robinson-Foulds (RF) distance:\n"
        )
        print("\tRF distance: \t", rf_distance(demo_tree_1, demo_tree_1))

        print(
            "\n2.2. Compare different trees and print Robinson-Foulds (RF) distance:\n"
        )
        print("\tRF distance: \t", rf_distance(demo_tree_1, demo_tree_2))

        print("\n3. Load Trees from Newick Files and Compare:\n")
        print("\n3.1 Load trees from Newick files:\n")

        # If you want to compare two trees, make sure you specify the same
        # Taxon Namespace!
        tns = dendropy.TaxonNamespace()

        true_trees = []
        for k in range(default_num_clusters):
            tree = _load_newick_tree(
                "{}.pkl_tree_{}_newick.txt".format(base_name, k), tns)
            print("\tTree {}: ".format(k), tree.as_string("newick"))
            tree.print_plot()
            true_trees.append(tree)

        print("\n3.2 Compare trees and print Robinson-Foulds (RF) distance:\n")
        # Every unordered pair of true trees, in the order
        # (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3).
        for left, right in combinations(true_trees, 2):
            print("\tRF distance: \t", rf_distance(left, right))

        inferred_trees = []
        for k in range(default_num_clusters):
            tree = _load_inferred_tree(
                "{}{}_results_em_topology.npy".format(base_name, k), tns)
            print("\tInferred Tree {}: ".format(k), tree.as_string("newick"))
            tree.print_plot()
            inferred_trees.append(tree)

        print("\n4.2 Compare trees and print Robinson-Foulds (RF) distance:\n")

        for k, reference in enumerate(true_trees):
            print("\tt{} vs inferred trees".format(k))
            for inferred in inferred_trees:
                print("\tRF distance: \t", rf_distance(reference, inferred))

        print("\t4.2. Make the likelihood comparison.\n")