예제 #1
0
def main():
    """Fit a tree mixture by sieving and compare it with the stored true one.

    Loads the tab-separated sample file, shuffles the samples, and takes the
    mixture returned by ``sieving`` as the fitted model. It then prints the
    ``mixture_likelihood`` and ``pi`` of the fitted and the true mixture, the
    result of ``RF_comparison``, the topology array of every tree in both
    mixtures, and finally plots the fitted mixture's log-likelihood trace.
    """
    num_clusters = 3

    # NOTE(review): machine-specific absolute path; adjust before running
    # on another machine.
    directory = '/Users/filipbergentoft/Desktop/Github/DD2434/Assignment 2/2_4/'
    sample_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl_samples.txt"
    real_values_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl"

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    np.random.shuffle(samples)

    best_tm = sieving(n_first_mixtures=50,
                      n_second_mixtures=10,
                      n_first_iterations=10,
                      n_second_iterations=100,
                      samples=samples,
                      num_clusters=num_clusters)

    # The true mixture is read from file; the constructor arguments only
    # create the object that load_mixture() then fills in.
    real_tm = TreeMixture(num_clusters=num_clusters, num_nodes=5)
    real_tm.load_mixture(real_values_filename)

    print('best tree', mixture_likelihood(best_tm, samples))
    print('best tree', best_tm.pi)
    print('real tree', mixture_likelihood(real_tm, samples))
    print('real tree', real_tm.pi)

    print(RF_comparison(best_tm, real_tm))

    # Print the topologies of the mixture loaded from file. The earlier code
    # printed an unrelated, randomly simulated mixture under this label.
    for tree in real_tm.clusters:
        print('Real tree topology')
        print(tree.get_topology_array())

    for tree in best_tm.clusters:
        print('Inferred tree topology')
        print(tree.get_topology_array())

    sns.set_style('darkgrid')
    plt.plot(best_tm.loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend()
    plt.show()
예제 #2
0
파일: 2.4.py 프로젝트: NikoPalic/KTH_MLADV
def main():
    """Run sieving on one sample file, plot the result and compare to truth.

    Loads the tab-separated samples, calls ``sieving`` to obtain a
    log-likelihood trace and a fitted mixture, and plots the trace on both a
    linear and a log scale. When a real-values file is configured, it prints
    the Robinson-Foulds distance for every (real tree, inferred tree) pair
    and the value returned by ``tm_likelihood`` for the real mixture.
    """
    print("Hello World!")

    seed_val = 123

    # Other data sets that follow the same "<name>.pkl" /
    # "<name>.pkl_samples.txt" naming: q2_4/q2_4_tree_mixture, q2_4/case1,
    # q2_4/case2.
    sample_filename = "q2_4/case3.pkl_samples.txt"
    real_values_filename = "q2_4/case3.pkl"

    # Must be changed by hand to match the selected data set.
    num_clusters = 2

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)

    loglikelihood, my_tm = sieving(seed_val, samples, num_clusters=num_clusters)

    # Left panel: likelihood on a linear scale; right panel: log-likelihood.
    plt.figure(figsize=(8, 3))

    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")

    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))

    plt.show()

    if real_values_filename != "":
        real = TreeMixture(0, 0)
        real.load_mixture(real_values_filename)

        print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
        tns = dendropy.TaxonNamespace()

        real_trees = [cluster.newick for cluster in real.clusters]
        my_trees = [cluster.newick for cluster in my_tm.clusters]
        print(my_trees)

        # One output row per (real tree index, inferred tree index) pair.
        for real_idx, real_newick in enumerate(real_trees):
            real_den = dendropy.Tree.get(data=real_newick,
                                         schema="newick",
                                         taxon_namespace=tns)
            for my_idx, my_newick in enumerate(my_trees):
                my_den = dendropy.Tree.get(data=my_newick,
                                           schema="newick",
                                           taxon_namespace=tns)
                rf_distance = (
                    dendropy.calculate.treecompare.symmetric_difference(
                        real_den, my_den))
                print("RF distance: $<", real_idx, my_idx, ">$\t=",
                      rf_distance, "\\\\")

        print("4.2. Make the likelihood comparison.\n")
        real_log_hood = tm_likelihood(real, samples, len(samples),
                                      num_clusters)
        print("Real: ", real_log_hood)
        print("Infered: ", loglikelihood)
예제 #3
0
def _str_to_bool(value):
    """Convert a command-line string such as ``"False"`` to a real boolean.

    With ``type=bool`` argparse calls ``bool()`` on the raw string, so every
    non-empty value (including ``"False"``) becomes ``True``. This converter
    interprets the usual spellings instead.

    Args:
        value: The raw argument text, or an actual ``bool``.

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, exactly as it was under ``type=bool``.
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (e.g. True or False), got %r" % (value,))


def main():
    """Run the EM algorithm on a tree mixture and compare it with the truth.

    Steps, as driven by the command-line arguments:

    0. Parse the arguments.
    1. Either simulate a true mixture and samples (``--if_simulate``) or load
       the true mixture from ``--real_values_filename``.
    2. Run ``em_algorithm`` on the true mixture's samples.
    3. Print each cluster's topology and theta and plot the likelihood trace.
    4. If a real-values filename is set, print a table of Robinson-Foulds
       distances (inferred tree k vs. true tree j) and the final inferred
       log-likelihood next to the log-likelihood of the true mixture.
    """
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        '--sample_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt',
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl',
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    parser.add_argument(
        '--output_filename',
        type=str,
        default='q_2_5_tm_20node_20sample_4clusters_result.txt',
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('--num_nodes',
                        type=int,
                        default=10,
                        help='Specify the number of nodes of trees (i.e 10)')
    parser.add_argument('--num_clusters',
                        type=int,
                        default=10,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=123,
        help='Specify the seed value for reproducibility (i.e 42)')
    # type=bool would turn "--if_simulate False" into True; parse the text.
    parser.add_argument(
        '--if_simulate',
        type=_str_to_bool,
        default=True,
        help='Specify whether the sampling is enabled (i.e False)')
    parser.add_argument(
        '--num_samples',
        type=int,
        default=50,
        help='Specify the number of samples if sampling is enabled (i.e 1000)')
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")

    args = parser.parse_args()
    print("\tArguments are: ", args)

    if args.if_simulate:
        print("\n1. Make new tree and sample.\n")
        tm_truth = TreeMixture(num_clusters=args.num_clusters,
                               num_nodes=args.num_nodes)
        tm_truth.simulate_pi(seed_val=args.seed_val)
        tm_truth.simulate_trees(seed_val=args.seed_val)
        tm_truth.sample_mixtures(args.num_samples, seed_val=args.seed_val)
    else:
        print("\n1. Load true tree from file.\n")
        tm_truth = TreeMixture(0, 0)
        tm_truth.load_mixture(args.real_values_filename)
    print("Load samples.")
    samples = tm_truth.samples
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array, tm = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])

    # Left panel: likelihood on a linear scale; right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "":
        print(
            "\n=> Compare trees and print Robinson-Foulds (RF) distance (result v.s truth):\n"
        )
        N = len(samples)
        K = tm_truth.num_clusters
        tns = dendropy.TaxonNamespace()
        # Rows are emitted as "k & d_0 & d_1 & ... & \\" table lines.
        print("\\hline")
        for k in range(K):
            print(k, end=" & ")
            # The inferred tree is the same for the whole row: parse it once.
            t_0 = tm.clusters[k]
            t_0.get_tree_newick()
            t_0 = dendropy.Tree.get(data=t_0.newick,
                                    schema="newick",
                                    taxon_namespace=tns)
            for j in range(K):
                t_t = tm_truth.clusters[j]
                t_t.get_tree_newick()
                t_t = dendropy.Tree.get(data=t_t.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                print(dendropy.calculate.treecompare.symmetric_difference(
                    t_0, t_t),
                      end=" & ")
            print("\\\\")

        print("\n=> Compare log-likelihood (result v.s truth):\n")
        # posterior[n, k] accumulates the product of the per-node factors of
        # sample n under true tree k; prior[n] is the pi-weighted sum over k.
        posterior = np.ones((N, K))
        prior = np.ones(N)
        for n, x in enumerate(samples):
            for k, tree in enumerate(tm_truth.clusters):
                # Breadth-first walk over the tree starting at its root.
                visit_list = [tree.root]
                while len(visit_list) != 0:
                    cur_node = visit_list[0]
                    visit_list = visit_list[1:]
                    visit_list = visit_list + cur_node.descendants
                    if cur_node.ancestor is None:
                        # Root: cat is indexed by the node's own value.
                        posterior[n, k] *= cur_node.cat[x[int(cur_node.name)]]
                    else:
                        # Non-root: cat is indexed by [parent value][own value].
                        posterior[n, k] *= cur_node.cat[x[int(
                            cur_node.ancestor.name)]][x[int(cur_node.name)]]
            prior[n] *= np.sum(posterior[n] * tm_truth.pi)
        loglikelihood_truth = np.sum(np.log(prior))
        print("%f : %f" % (loglikelihood[-1], loglikelihood_truth))
예제 #4
0
def main():
    """Command-line driver: run EM on a sample file and report the results.

    Positional arguments are the sample file, the output file and the number
    of clusters; ``--seed_val`` and ``--real_values_filename`` are optional.
    The samples are loaded, ``em_algorithm`` is run, the results are saved via
    ``save_results``, printed per cluster and plotted. When a real-values
    filename is supplied, the true mixture and the mixture loaded from the
    output filename are compared by ``compute_tree_mix_diff`` and by their
    ``likelihood_dataset`` on the samples.
    """
    # Code to process command line arguments
    cli = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    cli.add_argument(
        'sample_filename',
        type=str,
        help='Specify the name of the sample file (i.e data/example_samples.txt)',
    )
    cli.add_argument(
        'output_filename',
        type=str,
        help='Specify the name of the output file (i.e data/example_results.txt)',
    )
    cli.add_argument(
        'num_clusters',
        type=int,
        help='Specify the number of clusters (i.e 3)',
    )
    cli.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)',
    )
    cli.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help='Specify the name of the real values file (i.e data/example_tree_mixture.pkl)',
    )
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")
    args = cli.parse_args()
    print("\tArguments are: ", args)

    print("\n1. Load samples from txt file.\n")
    samples = np.loadtxt(args.sample_filename, delimiter="\t", dtype=np.int32)
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    save_results(loglikelihood, topology_array, theta_array,
                 args.output_filename)

    for cluster_idx in range(args.num_clusters):
        print("\n\tCluster: ", cluster_idx)
        print("\tTopology: \t", topology_array[cluster_idx])
        print("\tTheta: \t", theta_array[cluster_idx])

    # Left panel: likelihood on a linear scale; right panel: log-likelihood.
    plt.figure(figsize=(8, 3))

    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")

    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))

    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    # Without a real-values file there is nothing to compare against.
    if args.real_values_filename == "":
        return

    print("\tComparing the results with real values...")
    actual_tm = TreeMixture(args.num_clusters, num_nodes)
    actual_tm.load_mixture(args.real_values_filename)

    inferred_tm = TreeMixture(args.num_clusters, num_nodes)
    inferred_tm.load_mixture(args.output_filename)

    print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
    diff = compute_tree_mix_diff(actual_tm, inferred_tm)
    print(f"Total Robinson-Foulds distance: {diff!s}")

    print("\t4.2. Make the likelihood comparison.\n")
    actual_lik = actual_tm.likelihood_dataset(samples)
    inferred_lik = inferred_tm.likelihood_dataset(samples)
    print(f"Log-Likelihood of actual tree: {actual_lik!s}"
          f", inferred tree: {inferred_lik!s}")