def main():
    """Fit a tree mixture to the q2_4 samples with ``sieving`` and compare it to the reference mixture.

    The function reads the tab-separated integer samples from the hard-coded
    data directory, shuffles them in place, and calls ``sieving`` with
    50 first-stage / 10 second-stage mixtures and 10 / 100 iterations.
    The reference mixture is loaded from ``real_values_filename``.  For both
    mixtures it prints the value returned by ``mixture_likelihood`` and the
    ``pi`` attribute, then prints the ``RF_comparison`` result and every
    cluster's topology array, and finally plots ``best_tm.loglikelihood``.

    Takes no arguments and returns ``None``.
    """
    num_clusters = 3

    # NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
    directory = '/Users/filipbergentoft/Desktop/Github/DD2434/Assignment 2/2_4/'
    sample_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl_samples.txt"
    real_values_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl"

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    np.random.shuffle(samples)  # in-place shuffle along the first axis

    best_tm = sieving(n_first_mixtures=50,
                      n_second_mixtures=10,
                      n_first_iterations=10,
                      n_second_iterations=100,
                      samples=samples,
                      num_clusters=num_clusters)

    real_tm = TreeMixture(num_clusters=3, num_nodes=5)
    real_tm.load_mixture(real_values_filename)

    print('best tree', mixture_likelihood(best_tm, samples))
    print('best tree', best_tm.pi)
    print('real tree', mixture_likelihood(real_tm, samples))
    print('real tree', real_tm.pi)
    print(RF_comparison(best_tm, real_tm))

    # The "real" topologies must come from the loaded reference mixture that
    # the inferred mixture is compared against, not from a mixture simulated
    # independently of the samples.
    for tree in real_tm.clusters:
        print('Real tree topology')
        print(tree.get_topology_array())

    for tree in best_tm.clusters:
        print('Inferred tree topology')
        print(tree.get_topology_array())

    sns.set_style('darkgrid')
    plt.plot(best_tm.loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend()
    plt.show()
def main():
    """Run ``sieving`` on the case-3 samples, plot the likelihood curves and compare with the stored mixture.

    ``sieving`` is expected to return a ``(loglikelihood, mixture)`` pair (it
    is unpacked that way here).  After plotting, the stored mixture is loaded
    and a pairwise symmetric-difference table between its Newick trees and the
    inferred ones is printed, followed by the ``tm_likelihood`` value of the
    stored mixture and the inferred log-likelihood sequence.

    Takes no arguments and returns ``None``.
    """
    print("Hello World!")

    seed_val = 123
    # Alternative data sets that can be swapped in below:
    #   "q2_4/q2_4_tree_mixture.pkl_samples.txt" / "q2_4/q2_4_tree_mixture.pkl"
    #   "q2_4/case1.pkl_samples.txt"             / "q2_4/case1.pkl"
    #   "q2_4/case2.pkl_samples.txt"             / "q2_4/case2.pkl"
    sample_filename = "q2_4/case3.pkl_samples.txt"
    real_values_filename = "q2_4/case3.pkl"
    num_clusters = 2  # has to be changed for each case!

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    loglikelihood, my_tm = sieving(seed_val, samples, num_clusters=num_clusters)

    # Left panel: likelihood; right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    # Nothing to compare against when no reference file is configured.
    if real_values_filename == "":
        return

    real = TreeMixture(0, 0)
    real.load_mixture(real_values_filename)

    print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
    tns = dendropy.TaxonNamespace()
    real_trees = [cluster.newick for cluster in real.clusters]
    my_trees = [cluster.newick for cluster in my_tm.clusters]
    print(my_trees)

    for real_idx, real_newick in enumerate(real_trees):
        real_den = dendropy.Tree.get(data=real_newick,
                                     schema="newick",
                                     taxon_namespace=tns)
        for my_idx, my_newick in enumerate(my_trees):
            my_den = dendropy.Tree.get(data=my_newick,
                                       schema="newick",
                                       taxon_namespace=tns)
            distance = dendropy.calculate.treecompare.symmetric_difference(
                real_den, my_den)
            # Trailing "\\\\" emits a LaTeX row terminator.
            print("RF distance: $<", real_idx, my_idx, ">$\t=", distance, "\\\\")

    print("4.2. Make the likelihood comparison.\n")
    real_log_hood = tm_likelihood(real, samples, len(samples), num_clusters)
    print("Real: ", real_log_hood)
    print("Infered: ", loglikelihood)
def _str_to_bool(value):
    """Convert a command-line token such as ``"False"`` or ``"1"`` to a bool.

    ``argparse`` passes the raw string to ``type``; ``bool("False")`` is
    ``True`` because the string is non-empty, so the text is parsed here
    instead.

    Args:
        value: the raw token (or an actual ``bool``, returned unchanged).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


def _sample_likelihood_under_tree(tree, sample):
    """Return the product of per-node factors of ``sample`` under ``tree``.

    Nodes are visited breadth-first from ``tree.root``.  The root contributes
    ``cat[value]``; every other node contributes
    ``cat[parent_value][value]``, where the values are read from ``sample``
    at the positions given by ``int(node.name)``.
    """
    from collections import deque  # local import keeps the block self-contained

    likelihood = 1.0
    pending = deque([tree.root])
    while pending:
        node = pending.popleft()
        pending.extend(node.descendants)
        value = sample[int(node.name)]
        if node.ancestor is None:
            likelihood *= node.cat[value]
        else:
            likelihood *= node.cat[sample[int(node.ancestor.name)]][value]
    return likelihood


def main():
    """Simulate (or load) a ground-truth tree mixture, run EM on its samples and compare.

    Steps, as printed to stdout:
      0. parse the command-line arguments;
      1. build the ground-truth mixture, either simulated (``--if_simulate``
         true) or loaded from ``--real_values_filename``;
      2. run ``em_algorithm`` on ``tm_truth.samples``;
      3. print each cluster's topology/theta and plot the likelihood curves;
      4. print a LaTeX-style table of symmetric-difference distances between
         every inferred and true tree, then the final inferred log-likelihood
         next to the log-likelihood of the samples under the true mixture.

    ``--sample_filename`` and ``--output_filename`` are accepted but not read
    by this function (samples come from the mixture object, saving is
    disabled).

    Takes no arguments and returns ``None``.
    """
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        '--sample_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt',
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl',
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    parser.add_argument(
        '--output_filename',
        type=str,
        default='q_2_5_tm_20node_20sample_4clusters_result.txt',
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('--num_nodes',
                        type=int,
                        default=10,
                        help='Specify the number of nodes of trees (i.e 10)')
    parser.add_argument('--num_clusters',
                        type=int,
                        default=10,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=123,
        help='Specify the seed value for reproducibility (i.e 42)')
    # type=bool would treat "--if_simulate False" as True; parse the text.
    parser.add_argument(
        '--if_simulate',
        type=_str_to_bool,
        default=True,
        help='Specify whether the sampling is enabled (i.e False)')
    parser.add_argument(
        '--num_samples',
        type=int,
        default=50,
        help='Specify the number of samples if sampling is enabled (i.e 1000)')
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")
    args = parser.parse_args()
    print("\tArguments are: ", args)

    if args.if_simulate:
        print("\n1. Make new tree and sample.\n")
        tm_truth = TreeMixture(num_clusters=args.num_clusters,
                               num_nodes=args.num_nodes)
        tm_truth.simulate_pi(seed_val=args.seed_val)
        tm_truth.simulate_trees(seed_val=args.seed_val)
        tm_truth.sample_mixtures(args.num_samples, seed_val=args.seed_val)
    else:
        print("\n1. Load true tree from file.\n")
        tm_truth = TreeMixture(0, 0)
        tm_truth.load_mixture(args.real_values_filename)

    print("Load samples.")
    samples = tm_truth.samples
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array, tm = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    # Saving is intentionally disabled:
    # save_results(loglikelihood, topology_array, theta_array, args.output_filename)
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])

    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "":
        print(
            "\n=> Compare trees and print Robinson-Foulds (RF) distance (result v.s truth):\n"
        )
        N = len(samples)
        K = tm_truth.num_clusters
        tns = dendropy.TaxonNamespace()

        # One table row per inferred cluster, one column per true cluster.
        print("\\hline")
        for k in range(K):
            print(k, end=" & ")
            for j in range(K):
                t_0 = tm.clusters[k]
                t_0.get_tree_newick()
                t_0 = dendropy.Tree.get(data=t_0.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                t_t = tm_truth.clusters[j]
                t_t.get_tree_newick()
                t_t = dendropy.Tree.get(data=t_t.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                print(dendropy.calculate.treecompare.symmetric_difference(
                    t_0, t_t),
                      end=" & ")
            print("\\\\")

        print("\n=> Compare log-likelihood (result v.s truth):\n")
        # posterior[n, k]: factor product of sample n under true tree k.
        posterior = np.ones((N, K))
        # prior[n]: pi-weighted sum of posterior[n] over the true clusters.
        prior = np.ones(N)
        for n, x in enumerate(samples):
            for k, tree in enumerate(tm_truth.clusters):
                posterior[n, k] = _sample_likelihood_under_tree(tree, x)
            prior[n] = np.sum(posterior[n] * tm_truth.pi)

        loglikelihood_truth = np.sum(np.log(prior))
        print("%f : %f" % (loglikelihood[-1], loglikelihood_truth))
def main():
    """Command-line driver: load samples, run EM, save/plot results and optionally compare with the truth.

    Positional arguments are the sample file, the output file and the number
    of clusters; ``--seed_val`` and ``--real_values_filename`` are optional.
    When a real-values file is given, both that file and the freshly written
    output file are loaded into ``TreeMixture`` objects, and the result of
    ``compute_tree_mix_diff`` plus each mixture's ``likelihood_dataset`` on
    the samples are printed.

    Takes no arguments and returns ``None``.
    """
    # Command-line interface definition.
    cli = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    cli.add_argument(
        'sample_filename',
        type=str,
        help='Specify the name of the sample file (i.e data/example_samples.txt)')
    cli.add_argument(
        'output_filename',
        type=str,
        help='Specify the name of the output file (i.e data/example_results.txt)')
    cli.add_argument(
        'num_clusters',
        type=int,
        help='Specify the number of clusters (i.e 3)')
    cli.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)')
    cli.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help='Specify the name of the real values file (i.e data/example_tree_mixture.pkl)')
    # More default parameters can be added here if needed.

    print("Hello World!")
    print("This file demonstrates the flow of function templates of question 2.5.")

    print("\n0. Load the parameters from command line.\n")
    args = cli.parse_args()
    print("\tArguments are: ", args)

    print("\n1. Load samples from txt file.\n")
    samples = np.loadtxt(args.sample_filename, delimiter="\t", dtype=np.int32)
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    save_results(loglikelihood, topology_array, theta_array,
                 args.output_filename)

    for cluster_idx in range(args.num_clusters):
        print("\n\tCluster: ", cluster_idx)
        print("\tTopology: \t", topology_array[cluster_idx])
        print("\tTheta: \t", theta_array[cluster_idx])

    # Left panel: likelihood; right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    # Without a reference file there is nothing further to do.
    if args.real_values_filename == "":
        return

    print("\tComparing the results with real values...")
    actual_tm = TreeMixture(args.num_clusters, num_nodes)
    actual_tm.load_mixture(args.real_values_filename)
    # The inferred mixture is read back from the file written in step 3.
    inferred_tm = TreeMixture(args.num_clusters, num_nodes)
    inferred_tm.load_mixture(args.output_filename)

    print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
    diff = compute_tree_mix_diff(actual_tm, inferred_tm)
    print(f"Total Robinson-Foulds distance: {diff!s}")

    print("\t4.2. Make the likelihood comparison.\n")
    actual_lik = actual_tm.likelihood_dataset(samples)
    inferred_lik = inferred_tm.likelihood_dataset(samples)
    print(f"Log-Likelihood of actual tree: {actual_lik!s}, "
          f"inferred tree: {inferred_lik!s}")