def do_experiment(num_clusters, num_nodes):
    """Simulate a tree mixture, fit it with EM, and plot the learning curve.

    Uses the module-level global `seed_val` for all random simulation.

    Parameters:
    num_clusters -- number of mixture components to simulate and fit
    num_nodes -- number of nodes per tree
    """
    # Ground-truth mixture: simulate pi, trees, and 2000 samples from it.
    true_mixture = TreeMixture(num_clusters, num_nodes)
    true_mixture.simulate_pi(seed_val)
    true_mixture.simulate_trees(seed_val)
    true_mixture.sample_mixtures(2000, seed_val=seed_val)
    data = true_mixture.samples

    # Fit a fresh mixture to the simulated samples with EM.
    fitter = EM_Algorithm(data, num_clusters, seed_val=seed_val)
    fitter.initialize(1, 2)
    loglikelihoods, topology_list, theta_list = fitter.optimize(max_num_iter=100)

    # Compare dataset likelihood under the true vs. the learned mixture.
    real_likelyhood = true_mixture.likelihood_dataset(data)
    learned_likelyhood = fitter.tree_mixture.likelihood_dataset(data)
    print("Real likelyhood: " + str(real_likelyhood) + ", learned " +
          str(learned_likelyhood))

    # Plot the EM learning curve.
    plt.title('Actual likelihood: ' + str(real_likelyhood) +
              ', inferred likelyhood: ' + str(learned_likelyhood))
    plt.plot(range(len(loglikelihoods)), loglikelihoods,
             label=str(num_clusters) + ' clusters, ' + str(num_nodes) + ' nodes')
    plt.legend()
    plt.show()
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """Placeholder EM driver (example template — not a real EM implementation).

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array.
        Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration.
        Type: numpy array. Dimensions: (num_iterations, )
        Note: num_iterations does not have to be equal to max_num_iter.
    :return: topology_list: A list of tree topologies. Type: numpy array.
        Dimensions: (num_clusters, num_nodes)
    :return: theta_list: A list of tree CPDs. Type: numpy array.
        Dimensions: (num_clusters, num_nodes, 2)

    The "log-likelihoods" returned here are synthetic placeholder values;
    the topologies/CPDs come from a freshly simulated random mixture.
    """
    # Set the seed
    np.random.seed(seed_val)

    # TODO: Implement EM algorithm here.

    # Start: Example Code Segment. Delete this segment completely before you
    # implement the algorithm.
    print("Running EM algorithm...")

    # Fake monotonically increasing "log-likelihood" curve.
    loglikelihood = [np.log((1 + step) / max_num_iter)
                     for step in range(max_num_iter)]

    from Tree import TreeMixture
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seed_val)

    # Export the simulated mixture's structure as the "result".
    topology_list = [tm.clusters[c].get_topology_array()
                     for c in range(num_clusters)]
    theta_list = [tm.clusters[c].get_theta_array()
                  for c in range(num_clusters)]

    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    # End: Example Code Segment

    return loglikelihood, topology_list, theta_list
def initialize(self, sieving_tries=100, sieving_train=10):
    """Initializes the TreeMixtures using sieving.

    Runs `sieving_tries` short EM trainings from random initializations and
    keeps the mixture whose final likelihood is highest.

    Parameters:
    sieving_tries -- Number of random initializations
    sieving_train -- Number of iterations for each initialization
    """
    print("Initializing EM ...")
    winner = None
    winner_score = -float('inf')
    for _ in tqdm(range(sieving_tries)):
        # Fresh random candidate mixture.
        candidate = TreeMixture(self.num_clusters, self.num_nodes)
        candidate.simulate_trees(self.seed_val)
        candidate.simulate_pi(self.seed_val)
        # Short training run; keep the best-scoring candidate.
        trained, likelihoods = self.__iter_optimize(sieving_train,
                                                    tree_mix=candidate,
                                                    display_progress=False)
        if likelihoods[-1] > winner_score:
            winner_score = likelihoods[-1]
            winner = trained
    self.tree_mixture = winner
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=10, tm=None):
    """Run EM for a mixture of trees (Chow-Liu style structure learning).

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array.
        Dimensions: (num_samples, num_nodes); entries are 0/1.
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :param tm: Optional pre-initialized TreeMixture; if None, a random
        mixture is simulated from seed_val.
    :return: (loglikelihood_list, tm) where loglikelihood_list is a numpy
        array of the dataset log-likelihood after each iteration and tm is
        the fitted TreeMixture.
    """
    # Set the seed
    np.random.seed(seed_val)

    N = len(samples)          # number of samples
    K = num_clusters          # number of mixture components
    V = samples.shape[1]      # number of nodes (vertices) per tree

    # Fix: identity comparison with None (was `tm == None`).
    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
        tm.simulate_pi(seed_val=seed_val)
        tm.simulate_trees(seed_val=seed_val)

    log_hoods = []
    for iteration in range(max_num_iter):
        # STEP 1: responsibilities r_nk ∝ pi_k * p(x_n | tree_k)
        R = np.zeros(shape=(N, K))
        for n in range(N):
            for k in range(K):
                nth_sample = samples[n]
                kth_tree = tm.clusters[k]
                hood = tree_sample_likelihood(kth_tree, nth_sample)
                R[n, k] = tm.pi[k] * hood
        R = normalize(R, axis=1, norm='l1')

        # STEP 2: update mixing weights pi_k = (1/N) * sum_n r_nk
        new_pi = np.zeros(shape=(K))
        for k in range(K):
            suma = 0
            for n in range(N):
                suma += R[n, k]
            new_pi[k] = suma / N
        tm.pi = new_pi

        for k in range(K):
            # STEP 3: responsibility-weighted pairwise counts and marginals.
            Qstab = np.zeros(shape=(V, V, 2, 2))  # Xs x Xt x (0 or 1) x (0 or 1)
            Nstab = np.zeros(shape=(V, V, 2, 2))  # Xs x Xt x (0 or 1) x (0 or 1)

            # Two-vertex weighted counts.
            for Xs in range(V):  # foreach vertex pair
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    for n in range(N):
                        a = samples[n][Xs]
                        b = samples[n][Xt]
                        r_nk = R[n, k]
                        Nstab[Xs, Xt, a, b] += r_nk

            # Normalize to joint distributions q(Xs=a, Xt=b).
            for Xs in range(V):  # foreach vertex pair
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    denom = sum(R[:, k])
                    for a in range(2):  # for each observation (0 or 1)
                        for b in range(2):
                            num = Nstab[Xs, Xt, a, b]
                            Qstab[Xs, Xt, a, b] = num / denom

            # One-vertex weighted counts and marginals q(Xs=a).
            Qsa = np.zeros(shape=(V, 2))
            Nsa = np.zeros(shape=(V, 2))
            for Xs in range(V):  # foreach vertex
                for n in range(N):
                    a = samples[n][Xs]
                    r_nk = R[n, k]
                    Nsa[Xs, a] += r_nk
            for Xs in range(V):
                for a in range(2):
                    num = Nsa[Xs, a]
                    denom = sum(Nsa[Xs, :])
                    Qsa[Xs, a] = num / denom

            # Mutual information between each vertex pair.
            Info = np.zeros(shape=(V, V))  # information between vertices
            for Xs in range(V):  # foreach vertex pair
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    for a in range(2):
                        for b in range(2):
                            qab = Qstab[Xs, Xt, a, b]
                            qa = Qsa[Xs, a]
                            qb = Qsa[Xt, b]
                            # Fix: guard on qab > 0 before dividing/taking log.
                            # The old check `qab / (qa * qb) != 0` divided
                            # first and crashed when a marginal was zero.
                            # qab > 0 implies qa > 0 and qb > 0, since a
                            # marginal weighted count dominates the joint.
                            if qab > 0:
                                Info[Xs, Xt] += qab * log(qab / (qa * qb))

            # Conditional distributions p(Xt=b | Xs=a) (for step 5).
            Qcond_stab = np.zeros(shape=(V, V, 2, 2))
            for Xs in range(V):  # foreach vertex pair
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    for a in range(2):
                        for b in range(2):
                            num = Nstab[Xs, Xt, a, b]
                            denom = sum(Nstab[Xs, Xt, a, :])
                            Qcond_stab[Xs, Xt, a, b] = num / denom  # p(Xt=b|Xs=a)

        # (loop over k continues — structure update below)
            # STEP 4: maximum spanning tree on the mutual-information graph.
            g = Graph(V)
            for Xs in range(V):  # foreach vertex pair
                for Xt in range(V):
                    if Xs == Xt:
                        continue
                    g.addEdge(Xs, Xt, Info[Xs, Xt])
            mst = g.maximum_spanning_tree()  # this is an array of edges
            mst = sorted(mst, key=lambda x: x[0])

            # STEP 5: orient the MST from the root and read off CPDs.
            topology_array = np.array([np.nan for i in range(V)])
            theta_array = np.array([None for i in range(V)])  # placeholder

            root = 0
            theta_array[0] = Qsa[root, :]

            # Adjacency list of the undirected MST.
            MST = {}
            for u, v, w in mst:
                if u not in MST:
                    MST[u] = []
                MST[u].append(v)
                if v not in MST:
                    MST[v] = []
                MST[v].append(u)

            VISITED = []

            def dfs(curr, prior):
                # Depth-first orientation: parent pointer + conditional CPD.
                VISITED.append(curr)
                if prior != -1:
                    cat = Qcond_stab[prior, curr]
                    theta_array[curr] = cat
                    topology_array[curr] = prior
                for child in MST[curr]:
                    if child in VISITED:
                        continue
                    dfs(child, curr)

            dfs(root, -1)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array, theta_array)
            tm.clusters[k] = new_tree

        # STEP 6: dataset log-likelihood under the updated mixture.
        log_hood = tm_likelihood(tm, samples, N, num_clusters)
        log_hoods.append(log_hood)

    loglikelihood_list = np.array(log_hoods)
    return loglikelihood_list, tm
def case3():
    """Build and persist test case 3: a 2-cluster, 5-node mixture (30 samples)."""
    mixture = TreeMixture(2, 5)
    mixture.simulate_pi()
    mixture.simulate_trees(seed_val=123)
    mixture.sample_mixtures(30, seed_val=123)
    mixture.save_mixture("q2_4/case3.pkl")
def case1():
    """Build and persist test case 1: a 7-cluster, 4-node mixture (100 samples)."""
    mixture = TreeMixture(7, 4)
    mixture.simulate_pi()
    mixture.simulate_trees(seed_val=123)
    mixture.sample_mixtures(100, seed_val=123)
    mixture.save_mixture("q2_4/case1.pkl")
def main():
    """Driver for q2.4: run sieved EM on a saved case, plot the learning
    curve, and compare against the real mixture (RF distance + likelihood)."""
    print("Hello World!")
    seed_val = 123

    # Pick the case to analyse (uncomment the pair you want).
    #sample_filename = "q2_4/q2_4_tree_mixture.pkl_samples.txt"
    #real_values_filename = "q2_4/q2_4_tree_mixture.pkl"
    #sample_filename = "q2_4/case1.pkl_samples.txt"
    #real_values_filename = "q2_4/case1.pkl"
    #sample_filename = "q2_4/case2.pkl_samples.txt"
    #real_values_filename = "q2_4/case2.pkl"
    sample_filename = "q2_4/case3.pkl_samples.txt"
    real_values_filename = "q2_4/case3.pkl"
    num_clusters = 2  # must match the chosen case

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    loglikelihood, my_tm = sieving(seed_val, samples, num_clusters=num_clusters)

    # Plot likelihood and log-likelihood side by side.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    if real_values_filename != "":
        real = TreeMixture(0, 0)
        real.load_mixture(real_values_filename)

        print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
        tns = dendropy.TaxonNamespace()
        real_trees = [cluster.newick for cluster in real.clusters]
        my_trees = [cluster.newick for cluster in my_tm.clusters]
        print(my_trees)
        # Pairwise RF distance between every real tree and every inferred tree.
        for i, real_tree in enumerate(real_trees):
            real_den = dendropy.Tree.get(data=real_tree,
                                         schema="newick",
                                         taxon_namespace=tns)
            for j, my_tree in enumerate(my_trees):
                my_den = dendropy.Tree.get(data=my_tree,
                                           schema="newick",
                                           taxon_namespace=tns)
                print(
                    "RF distance: $<", i, j, ">$\t=",
                    dendropy.calculate.treecompare.symmetric_difference(
                        real_den, my_den), "\\\\")

        print("4.2. Make the likelihood comparison.\n")
        real_log_hood = tm_likelihood(real, samples, len(samples), num_clusters)
        print("Real: ", real_log_hood)
        print("Infered: ", loglikelihood)
def _em_single_run(seedno, samples, num_clusters, max_num_iter):
    """One full EM run from a random initialization with seed `seedno`.

    Returns (loglikelihood, topology_list, theta_list) where loglikelihood
    has one entry per iteration and topology/theta describe the final trees.
    Helper extracted to remove the verbatim duplication between the sieving
    pass and the final pass of em_algorithm.
    """
    np.random.seed(seedno)
    num_nodes = samples.shape[1]

    # Random initial mixture.
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seedno)
    tm.simulate_trees(seedno)
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seedno)

    topology_list = np.array(
        [tm.clusters[k].get_topology_array() for k in range(num_clusters)])
    theta_list = np.array(
        [tm.clusters[k].get_theta_array() for k in range(num_clusters)])

    loglikelihood = np.zeros(max_num_iter)
    pi = tm.pi
    for it in range(max_num_iter):
        # 1: compute responsibilities
        resp = responsibilities(num_clusters, samples, theta_list,
                                topology_list, pi)

        # 2: set pi' = sum(r[n,k]) / N
        pi = np.zeros(num_clusters)
        pi_newdenom = np.sum(resp)
        for k in range(num_clusters):
            pi[k] = np.sum(resp[:, k]) / pi_newdenom

        # 3: calculate mutual information between x[s] and x[t]
        N_ind1, q_denom1 = q_parts1(num_clusters, samples, resp)
        N_ind0, q_denom0 = q_parts0(num_clusters, samples, resp)

        # 4: set Tau'[k] as maximum spanning tree in G[k].
        # Fix: containers sized by num_clusters (were hard-coded to 4 slots,
        # which broke for num_clusters > 4).
        trees = [Graph(num_nodes) for _ in range(num_clusters)]
        weights = np.zeros((num_clusters, num_nodes, num_nodes))
        MST = [None] * num_clusters
        for k in range(num_clusters):
            for s in range(num_nodes):
                for t in range(num_nodes):
                    weights[k, s, t] = I_Info(k, s, t, N_ind1, N_ind0,
                                              q_denom0, q_denom1,
                                              num_clusters, samples)
                    trees[k].addEdge(s, t, weights[k, s, t])
            MST[k] = trees[k].maximum_spanning_tree()

        # PACKAGE NETWORKX USED TO CONVERT MAXIMUM SPANNING TREE TO TOPOLOGY
        tree_graphs = [nx.Graph() for _ in range(num_clusters)]
        treearray = [None] * num_clusters
        for k in range(num_clusters):
            for u_of_edge, v_of_edge, weight in MST[k]:
                tree_graphs[k].add_edge(u_of_edge=u_of_edge,
                                        v_of_edge=v_of_edge)
            # BFS from node 0 yields (parent, child) edges of the rooted tree.
            treearray[k] = list(nx.bfs_edges(G=tree_graphs[k], source=0))

        tau_new = topology_list
        for k in range(num_clusters):
            for s in range(0, len(treearray[k])):
                parent = treearray[k][s][0]
                child = treearray[k][s][1]
                tau_new[k][child] = parent

        # 5: set Theta'[k](X[r]) — root marginal plus conditionals per edge.
        theta_new = theta_list
        for k in range(num_clusters):
            theta_new[k][0][:] = [
                q0(k, 0, 0, N_ind0, q_denom0),
                q0(k, 0, 1, N_ind0, q_denom0)
            ]
            for s in range(1, num_nodes):
                for a in range(0, 2):
                    for b in range(0, 2):
                        theta_new[k][s][a][b] = q_parts1cond(
                            s, int(tau_new[k][s]), a, b, samples, resp[:, k])

        # 6: calculate log-likelihood
        theta_list = theta_new
        topology_list = tau_new
        loglikelihood[it] = log_likelihood(num_clusters, samples, theta_list,
                                           topology_list, pi)

    return loglikelihood, topology_list, theta_list


def em_algorithm(seed_val, samples, num_clusters, max_num_iter=30):
    """EM for a tree mixture with sieving over 100 random seeds.

    Runs a full EM training from each of the seeds 0..99, keeps the seed
    with the best final log-likelihood, then reruns EM from that seed and
    returns its results.

    :param seed_val: unused (kept for interface compatibility).
    :param samples: numpy array, shape (num_samples, num_nodes).
    :param num_clusters: number of mixture components.
    :param max_num_iter: EM iterations per run.
    :return: (loglikelihood, topology_list, theta_list)
    """
    best_likelihood = -100000000.0
    best_seed = 0
    for seedno in range(100):
        loglikelihood, _, _ = _em_single_run(seedno, samples, num_clusters,
                                             max_num_iter)
        # Fix: compare the final iteration's log-likelihood (was hard-coded
        # index 25, which is wrong/out-of-range unless max_num_iter == 30).
        if best_likelihood < loglikelihood[-1]:
            print(best_likelihood, "<", loglikelihood[-1])
            best_likelihood = loglikelihood[-1]
            best_seed = seedno

    print("seed val = ", best_seed)

    # Repeat the algorithm from the best seed found during sieving.
    loglikelihood, topology_list, theta_list = _em_single_run(
        best_seed, samples, num_clusters, max_num_iter)

    print("topology_list = ", topology_list)
    print(loglikelihood)
    return loglikelihood, np.array(topology_list), theta_list
def main():
    # Driver for q2.5: parse CLI args, load samples, run sieved EM, save and
    # plot results, then compare against the true mixture (RF distance and
    # log-likelihood) for the known assignment datasets or custom data.
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        'sample_filename',
        type=str,
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        'output_filename',
        type=str,
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('num_clusters',
                        type=int,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    # You can add more default parameters if you want.
    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )
    print("\n0. Load the parameters from command line.\n")
    args = parser.parse_args()
    print("\tArguments are: ", args)
    print("\n1. Load samples from txt file.\n")
    samples = np.loadtxt(args.sample_filename, delimiter="\t", dtype=np.int32)
    sample_filename = args.sample_filename
    # customData is True when the samples are not one of the three known
    # assignment datasets below; in that case the true-mixture files are
    # unavailable and the custom-data comparison branch is used instead.
    customData = False
    if sample_filename == "data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt":
        # Known dataset: paths to the true mixture's pi, topologies and CPDs.
        pi_file = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_3_theta.npy'
    elif sample_filename == "data/q_2_5_tm_10node_50sample_4clusters.pkl_samples.txt":
        pi_file = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_3_theta.npy'
    elif sample_filename == "data/q_2_5_tm_10node_20sample_4clusters.pkl_samples.txt":
        pi_file = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_3_theta.npy'
    else:
        print(
            'Testing with Custom File. Please provide true mixture for likelihood and RF Comparisiom'
        )
        customData = True
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)
    print("\n2. Run EM Algorithm.\n")
    # Sieving: try 10 random seeds with short (10-iteration) EM runs, keep
    # the seed with the best final log-likelihood, then do a full run.
    # NOTE(review): np.random is not explicitly seeded before this call, so
    # the sieving seeds vary between invocations — confirm this is intended.
    sieving = np.random.randint(20, 999, size=10)
    good_seed = sieving[0]
    bestFit = -np.Infinity
    for sieve in sieving:
        loglikelihood, topology_array, theta_array = em_algorithm(
            sieve, samples, args.num_clusters, 10)
        # print(loglikelihood)
        thisFit = loglikelihood[-1]
        if (thisFit > bestFit):
            bestFit = thisFit
            good_seed = sieve
    # Full-length EM run from the winning seed.
    loglikelihood, topology_array, theta_array = em_algorithm(
        good_seed, samples, num_clusters=args.num_clusters)
    print("\n3. Save, print and plot the results.\n")
    num_clusters = args.num_clusters
    save_results(loglikelihood, topology_array, theta_array,
                 args.output_filename)
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])
    # Likelihood (left) and log-likelihood (right) learning curves.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.title(sample_filename)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()
    print("\n4. Retrieve real results and compare.\n")
    # NOTE(review): `or True` makes this condition always true, so the
    # comparison runs even without --real_values_filename — confirm intended.
    if args.real_values_filename != "" or True:
        if customData == False:
            # Known dataset: load the four true trees from the .npy files.
            print("\tComparing the results with real values...")
            print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
            # TODO: Do RF Comparison
            t0 = Tree()
            t1 = Tree()
            t2 = Tree()
            t3 = Tree()
            true_pi = np.load(pi_file)
            t0.load_tree_from_arrays(tree0, theta0)
            t1.load_tree_from_arrays(tree1, theta1)
            t2.load_tree_from_arrays(tree2, theta2)
            t3.load_tree_from_arrays(tree3, theta3)
            # Shared taxon namespace so dendropy can compare the trees.
            tns = dendropy.TaxonNamespace()
            t0_rf = dendropy.Tree.get(data=t0.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t1_rf = dendropy.Tree.get(data=t1.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t2_rf = dendropy.Tree.get(data=t2.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t3_rf = dendropy.Tree.get(data=t3.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            # Rebuild the inferred trees from the EM output arrays.
            t0_infer = Tree()
            t1_infer = Tree()
            t2_infer = Tree()
            t3_infer = Tree()
            t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                  theta_array[0])
            t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                  theta_array[1])
            t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                  theta_array[2])
            t3_infer.load_tree_from_direct_arrays(topology_array[3],
                                                  theta_array[3])
            t0_infer_rf = dendropy.Tree.get(data=t0_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t1_infer_rf = dendropy.Tree.get(data=t1_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t2_infer_rf = dendropy.Tree.get(data=t2_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t3_infer_rf = dendropy.Tree.get(data=t3_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            print('File:', sample_filename)
            print('------Robinson-Foulds Distance------')
            # RF distance of each inferred tree against all four true trees.
            rfTree0 = [
                RfDist(t0_infer_rf, t0_rf),
                RfDist(t0_infer_rf, t1_rf),
                RfDist(t0_infer_rf, t2_rf),
                RfDist(t0_infer_rf, t3_rf)
            ]
            rfTree1 = [
                RfDist(t1_infer_rf, t0_rf),
                RfDist(t1_infer_rf, t1_rf),
                RfDist(t1_infer_rf, t2_rf),
                RfDist(t1_infer_rf, t3_rf)
            ]
            rfTree2 = [
                RfDist(t2_infer_rf, t0_rf),
                RfDist(t2_infer_rf, t1_rf),
                RfDist(t2_infer_rf, t2_rf),
                RfDist(t2_infer_rf, t3_rf)
            ]
            rfTree3 = [
                RfDist(t3_infer_rf, t0_rf),
                RfDist(t3_infer_rf, t1_rf),
                RfDist(t3_infer_rf, t2_rf),
                RfDist(t3_infer_rf, t3_rf)
            ]
            print('------Real Trees------')
            print(t0.get_tree_newick())
            print(t1.get_tree_newick())
            print(t2.get_tree_newick())
            print(t3.get_tree_newick())
            print('------Inferred Trees------')
            print(t0_infer.get_tree_newick())
            print(t1_infer.get_tree_newick())
            print(t2_infer.get_tree_newick())
            print(t3_infer.get_tree_newick())
            print()
            print('RF Distance of Inferred Tree 0 with each Tree(true):',
                  rfTree0)
            print('RF Distance of Inferred Tree 1 with each Tree(true):',
                  rfTree1)
            print('RF Distance of Inferred Tree 2 with each Tree(true):',
                  rfTree2)
            print('RF Distance of Inferred Tree 3 with each Tree(true):',
                  rfTree3)
            print('------Robinson-Foulds Distance------')
            print("\t4.2. Make the likelihood comparison.\n")
            # TODO: Do Likelihood Comparison
            print('Log Likelihood of real mixture: ' + str(
                truelikelihood([t0, t1, t2, t3], samples, num_samples,
                               num_clusters, true_pi)))
            print('Log Likelihood of inferred mixture: ' +
                  str(loglikelihood[-1]))
        else:
            # Custom data: the two known custom datasets are re-simulated
            # here from fixed seeds so a true mixture exists to compare to.
            print('Testing with Custom Data')
            tns = dendropy.TaxonNamespace()
            if sample_filename == "newData/Data1/Dataset1.pkl_samples.txt":
                # Dataset 1: 5 clusters, 30 nodes, 20 samples, seed 12.
                num_clusters = 5
                num_nodes = 30
                tm = TreeMixture(num_clusters, num_nodes)
                seed_val = 12
                tm.simulate_pi(seed_val=seed_val)
                tm.simulate_trees(seed_val)
                seed_val = 12
                num_samples = 20
                tm.sample_mixtures(num_samples, seed_val=seed_val)
                t0_rf = dendropy.Tree.get(
                    data=tm.clusters[0].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_rf = dendropy.Tree.get(
                    data=tm.clusters[1].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_rf = dendropy.Tree.get(
                    data=tm.clusters[2].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t3_rf = dendropy.Tree.get(
                    data=tm.clusters[3].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t4_rf = dendropy.Tree.get(
                    data=tm.clusters[4].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t0_infer = Tree()
                t1_infer = Tree()
                t2_infer = Tree()
                t3_infer = Tree()
                t4_infer = Tree()
                t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                      theta_array[0])
                t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                      theta_array[1])
                t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                      theta_array[2])
                t3_infer.load_tree_from_direct_arrays(topology_array[3],
                                                      theta_array[3])
                t4_infer.load_tree_from_direct_arrays(topology_array[4],
                                                      theta_array[4])
                t0_infer_rf = dendropy.Tree.get(
                    data=t0_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_infer_rf = dendropy.Tree.get(
                    data=t1_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_infer_rf = dendropy.Tree.get(
                    data=t2_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t3_infer_rf = dendropy.Tree.get(
                    data=t3_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t4_infer_rf = dendropy.Tree.get(
                    data=t4_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                print('File:', sample_filename)
                print('------Robinson-Foulds Distance------')
                rfTree0 = [
                    RfDist(t0_infer_rf, t0_rf),
                    RfDist(t0_infer_rf, t1_rf),
                    RfDist(t0_infer_rf, t2_rf),
                    RfDist(t0_infer_rf, t3_rf)
                ]
                rfTree1 = [
                    RfDist(t1_infer_rf, t0_rf),
                    RfDist(t1_infer_rf, t1_rf),
                    RfDist(t1_infer_rf, t2_rf),
                    RfDist(t1_infer_rf, t3_rf)
                ]
                rfTree2 = [
                    RfDist(t2_infer_rf, t0_rf),
                    RfDist(t2_infer_rf, t1_rf),
                    RfDist(t2_infer_rf, t2_rf),
                    RfDist(t2_infer_rf, t3_rf)
                ]
                rfTree3 = [
                    RfDist(t3_infer_rf, t0_rf),
                    RfDist(t3_infer_rf, t1_rf),
                    RfDist(t3_infer_rf, t2_rf),
                    RfDist(t3_infer_rf, t3_rf)
                ]
                # NOTE(review): rfTree4 compares against t0,t1,t2,t4 — t3_rf
                # is skipped and t4_rf used as the 4th entry. Looks like a
                # copy-paste slip; confirm whether t3_rf was intended.
                rfTree4 = [
                    RfDist(t4_infer_rf, t0_rf),
                    RfDist(t4_infer_rf, t1_rf),
                    RfDist(t4_infer_rf, t2_rf),
                    RfDist(t4_infer_rf, t4_rf)
                ]
                print('------Real Trees------')
                print(tm.clusters[0].get_tree_newick())
                print(tm.clusters[1].get_tree_newick())
                print(tm.clusters[2].get_tree_newick())
                print(tm.clusters[3].get_tree_newick())
                print(tm.clusters[4].get_tree_newick())
                print('------Inferred Trees------')
                print(t0_infer.get_tree_newick())
                print(t1_infer.get_tree_newick())
                print(t2_infer.get_tree_newick())
                print(t3_infer.get_tree_newick())
                print(t4_infer.get_tree_newick())
                print()
                print('RF Distance of Inferred Tree 0 with each Tree(true):',
                      rfTree0)
                print('RF Distance of Inferred Tree 1 with each Tree(true):',
                      rfTree1)
                print('RF Distance of Inferred Tree 2 with each Tree(true):',
                      rfTree2)
                print('RF Distance of Inferred Tree 3 with each Tree(true):',
                      rfTree3)
                print('RF Distance of Inferred Tree 4 with each Tree(true):',
                      rfTree4)
                print('------Robinson-Foulds Distance------')
                print("\t4.2. Make the likelihood comparison.\n")
                print('Log Likelihood of real mixture: ' + str(
                    truelikelihood([
                        tm.clusters[0], tm.clusters[1], tm.clusters[2],
                        tm.clusters[3], tm.clusters[4]
                    ], samples, num_samples, num_clusters, tm.pi)))
                print('Log Likelihood of inferred mixture: ' +
                      str(loglikelihood[-1]))
            elif sample_filename == "newData/Data2/Dataset2.pkl_samples.txt":
                # Dataset 2: 3 clusters, 50 nodes, 100 samples.
                # NOTE(review): pi/trees use seed 123 but samples use seed 12
                # — confirm the mismatch reproduces the saved dataset.
                num_clusters = 3
                num_nodes = 50
                tm = TreeMixture(num_clusters, num_nodes)
                seed_val = 123
                tm.simulate_pi(seed_val=seed_val)
                tm.simulate_trees(seed_val)
                seed_val = 12
                num_samples = 100
                tm.sample_mixtures(num_samples, seed_val=seed_val)
                t0_rf = dendropy.Tree.get(
                    data=tm.clusters[0].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_rf = dendropy.Tree.get(
                    data=tm.clusters[1].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_rf = dendropy.Tree.get(
                    data=tm.clusters[2].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t0_infer = Tree()
                t1_infer = Tree()
                t2_infer = Tree()
                t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                      theta_array[0])
                t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                      theta_array[1])
                t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                      theta_array[2])
                t0_infer_rf = dendropy.Tree.get(
                    data=t0_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_infer_rf = dendropy.Tree.get(
                    data=t1_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_infer_rf = dendropy.Tree.get(
                    data=t2_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                print('File:', sample_filename)
                print('------Robinson-Foulds Distance------')
                rfTree0 = [
                    RfDist(t0_infer_rf, t0_rf),
                    RfDist(t0_infer_rf, t1_rf),
                    RfDist(t0_infer_rf, t2_rf)
                ]
                rfTree1 = [
                    RfDist(t1_infer_rf, t0_rf),
                    RfDist(t1_infer_rf, t1_rf),
                    RfDist(t1_infer_rf, t2_rf)
                ]
                rfTree2 = [
                    RfDist(t2_infer_rf, t0_rf),
                    RfDist(t2_infer_rf, t1_rf),
                    RfDist(t2_infer_rf, t2_rf)
                ]
                print('------Real Trees------')
                print(tm.clusters[0].get_tree_newick())
                print(tm.clusters[1].get_tree_newick())
                print(tm.clusters[2].get_tree_newick())
                print('------Inferred Trees------')
                print(t0_infer.get_tree_newick())
                print(t1_infer.get_tree_newick())
                print(t2_infer.get_tree_newick())
                print()
                print('RF Distance of Inferred Tree 0 with each Tree(true):',
                      rfTree0)
                print('RF Distance of Inferred Tree 1 with each Tree(true):',
                      rfTree1)
                print('RF Distance of Inferred Tree 2 with each Tree(true):',
                      rfTree2)
                print('------Robinson-Foulds Distance------')
                print("\t4.2. Make the likelihood comparison.\n")
                print('Log Likelihood of real mixture: ' + str(
                    truelikelihood(
                        [tm.clusters[0], tm.clusters[1], tm.clusters[2]],
                        samples, num_samples, num_clusters, tm.pi)))
                print('Log Likelihood of inferred mixture: ' +
                      str(loglikelihood[-1]))
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """Run EM for a tree mixture, with sieving over random initialisations.

    `num_sieving` candidate seeds are derived from `seed_val`; each candidate
    mixture is run for 10 EM iterations, and the seed whose short run reached
    the best final log-likelihood is used to re-initialise and run the full
    EM procedure.

    :param seed_val: Master seed for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array.
                    Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations for the final run.
    :return: loglikelihood: numpy array of per-iteration log-likelihoods.
    :return: topology_list: numpy array, (num_clusters, num_nodes).
    :return: theta_list: plain Python list of per-tree CPD arrays.
    :return: tm: the fitted TreeMixture.
    """
    print("Running EM algorithm...")
    num_sieving = 10  # number of random initialisations to try
    num_samples = np.size(samples, 0)
    num_nodes = np.size(samples, 1)

    # Derive one candidate seed per sieving round from the master seed.
    np.random.seed(seed_val)
    seeds = np.random.randint(0, 100000000, num_sieving)
    last_loglikelihoods = []
    tms = []
    for seed in seeds:
        np.random.seed(seed)
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
        tm.simulate_pi(seed_val=seed)
        tm.simulate_trees(seed_val=seed)
        # Short EM run (10 iterations) just to rank this initialisation.
        tm_loglikelihood, tm = em_helper(tm, samples, num_clusters,
                                         max_num_iter=10)
        last_loglikelihoods.append(tm_loglikelihood[-1])
        tms.append(tm)
    print("=> Sieving finished")

    # Re-initialise from the winning seed and run the full EM procedure.
    seed = seeds[last_loglikelihoods.index(max(last_loglikelihoods))]
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    loglikelihood, tm = em_helper(tm, samples, num_clusters,
                                  max_num_iter=max_num_iter)
    print("=> EM finished")

    topology_list = []
    theta_list = []
    for t in tm.clusters:
        topology_list.append(t.get_topology_array())
        theta_list.append(t.get_theta_array())

    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    # theta_list is intentionally left as a plain list: the per-tree theta
    # arrays need not stack into a single ndarray.
    return loglikelihood, topology_list, theta_list, tm
def main():
    """Question 2.5 driver: simulate (or load) a ground-truth tree mixture,
    run EM on its samples, plot the likelihood trace, and compare inferred
    trees/likelihood against the truth."""
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        '--sample_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt',
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl',
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    parser.add_argument(
        '--output_filename',
        type=str,
        default='q_2_5_tm_20node_20sample_4clusters_result.txt',
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('--num_nodes',
                        type=int,
                        default=10,
                        help='Specify the number of nodes of trees (i.e 10)')
    parser.add_argument('--num_clusters',
                        type=int,
                        default=10,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=123,
        help='Specify the seed value for reproducibility (i.e 42)')
    # Bug fix: `type=bool` made any non-empty value (including "False")
    # parse as True. Use an explicit string-to-bool converter instead;
    # the default is unchanged.
    parser.add_argument(
        '--if_simulate',
        type=lambda s: str(s).lower() not in ('false', '0', 'no', ''),
        default=True,
        help='Specify whether the sampling is enabled (i.e False)')
    parser.add_argument(
        '--num_samples',
        type=int,
        default=50,
        help='Specify the number of samples if sampling is enabled (i.e 1000)')
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )
    print("\n0. Load the parameters from command line.\n")
    args = parser.parse_args()
    print("\tArguments are: ", args)

    if args.if_simulate:
        print("\n1. Make new tree and sample.\n")
        tm_truth = TreeMixture(num_clusters=args.num_clusters,
                               num_nodes=args.num_nodes)
        tm_truth.simulate_pi(seed_val=args.seed_val)
        tm_truth.simulate_trees(seed_val=args.seed_val)
        tm_truth.sample_mixtures(args.num_samples, seed_val=args.seed_val)
    else:
        print("\n1. Load true tree from file.\n")
        tm_truth = TreeMixture(0, 0)
        tm_truth.load_mixture(args.real_values_filename)

    print("Load samples.")
    # Assumes both branches above populate tm_truth.samples — TODO confirm.
    samples = tm_truth.samples
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array, tm = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "":
        print(
            "\n=> Compare trees and print Robinson-Foulds (RF) distance (result v.s truth):\n"
        )
        N = len(samples)
        K = tm_truth.num_clusters
        tns = dendropy.TaxonNamespace()
        # Print a LaTeX table row per inferred tree: RF distance against
        # every true tree.
        print("\\hline")
        for k in range(K):
            print(k, end=" & ")
            for j in range(K):
                t_0 = tm.clusters[k]
                # get_tree_newick() presumably caches into .newick — confirm.
                t_0.get_tree_newick()
                t_0 = dendropy.Tree.get(data=t_0.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                t_t = tm_truth.clusters[j]
                t_t.get_tree_newick()
                t_t = dendropy.Tree.get(data=t_t.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                print(dendropy.calculate.treecompare.symmetric_difference(
                    t_0, t_t),
                      end=" & ")
            print("\\\\")

        print("\n=> Compare log-likelihood (result v.s truth):\n")
        # Despite the names, posterior[n, k] accumulates the joint
        # p(x_n | T_k) along tree k, and prior[n] becomes the per-sample
        # mixture likelihood (prior starts at 1, so *= acts as assignment).
        posterior = np.ones((N, K))
        prior = np.ones(N)
        for n, x in enumerate(samples):
            for k, tree in enumerate(tm_truth.clusters):
                visit_list = [tree.root]
                while len(visit_list) != 0:
                    cur_node = visit_list[0]
                    visit_list = visit_list[1:]
                    visit_list = visit_list + cur_node.descendants
                    if cur_node.ancestor is None:
                        posterior[n, k] *= cur_node.cat[x[int(cur_node.name)]]
                    else:
                        posterior[n, k] *= cur_node.cat[x[int(
                            cur_node.ancestor.name)]][x[int(cur_node.name)]]
            prior[n] *= np.sum(posterior[n] * tm_truth.pi)
        loglikelihood_truth = np.sum(np.log(prior))
        print("%f : %f" % (loglikelihood[-1], loglikelihood_truth))
def main():
    """Question 2.5 driver (positional-argument variant): load samples from a
    TSV file, run EM, save/plot the results, and optionally compare the
    inferred mixture against a ground-truth mixture file."""
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        'sample_filename',
        type=str,
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        'output_filename',
        type=str,
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('num_clusters',
                        type=int,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )
    print("\n0. Load the parameters from command line.\n")
    args = parser.parse_args()
    print("\tArguments are: ", args)

    print("\n1. Load samples from txt file.\n")
    # Tab-separated integer matrix: one row per sample, one column per node.
    samples = np.loadtxt(args.sample_filename, delimiter="\t", dtype=np.int32)
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    save_results(loglikelihood, topology_array, theta_array,
                 args.output_filename)
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])
    # Left panel: likelihood (exp of the log trace); right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "":
        print("\tComparing the results with real values...")
        actual_tm = TreeMixture(args.num_clusters, num_nodes)
        actual_tm.load_mixture(args.real_values_filename)
        # NOTE(review): assumes save_results wrote a file that load_mixture
        # can read back — confirm the formats match.
        inferred_tm = TreeMixture(args.num_clusters, num_nodes)
        inferred_tm.load_mixture(args.output_filename)
        print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
        diff = compute_tree_mix_diff(actual_tm, inferred_tm)
        print("Total Robinson-Foulds distance: " + str(diff))
        print("\t4.2. Make the likelihood comparison.\n")
        actual_lik = actual_tm.likelihood_dataset(samples)
        inferred_lik = inferred_tm.likelihood_dataset(samples)
        print("Log-Likelihood of actual tree: " + str(actual_lik) +
              ", inferred tree: " + str(inferred_lik))
def em_algorithm(seed_val, samples, num_clusters, max_num_iter, tm=None):
    """Fit a mixture of trees to binary samples with EM.

    :param seed_val: seed for numpy's RNG; used (when not None) to make the
        random initial mixture reproducible. Only relevant if `tm` is None.
    :param samples: observed data, shape (num_samples, num_nodes)
    :param num_clusters: number of mixture components (trees)
    :param max_num_iter: number of EM iterations to run
    :param tm: optional pre-initialised TreeMixture to continue from
    :return: (loglikelihood, topology_list, theta_list, tm) — the first three
        as numpy arrays, plus the fitted TreeMixture
    """
    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]
    loglikelihood = []
    if tm is None:
        # Bug fix: seed_val was accepted but never used, so fresh runs were
        # not reproducible. Seed the RNG before simulating the mixture.
        if seed_val is not None:
            np.random.seed(seed_val)
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
        tm.simulate_pi(None)
        tm.simulate_trees(None)

    for iter_ in range(max_num_iter):
        # --- E-step: responsibilities r(n, k) proportional to
        #     pi_k * p(x_n | T_k) ---
        sample_likelihoods = np.array(
            [[sample_likelihood(tm.clusters[ii], samples[jj, :], tm.pi[ii])
              for ii in range(num_clusters)] for jj in range(num_samples)])
        sum_over_trees_likelihoods = np.reshape(
            np.sum(sample_likelihoods, axis=1), (num_samples, 1))
        Responsibilities = np.divide(sample_likelihoods,
                                     sum_over_trees_likelihoods)

        # Log-likelihood of the data under the current mixture.
        ll = np.sum(np.log(np.sum(sample_likelihoods, axis=1)), axis=None)
        loglikelihood.append(ll)
        tm.loglikelihood.append(ll)

        # --- M-step: update mixing weights ---
        tm.pi = np.sum(Responsibilities, axis=0) / num_samples

        vertices = list(range(num_nodes))
        # --- M-step: re-learn each tree (Chow-Liu style) ---
        for i in range(num_clusters):
            tree = tm.clusters[i]
            responsibilities = Responsibilities[:, i]
            # Responsibility-weighted pairwise mutual information matrix.
            mutual_information_matrix = np.asarray(
                [[mutual_information(responsibilities, samples, s_idx, t_idx)
                  for s_idx in vertices] for t_idx in vertices])
            graph = create_graph(num_nodes, responsibilities, samples,
                                 mutual_information_matrix, vertices)
            # Maximum spanning tree over MI weights = optimal tree skeleton.
            MST = maximum_spanning_tree(graph)
            root_name = 0  # node 0 is always taken as the root
            ordered_nodes, _ = create_ordered_nodes(MST, root_name)
            topology_array, theta_array = create_tree_attributes1(
                ordered_nodes, root_name, samples, responsibilities, num_nodes)
            tree.load_tree_from_direct_arrays(topology_array, theta_array)

    topology_list = []
    theta_list = []
    for i in range(num_clusters):
        topology_list.append(tm.clusters[i].get_topology_array())
        theta_list.append(tm.clusters[i].get_theta_array())

    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    return loglikelihood, topology_list, theta_list, tm
def main():
    """Exercise 2.4 driver: run sieving-based EM on the provided sample file
    and compare the inferred mixture with the ground-truth mixture."""
    num_clusters = 3
    # Freshly simulated small mixture (3 nodes). NOTE(review): its topologies
    # are printed below as "Real tree topology", but it is unrelated to the
    # samples loaded from disk — looks like leftover debug code; confirm.
    new_tm = TreeMixture(num_clusters=num_clusters, num_nodes=3)
    new_tm.simulate_pi(None)
    new_tm.simulate_trees(None)
    new_tm.sample_mixtures(100)
    new_samples = new_tm.samples  # unused below
    seed_val = None
    # NOTE(review): hard-coded absolute path; not portable across machines.
    directory = '/Users/filipbergentoft/Desktop/Github/DD2434/Assignment 2/2_4/'
    sample_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl_samples.txt"
    output_filename = directory + "data/q2_4/q2_4_own_results"
    real_values_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl"
    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    np.random.shuffle(samples)  # in-place row shuffle before fitting
    # Sieving: many short EM runs, then longer runs on the best candidates.
    best_tm = sieving(n_first_mixtures=50,
                      n_second_mixtures=10,
                      n_first_iterations=10,
                      n_second_iterations=100,
                      samples=samples,
                      num_clusters=num_clusters)
    # Ground-truth mixture; constructor sizes are presumably overwritten by
    # load_mixture — TODO confirm.
    real_tm = TreeMixture(num_clusters=3, num_nodes=5)
    real_tm.load_mixture(real_values_filename)
    print('best tree', mixture_likelihood(best_tm, samples))
    print('best tree', best_tm.pi)
    print('real tree', mixture_likelihood(real_tm, samples))
    print('real tree', real_tm.pi)
    print(RF_comparison(best_tm, real_tm))
    for tree in new_tm.clusters:
        print('Real tree topology')
        print(tree.get_topology_array())
    for tree in best_tm.clusters:
        print('Inferred tree topology')
        print(tree.get_topology_array())
    sns.set_style('darkgrid')
    """
    plt.subplot(121)
    plt.plot(np.exp(best_tm.loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    """
    plt.plot(best_tm.loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend()
    plt.show()
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """ This function is for the EM algorithm.
    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration. Type: numpy array. Dimensions: (num_iterations, )
    :return: topology_list: A list of tree topologies. Type: numpy array. Dimensions: (num_clusters, num_nodes)
    :return: theta_list: A list of tree CPDs. Type: numpy array. Dimensions: (num_clusters, num_nodes, 2)
    """
    # Seed numpy so the random initial mixture is reproducible.
    np.random.seed(seed_val)
    print("Running EM algorithm...")
    from Kruskal_v1 import Graph  # Kruskal-based maximum-spanning-tree helper
    import sys
    # Random initial mixture over the observed variables.
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    # NOTE(review): sampling from the freshly simulated mixture looks
    # unnecessary here — the observed `samples` are passed in; confirm.
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seed_val)
    eps = sys.float_info.min  # smallest positive float; guards div-by-0/log(0)
    topology_list = []
    theta_list = []
    loglikelihood = []
    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]
    for iter in range(max_num_iter):  # NOTE: shadows builtin `iter`
        # --- E-step: r[i, j] = pi_j * p(x_i | T_j), computed by a BFS
        #     over each tree multiplying the node CPDs along the way. ---
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, t in enumerate(tm.clusters):
                visitedNodes = [t.root]
                r[i, j] *= tm.pi[j]
                while len(visitedNodes) != 0:
                    presentNode = visitedNodes[0]
                    visitedNodes = visitedNodes[1:]
                    if len(presentNode.descendants) != 0:
                        visitedNodes = visitedNodes + presentNode.descendants
                    if presentNode.ancestor == None:
                        # Root node: unconditional categorical.
                        r[i, j] *= presentNode.cat[sample[int(
                            presentNode.name)]]
                    else:
                        # Internal/leaf node: conditioned on the parent value.
                        r[i, j] *= presentNode.cat[sample[int(
                            presentNode.ancestor.name)]][sample[int(
                                presentNode.name)]]
        # Stabilise before normalising so no responsibility is exactly zero.
        r += eps
        rn = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= rn
        # Per-sample mixture likelihoods are in rn; sum of logs = data LL.
        loglikelihood.append(np.sum(np.log(rn)))
        # --- M-step: mixing weights ---
        tm.pi = np.sum(r, axis=0) / num_samples
        den = np.sum(r, axis=0)  # total responsibility mass per cluster
        # q_k(X_s = a, X_t = b): responsibility-weighted pairwise frequencies,
        # indexed [s, t, a, b, k].
        NominatorQk = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        matched_index = np.where(
                            (samples[:, (s, t)] == [a, b]).all(1))[0]
                        NominatorQk[s, t, a,
                                    b] = np.sum(r[matched_index],
                                                axis=0) / den
        # q_k(X_s = a): responsibility-weighted singleton frequencies,
        # indexed [s, a, k].
        DenominatorQk = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                matched_index = np.where((samples[:, s] == a))
                DenominatorQk[s, a] = np.sum(r[matched_index], axis=0) / den
        # Mutual information I_q(X_s; X_t) per cluster, indexed [s, t, k].
        Iqst = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        # NOTE(review): the np.all guard skips the (a, b)
                        # term for ALL clusters if any single cluster has a
                        # zero count — confirm this is intended.
                        if (np.all(NominatorQk[s, t, a, b, :] > 0)):
                            Iqst[s, t] += NominatorQk[s, t, a, b] * np.log(
                                (NominatorQk[s, t, a, b] /
                                 (DenominatorQk[s, a])) /
                                DenominatorQk[t, b])
                        else:
                            Iqst[s, t] += 0
        # --- M-step: rebuild each tree from its MI-weighted MST ---
        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, Iqst[s, t, k])
            # Keep only the (u, v) endpoint columns of the MST edge list.
            mst_edges = np.array(g.maximum_spanning_tree())[:, [0, 1]]
            # Derive the parent array by BFS from node 0, consuming MST
            # edges as they are assigned so each edge is used once.
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan  # root has no parent
            visitedNodes = [0]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]
                child_edges = np.array(np.where(
                    mst_edges == [presentNode])).T
                for ind in child_edges:
                    child = mst_edges[ind[0]][1 - ind[1]]
                    topology_array[int(child)] = presentNode
                    visitedNodes.append(child)
                if np.size(child_edges) != 0:
                    mst_edges = np.delete(mst_edges, child_edges[:, 0], 0)
            # Build the tree from topology only; CPDs are filled in below.
            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2
            visitedNodes = [new_tree.root]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]
                if len(presentNode.descendants) != 0:
                    visitedNodes = visitedNodes + presentNode.descendants
                if presentNode.ancestor == None:
                    # Root categorical = singleton marginal q_k(X_root).
                    presentNode.cat = DenominatorQk[int(presentNode.name), :,
                                                    k].tolist()
                else:
                    # Conditional CPD: pairwise table normalised per parent
                    # value so each row sums to one.
                    presentNode.cat = NominatorQk[
                        int(presentNode.ancestor.name),
                        int(presentNode.name), :, :, k]
                    presentNode.cat[0] = presentNode.cat[0] / np.sum(
                        presentNode.cat[0])
                    presentNode.cat[1] = presentNode.cat[1] / np.sum(
                        presentNode.cat[1])
                    presentNode.cat = [presentNode.cat[0], presentNode.cat[1]]
            tm.clusters[k] = new_tree
    # Collect final topologies/CPDs from the fitted mixture.
    for j, t in enumerate(tm.clusters):
        topology_list.append(t.get_topology_array())
        theta_list.append(t.get_theta_array())
    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    return loglikelihood, topology_list, theta_list
from Tree import TreeMixture
import Exercise2_5
import argparse

# Helper script: simulate a fresh TreeMixture and save it (with its samples)
# to disk for testing exercise 2.5.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (
        ("seed", "Introduce the seed to generate trees"),
        ("samples", "Introduce the number of samples"),
        ("nodes", "Introduce the number of nodes"),
        ("clusters", "Introduce the number of clusters"),
):
    parser.add_argument(arg_name, help=arg_help, type=int)
args = parser.parse_args()

print("Generating tree with seed:", args.seed, "\tsamples:", args.samples,
      "\tnodes:", args.nodes, "\tclusters:", args.clusters)

mixture = TreeMixture(num_clusters=args.clusters, num_nodes=args.nodes)
mixture.simulate_pi(seed_val=args.seed)
mixture.simulate_trees(seed_val=args.seed)
mixture.sample_mixtures(num_samples=args.samples, seed_val=args.seed)

out_path = 'data/q_2_5_tm_' + str(args.nodes) + 'node_' + str(
    args.samples) + 'sample_' + str(args.clusters) + 'clusters.pkl'
mixture.save_mixture(out_path, True)
def em_algorithm(seed_val, samples, num_clusters, max_num_iter):
    """Sieving wrapper around computationsEM.

    Evaluates `sieving` consecutive seeds (seed_val .. seed_val+sieving-1)
    with short 10-iteration EM runs, then reruns the full EM from the seed
    that reached the best final log-likelihood.

    :param seed_val: base seed; candidates are seed_val + 0..sieving-1
    :param samples: observed data, shape (num_samples, num_nodes)
    :param num_clusters: number of mixture components
    :param max_num_iter: iteration count for the final EM run
    :return: (loglikelihood, topology_list, theta_list); the lists hold one
        array per cluster.
        NOTE(review): this assumes computationsEM updates topology_list and
        theta_list in place — confirm; otherwise the initial random trees
        would be returned.
    """
    sieving = 100  # number of candidate seeds to evaluate
    max_log = float("-inf")
    best_seed = 0

    # Short EM runs to rank the candidate seeds.
    for siev in tqdm(range(sieving)):
        aux_seed = seed_val + siev
        tm, topology_list, theta_list = _init_mixture(
            aux_seed, samples, num_clusters)
        loglikelihood = computationsEM(10, samples, num_clusters, tm,
                                       topology_list, theta_list)
        if loglikelihood[-1] > max_log:
            max_log = loglikelihood[-1]
            best_seed = aux_seed
    # -------------------- End of sieving -------------------- #

    # Full EM run from the winning seed.
    tm, topology_list, theta_list = _init_mixture(best_seed, samples,
                                                  num_clusters)
    loglikelihood = computationsEM(max_num_iter, samples, num_clusters, tm,
                                   topology_list, theta_list)
    return loglikelihood, topology_list, theta_list


def _init_mixture(seed, samples, num_clusters):
    """Seed the RNG, simulate a random TreeMixture, and return it together
    with its topology and theta lists (one entry per cluster)."""
    np.random.seed(seed)
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    topology_list = [
        tm.clusters[i].get_topology_array() for i in range(num_clusters)
    ]
    theta_list = [
        tm.clusters[i].get_theta_array() for i in range(num_clusters)
    ]
    return tm, topology_list, theta_list
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """EM with sieving: rank several random initialisations with short EM
    runs, then run full-length EM from the most promising seed.

    :param seed_val: master seed used to draw the sieving seeds
    :param samples: observed data, shape (num_samples, num_nodes)
    :param num_clusters: number of mixture components
    :param max_num_iter: iteration cap for the final EM run
    :return: (loglikelihood, topology_list, theta_list, tm) — numpy arrays
        plus the fitted TreeMixture
    """
    print("Running EM algorithm...")
    THRES = 1e-4  # convergence threshold (kept for parity with siblings)
    num_sieving = 10
    num_samples = np.size(samples, 0)
    num_nodes = np.size(samples, 1)

    # Derive the candidate seeds from the master seed.
    np.random.seed(seed_val)
    seeds = np.random.randint(0, 100000000, num_sieving)

    last_loglikelihoods = []
    tms = []
    for s in seeds:
        np.random.seed(s)
        candidate = TreeMixture(num_clusters=num_clusters,
                                num_nodes=num_nodes)
        candidate.simulate_pi(seed_val=s)
        candidate.simulate_trees(seed_val=s)
        # Short run: 10 iterations is enough to rank this initialisation.
        ll_trace, candidate = em_helper(candidate, samples, num_clusters,
                                        max_num_iter=10)
        last_loglikelihoods.append(ll_trace[-1])
        tms.append(candidate)
    print("=> Sieving finished")

    # Rebuild from the winning seed and run the full EM procedure.
    winner = last_loglikelihoods.index(max(last_loglikelihoods))
    seed = seeds[winner]
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    loglikelihood, tm = em_helper(tm, samples, num_clusters,
                                  max_num_iter=max_num_iter)
    print("=> EM finished")

    topology_list = np.array([t.get_topology_array() for t in tm.clusters])
    theta_list = np.array([t.get_theta_array() for t in tm.clusters])
    return np.array(loglikelihood), topology_list, theta_list, tm