def em_helper(tm, samples, num_clusters, max_num_iter=10):
    """Run EM iterations on a mixture of trees, updating ``tm`` in place.

    :param tm: Tree mixture to optimize. Must expose ``pi`` (mixing
        coefficients) and ``clusters`` (list of trees whose nodes carry
        ``cat``, ``ancestor``, ``descendants``, ``name``).
    :param samples: Observed binary data. Type: numpy array.
        Dimensions: (num_samples, num_nodes). Values assumed in {0, 1}.
    :param num_clusters: Number of mixture components. Type: int
    :param max_num_iter: Number of EM iterations to run. Type: int
    :return: (loglikelihood, tm) where loglikelihood is a list with one
        log-likelihood value per iteration and tm is the updated mixture.

    NOTE(review): relies on module-level ``epsilon``, ``Graph`` and ``Tree``
    being in scope — confirm they are defined/imported at file level.
    """
    num_samples = np.size(samples, 0)
    num_nodes = np.size(samples, 1)
    loglikelihood = []
    # was `iter`, which shadows the builtin
    for iteration in range(max_num_iter):
        print("==================== " + str(iteration) +
              "-th iteration ====================")

        # Step 1: Compute the responsibilities r[n, k] = p(k) * p(x_n | T_k),
        # then normalize over k.
        r = np.ones((num_samples, num_clusters))
        for n, x in enumerate(samples):
            for k, t in enumerate(tm.clusters):
                r[n, k] *= tm.pi[k]
                # Breadth-first walk over the tree, multiplying in each
                # node's (conditional) categorical probability.
                visit_list = [t.root]
                # `!= 0` — the original used `is not 0`, an identity check
                # against an int literal (SyntaxWarning, unreliable).
                while len(visit_list) != 0:
                    cur_node = visit_list[0]
                    visit_list = visit_list[1:]
                    visit_list = visit_list + cur_node.descendants
                    if cur_node.ancestor is None:
                        r[n, k] *= cur_node.cat[x[int(cur_node.name)]]
                    else:
                        r[n, k] *= cur_node.cat[x[int(
                            cur_node.ancestor.name)]][x[int(cur_node.name)]]
        r += epsilon  # guard against log(0) / division by zero
        marginal = np.reshape(np.sum(r, axis=1), (num_samples, 1))
        loglikelihood.append(np.sum(np.log(marginal)))
        marginal_expand = np.repeat(marginal, num_clusters, axis=1)
        r /= marginal_expand

        # Step 2: Update the mixing coefficients.
        tm.pi = np.mean(r, axis=0)

        # Step 3: Responsibility-weighted pairwise statistics and mutual
        # information, per cluster.
        denom = np.sum(r, axis=0)
        # q[s, t, a, b, k]: joint probability q_k(x_s = a, x_t = b)
        q = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        index = np.where(
                            (samples[:, (s, t)] == [a, b]).all(1))[0]
                        numer = np.sum(r[index], axis=0)
                        q[s, t, a, b] = numer / denom
        q += epsilon
        # q_s[s, a, k]: marginal probability q_k(x_s = a)
        q_s = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                index = np.where(samples[:, s] == a)
                numer = np.sum(r[index], axis=0)
                q_s[s, a] = numer / denom
        q_s += epsilon
        # I[s, t, k]: mutual information between nodes s and t under
        # cluster k's weighted empirical distribution.
        I = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        I[s, t] += q[s, t, a, b] * np.log(
                            q[s, t, a, b] / q_s[s, a] / q_s[t, b])

        clusters = []
        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, I[s, t, k])

            # Step 4: Construct maximum spanning trees and orient the
            # undirected MST edges away from node 0 (the root).
            edges = np.array(g.maximum_spanning_tree())[:, 0:2]
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan  # root has no parent
            visit_list = [0]
            while len(visit_list) != 0:
                cur_node = visit_list[0]
                index = np.where(edges == cur_node)
                index = np.transpose(np.stack(index))
                visit_list = visit_list[1:]
                # was `id`, which shadows the builtin
                for edge_idx in index:
                    child = edges[edge_idx[0], 1 - edge_idx[1]]
                    topology_array[int(child)] = cur_node
                    visit_list.append(int(child))
                # `!= 0` — the original used `is not 0` (identity check
                # against an int literal).
                if np.size(index) != 0:
                    edges = np.delete(edges, index[:, 0], axis=0)
            tree = Tree()
            tree.load_tree_from_direct_arrays(topology_array)
            tree.k = 2
            tree.alpha = [1.0] * 2

            # Step 5: Update CPDs from the weighted statistics.
            visit_list = [tree.root]
            while len(visit_list) != 0:
                cur_node = visit_list[0]
                visit_list = visit_list[1:]
                visit_list = visit_list + cur_node.descendants
                if cur_node.ancestor is None:
                    cur_node.cat = q_s[int(cur_node.name), :, k].tolist()
                else:
                    cat = q[int(cur_node.ancestor.name),
                            int(cur_node.name), :, :, k]
                    # Normalize each row so cat[a] is a proper conditional
                    # distribution p(child | parent = a); q holds joints.
                    # This matches the normalization done in em_algorithm.
                    cur_node.cat = [cat[0] / np.sum(cat[0]),
                                    cat[1] / np.sum(cat[1])]
            clusters.append(tree)
        tm.clusters = clusters
    return loglikelihood, tm
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """ This function is for the EM algorithm.
    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration. Type: numpy array.
        Dimensions: (num_iterations, ) Note: num_iterations does not have to be equal to max_num_iter.
    :return: topology_list: A list of tree topologies. Type: numpy array. Dimensions: (num_clusters, num_nodes)
    :return: theta_list: A list of tree CPDs. Type: numpy array. Dimensions: (num_clusters, num_nodes, 2)

    You can change the function signature and add new parameters. Add them as parameters with some default values.
    """
    # Set the seed
    np.random.seed(seed_val)

    print("Running EM algorithm...")
    from Kruskal_v1 import Graph
    import sys

    # Random initialization of the mixture from the seed.
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seed_val)

    eps = sys.float_info.min  # guard against log(0) / division by zero
    topology_list = []
    theta_list = []
    loglikelihood = []
    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]

    # was `iter`, which shadows the builtin
    for iteration in range(max_num_iter):
        # E-step: responsibilities r[i, j] = pi_j * p(x_i | T_j), normalized.
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, t in enumerate(tm.clusters):
                visitedNodes = [t.root]
                r[i, j] *= tm.pi[j]
                while len(visitedNodes) != 0:
                    presentNode = visitedNodes[0]
                    visitedNodes = visitedNodes[1:]
                    if len(presentNode.descendants) != 0:
                        visitedNodes = visitedNodes + presentNode.descendants
                    # `is None` — the original used `== None`
                    if presentNode.ancestor is None:  # root node
                        r[i, j] *= presentNode.cat[sample[int(
                            presentNode.name)]]
                    else:
                        r[i, j] *= presentNode.cat[sample[int(
                            presentNode.ancestor.name)]][sample[int(
                                presentNode.name)]]
        r += eps
        rn = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= rn
        loglikelihood.append(np.sum(np.log(rn)))

        # M-step: mixing coefficients.
        tm.pi = np.sum(r, axis=0) / num_samples

        # Responsibility-weighted joint and marginal statistics per cluster.
        den = np.sum(r, axis=0)
        # NominatorQk[s, t, a, b, k]: joint q_k(x_s = a, x_t = b)
        NominatorQk = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        matched_index = np.where(
                            (samples[:, (s, t)] == [a, b]).all(1))[0]
                        NominatorQk[s, t, a, b] = np.sum(
                            r[matched_index], axis=0) / den
        # DenominatorQk[s, a, k]: marginal q_k(x_s = a)
        DenominatorQk = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                matched_index = np.where((samples[:, s] == a))
                DenominatorQk[s, a] = np.sum(r[matched_index], axis=0) / den

        # Mutual information between each node pair, per cluster; terms with
        # zero joint probability contribute nothing (0 * log 0 := 0).
        Iqst = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        if np.all(NominatorQk[s, t, a, b, :] > 0):
                            Iqst[s, t] += NominatorQk[s, t, a, b] * np.log(
                                (NominatorQk[s, t, a, b] /
                                 (DenominatorQk[s, a])) /
                                DenominatorQk[t, b])
                        # else: zero contribution, nothing to add

        for k in range(num_clusters):
            # Maximum spanning tree over the mutual-information graph.
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, Iqst[s, t, k])
            mst_edges = np.array(g.maximum_spanning_tree())[:, [0, 1]]

            # Orient the undirected MST edges away from node 0 (the root).
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan  # root has no parent
            visitedNodes = [0]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]
                child_edges = np.array(np.where(mst_edges == [presentNode])).T
                for ind in child_edges:
                    child = mst_edges[ind[0]][1 - ind[1]]
                    topology_array[int(child)] = presentNode
                    visitedNodes.append(child)
                if np.size(child_edges) != 0:
                    mst_edges = np.delete(mst_edges, child_edges[:, 0], 0)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2

            # Fill in CPDs: marginals at the root, row-normalized
            # conditionals p(child | parent) everywhere else.
            visitedNodes = [new_tree.root]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]
                if len(presentNode.descendants) != 0:
                    visitedNodes = visitedNodes + presentNode.descendants
                # `is None` — the original used `== None`
                if presentNode.ancestor is None:
                    presentNode.cat = DenominatorQk[int(presentNode.name), :,
                                                    k].tolist()
                else:
                    presentNode.cat = NominatorQk[
                        int(presentNode.ancestor.name),
                        int(presentNode.name), :, :, k]
                    presentNode.cat[0] = presentNode.cat[0] / np.sum(
                        presentNode.cat[0])
                    presentNode.cat[1] = presentNode.cat[1] / np.sum(
                        presentNode.cat[1])
                    presentNode.cat = [presentNode.cat[0], presentNode.cat[1]]
            tm.clusters[k] = new_tree

    # Collect final topologies and CPDs from the fitted mixture.
    for j, t in enumerate(tm.clusters):
        topology_list.append(t.get_topology_array())
        theta_list.append(t.get_theta_array())

    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    return loglikelihood, topology_list, theta_list