Example #1
0
def em_helper(tm, samples, num_clusters, max_num_iter=10):
    """Run EM on a mixture of tree-structured models over binary variables.

    Each iteration computes the responsibilities of every cluster for every
    sample, re-estimates the mixing weights, builds one maximum spanning tree
    per cluster from responsibility-weighted pairwise mutual information and
    finally refreshes the categorical tables stored on the tree nodes.

    The names ``np``, ``epsilon``, ``Graph`` and ``Tree`` are resolved from the
    enclosing module. ``epsilon`` is presumably a small positive smoothing
    constant -- confirm against its definition.

    :param tm: Mixture object exposing ``pi`` (indexable by cluster) and
        ``clusters`` (iterable of trees with a ``root`` node). Nodes expose
        ``name``, ``ancestor``, ``descendants`` and ``cat``. ``tm.pi`` and
        ``tm.clusters`` are overwritten in place.
    :param samples: 2-D array of observations, one row per sample and one
        column per node. Values are used as indices into two-entry tables, so
        they are expected to be the integers 0 and 1.
    :param num_clusters: Number of mixture components.
    :param max_num_iter: Number of EM iterations to run.
    :return: ``(loglikelihood, tm)`` where ``loglikelihood`` is a list with one
        value per iteration and ``tm`` is the (mutated) input mixture.
    """
    from collections import deque

    num_samples = np.size(samples, 0)
    num_nodes = np.size(samples, 1)

    loglikelihood = []
    for iteration in range(max_num_iter):
        print("==================== " + str(iteration) +
              "-th iteration ====================")

        # Step 1: Compute the responsibilities.
        r = np.ones((num_samples, num_clusters))
        for n, x in enumerate(samples):
            for k, t in enumerate(tm.clusters):
                r[n, k] *= tm.pi[k]
                # Breadth-first walk over the tree, multiplying in the
                # probability of each node's observed value.
                pending = deque([t.root])
                while pending:
                    cur_node = pending.popleft()
                    pending.extend(cur_node.descendants)
                    if cur_node.ancestor is None:
                        r[n, k] *= cur_node.cat[x[int(cur_node.name)]]
                    else:
                        r[n, k] *= cur_node.cat[x[int(
                            cur_node.ancestor.name)]][x[int(cur_node.name)]]

        r += epsilon
        marginal = np.sum(r, axis=1, keepdims=True)
        loglikelihood.append(np.sum(np.log(marginal)))
        r /= marginal

        # Step 2: Update categorical distribution over clusters.
        tm.pi = np.mean(r, axis=0)

        # Step 3: Construct directed graphs.
        denom = np.sum(r, axis=0)
        # Pairwise joint q[s, t, a, b, k]: responsibility-weighted frequency
        # of (x_s = a, x_t = b) under cluster k.
        q = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        match = (samples[:, s] == a) & (samples[:, t] == b)
                        q[s, t, a, b] = np.sum(r[match], axis=0) / denom
        q += epsilon

        # Single-node marginal q_s[s, a, k].
        q_s = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                q_s[s, a] = np.sum(r[samples[:, s] == a], axis=0) / denom
        q_s += epsilon

        # Mutual information between every pair of nodes, per cluster.
        mutual_info = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        mutual_info[s, t] += q[s, t, a, b] * np.log(
                            q[s, t, a, b] / q_s[s, a] / q_s[t, b])

        clusters = []
        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, mutual_info[s, t, k])

            # Step 4: Construct maximum spanning trees.
            # Only the two endpoint columns of each returned edge are kept.
            edges = np.array(g.maximum_spanning_tree())[:, 0:2]
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan  # node 0 is the root: no parent
            frontier = deque([0])
            while frontier:
                cur_node = frontier.popleft()
                # (row, column) positions where cur_node is an endpoint.
                hits = np.transpose(np.stack(np.where(edges == cur_node)))
                for row, col in hits:
                    child = edges[row, 1 - col]
                    topology_array[int(child)] = cur_node
                    frontier.append(int(child))
                if hits.size != 0:
                    # Drop consumed edges so they are not walked back up.
                    edges = np.delete(edges, hits[:, 0], axis=0)

            tree = Tree()
            tree.load_tree_from_direct_arrays(topology_array)
            tree.k = 2
            tree.alpha = [1.0] * 2

            # Step 5: Update CPDs.
            pending = deque([tree.root])
            while pending:
                cur_node = pending.popleft()
                pending.extend(cur_node.descendants)
                if cur_node.ancestor is None:
                    cur_node.cat = q_s[int(cur_node.name), :, k].tolist()
                else:
                    joint = q[int(cur_node.ancestor.name),
                              int(cur_node.name), :, :, k]
                    # Step 1 indexes cat[parent_value][child_value] as a
                    # conditional probability, so each parent row of the
                    # joint table has to be normalised to sum to one.
                    row_sums = joint.sum(axis=1, keepdims=True)
                    cond = np.full(joint.shape, 1.0 / joint.shape[1])
                    np.divide(joint, row_sums, out=cond, where=row_sums > 0)
                    cur_node.cat = [cond[0], cond[1]]

            clusters.append(tree)
        tm.clusters = clusters

    return loglikelihood, tm
Example #2
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """
    Fit a mixture of binary tree graphical models with the EM algorithm.

    A ``TreeMixture`` is initialised from ``seed_val`` and then refined for
    ``max_num_iter`` iterations: responsibilities are computed, the mixing
    weights are updated, a maximum spanning tree is built per cluster from the
    responsibility-weighted mutual information, and the node CPDs are reset.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration. Type: numpy array.
                Dimensions: (num_iterations, )
    :return: topology_list: Tree topologies of the final mixture. Type: numpy array.
                Dimensions: (num_clusters, num_nodes)
    :return: theta_list: Tree CPDs of the final mixture, as returned by
                ``get_theta_array`` of each tree. Type: numpy array.
    """

    # Set the seed
    np.random.seed(seed_val)

    print("Running EM algorithm...")

    from Kruskal_v1 import Graph
    import sys
    from collections import deque

    tm = TreeMixture(num_clusters=num_clusters, num_nodes=samples.shape[1])
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seed_val)
    # Smallest positive normal float: keeps log() finite without visibly
    # changing the responsibilities.
    eps = sys.float_info.min
    loglikelihood = []
    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]
    for _ in range(max_num_iter):
        # E-step: responsibilities r[i, j] of cluster j for sample i.
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, t in enumerate(tm.clusters):
                r[i, j] *= tm.pi[j]
                pending = deque([t.root])
                while pending:
                    node = pending.popleft()
                    pending.extend(node.descendants)
                    if node.ancestor is None:  # root node
                        r[i, j] *= node.cat[sample[int(node.name)]]
                    else:
                        r[i, j] *= node.cat[sample[int(
                            node.ancestor.name)]][sample[int(node.name)]]

        r += eps
        rn = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= rn
        loglikelihood.append(np.sum(np.log(rn)))

        # M-step: mixing weights.
        tm.pi = np.sum(r, axis=0) / num_samples
        den = np.sum(r, axis=0)

        # Pairwise joint q[s, t, a, b, k] of (x_s = a, x_t = b) per cluster.
        joint_q = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        match = (samples[:, s] == a) & (samples[:, t] == b)
                        joint_q[s, t, a, b] = np.sum(r[match], axis=0) / den

        # Single-node marginal q[s, a, k].
        marginal_q = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                marginal_q[s, a] = np.sum(r[samples[:, s] == a], axis=0) / den

        # Mutual information per node pair and cluster. A zero joint entry
        # contributes nothing (0 * log 0 := 0); the mask is applied per
        # cluster so a zero in one cluster does not drop the term in others.
        mutual_info = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        p_ab = joint_q[s, t, a, b]
                        positive = p_ab > 0
                        ratio = np.ones(num_clusters)
                        # Two successive divisions (as opposed to dividing by
                        # the product) avoid underflow of the denominator.
                        np.divide(p_ab, marginal_q[s, a], out=ratio,
                                  where=positive)
                        np.divide(ratio, marginal_q[t, b], out=ratio,
                                  where=positive)
                        mutual_info[s, t] += p_ab * np.log(ratio)

        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, mutual_info[s, t, k])

            # Keep only the two endpoint columns of each spanning-tree edge.
            mst_edges = np.array(g.maximum_spanning_tree())[:, [0, 1]]
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan  # node 0 is the root: no parent
            frontier = deque([0])
            while frontier:
                parent = frontier.popleft()
                # (row, column) positions where `parent` is an endpoint.
                hits = np.array(np.where(mst_edges == [parent])).T
                for row, col in hits:
                    child = int(mst_edges[row][1 - col])
                    topology_array[child] = parent
                    frontier.append(child)
                if np.size(hits) != 0:
                    # Drop consumed edges so they are not walked back up.
                    mst_edges = np.delete(mst_edges, hits[:, 0], 0)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2

            pending = deque([new_tree.root])
            while pending:
                node = pending.popleft()
                pending.extend(node.descendants)

                if node.ancestor is None:
                    node.cat = marginal_q[int(node.name), :, k].tolist()
                else:
                    joint = joint_q[int(node.ancestor.name),
                                    int(node.name), :, :, k]
                    # Normalise each parent row into a conditional
                    # distribution on a fresh array (the slice above is a
                    # view into joint_q). A row with no mass falls back to
                    # uniform instead of producing NaN.
                    row_sums = joint.sum(axis=1, keepdims=True)
                    cond = np.full(joint.shape, 1.0 / joint.shape[1])
                    np.divide(joint, row_sums, out=cond, where=row_sums > 0)
                    node.cat = [cond[0], cond[1]]

            tm.clusters[k] = new_tree

    # Report the trees of the final mixture only: one entry per cluster.
    topology_list = []
    theta_list = []
    for t in tm.clusters:
        topology_list.append(t.get_topology_array())
        theta_list.append(t.get_theta_array())
    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    return loglikelihood, topology_list, theta_list