def do_experiment(num_clusters, num_nodes):
    """Simulate a tree mixture, fit it with EM and plot the likelihood trace.

    A ground-truth ``TreeMixture`` with the requested size is simulated and
    2000 samples are drawn from it. An ``EM_Algorithm`` is then initialised
    and optimised on those samples, and the likelihood of the samples under
    the true and the learned mixture is printed and shown in the plot title.

    Note: ``seed_val`` is not a parameter; it is read from the enclosing
    (module) scope.

    Parameters:
        num_clusters -- Number of clusters of the simulated/learned mixture
        num_nodes -- Number of nodes per tree
    """
    # Ground truth: simulate the mixture and draw the training data from it.
    truth = TreeMixture(num_clusters, num_nodes)
    truth.simulate_pi(seed_val)
    truth.simulate_trees(seed_val)
    truth.sample_mixtures(2000, seed_val=seed_val)
    data = truth.samples

    # Fit a mixture of the same size to the sampled data.
    learner = EM_Algorithm(data, num_clusters, seed_val=seed_val)
    learner.initialize(1, 2)
    trace, _topologies, _thetas = learner.optimize(max_num_iter=100)

    # Likelihood of the data under the true and the learned parameters.
    real_text = str(truth.likelihood_dataset(data))
    learned_text = str(learner.tree_mixture.likelihood_dataset(data))

    print("Real likelyhood: " + real_text + ", learned " + learned_text)

    curve_label = str(num_clusters) + ' clusters, ' + str(num_nodes) + ' nodes'
    plt.title('Actual likelihood: ' + real_text + ', inferred likelyhood: ' +
              learned_text)
    plt.plot(range(len(trace)), trace, label=curve_label)
    plt.legend()
    plt.show()
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """
    Template entry point for the EM algorithm (placeholder logic only).

    The body below is the example segment of the template: it does not fit
    anything to ``samples``. The returned log-likelihood is the dummy curve
    log((i + 1) / max_num_iter), and the topologies/CPDs are read from a
    freshly simulated TreeMixture.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration. Type: numpy array.
                Dimensions: (num_iterations, ) Note: num_iterations does not have to be equal to max_num_iter.
    :return: topology_list: A list of tree topologies. Type: numpy array. Dimensions: (num_clusters, num_nodes)
    :return: theta_list: A list of tree CPDs. Type: numpy array. Dimensions: (num_clusters, num_nodes, 2)

    The signature may be extended with new parameters as long as they have
    default values, e.g.
    def em_algorithm(seed_val, samples, k, max_num_iter=10, new_param_1=[], new_param_2=123):
    """

    # Set the seed
    np.random.seed(seed_val)

    # TODO: Implement EM algorithm here.

    # Start: Example Code Segment. Delete this segment completely before you implement the algorithm.
    print("Running EM algorithm...")

    # Dummy, monotonically increasing log-likelihood curve.
    dummy_trace = [
        np.log((1 + step) / max_num_iter) for step in range(max_num_iter)
    ]

    from Tree import TreeMixture

    mixture = TreeMixture(num_clusters=num_clusters,
                          num_nodes=samples.shape[1])
    mixture.simulate_pi(seed_val=seed_val)
    mixture.simulate_trees(seed_val=seed_val)
    mixture.sample_mixtures(num_samples=samples.shape[0], seed_val=seed_val)

    # One (topology, theta) pair per cluster, read in cluster order.
    per_cluster = [(mixture.clusters[idx].get_topology_array(),
                    mixture.clusters[idx].get_theta_array())
                   for idx in range(num_clusters)]

    loglikelihood = np.array(dummy_trace)
    topology_list = np.array([pair[0] for pair in per_cluster])
    theta_list = np.array([pair[1] for pair in per_cluster])
    # End: Example Code Segment

    ###

    return loglikelihood, topology_list, theta_list
    def initialize(self, sieving_tries=100, sieving_train=10):
        """Initializes the TreeMixtures using sieving.

        Several candidate mixtures are simulated, each is trained for a few
        iterations, and the candidate whose last likelihood is highest is
        stored in ``self.tree_mixture``.

        Parameters:
            sieving_tries -- Number of random initializations
            sieving_train -- Number of iterations for each initialization

        Note: ``self.tree_mixture`` is set to None when no candidate beats
        -inf (``sieving_tries`` is 0 or every final likelihood is NaN).
        """
        print("Initializing EM ...")
        best_tree_mix = None
        best_likelihood = -float('inf')
        for t in tqdm(range(sieving_tries)):
            # Use a different seed for every try. Passing the same
            # self.seed_val each time (as before) presumably makes every
            # candidate identical, which defeats sieving -- TODO confirm
            # against TreeMixture.simulate_*. Try 0 keeps the original seed.
            try_seed = None if self.seed_val is None else self.seed_val + t

            tree_mix = TreeMixture(self.num_clusters, self.num_nodes)
            tree_mix.simulate_trees(try_seed)
            tree_mix.simulate_pi(try_seed)

            tm, likelihoods = self.__iter_optimize(sieving_train, tree_mix=tree_mix, display_progress=False)
            if likelihoods[-1] > best_likelihood:
                best_likelihood = likelihoods[-1]
                best_tree_mix = tm
        self.tree_mixture = best_tree_mix
Example #4
0
def _em_fit_cluster_tree(samples, resp_k, V):
    """Build the M-step tree of one cluster from its responsibilities.

    :param samples: Observed binary values, used as 0/1 indices. Dimensions: (num_samples, V)
    :param resp_k: Responsibility of this cluster for every sample. Dimensions: (num_samples, )
    :param V: Number of vertices (columns of samples)
    :return: A new Tree loaded from the estimated topology and CPD arrays.
    """
    N = len(samples)

    # STEP 3a: responsibility-weighted pair counts, Xs x Xt x (0 or 1) x (0 or 1)
    Nstab = np.zeros(shape=(V, V, 2, 2))
    for Xs in range(V):
        for Xt in range(V):
            if Xs == Xt:
                continue
            for n in range(N):
                Nstab[Xs, Xt, samples[n][Xs], samples[n][Xt]] += resp_k[n]

    # Joint q(Xs=a, Xt=b). The denominator does not depend on the vertex
    # pair, so it is computed once. The diagonal is never filled nor read.
    denom = sum(resp_k)
    Qstab = Nstab / denom

    # STEP 3b: single-vertex marginals q(Xs=a)
    Nsa = np.zeros(shape=(V, 2))
    for Xs in range(V):
        for n in range(N):
            Nsa[Xs, samples[n][Xs]] += resp_k[n]
    Qsa = Nsa / Nsa.sum(axis=1, keepdims=True)

    # STEP 3c: mutual information between every vertex pair
    Info = np.zeros(shape=(V, V))
    for Xs in range(V):
        for Xt in range(V):
            if Xs == Xt:
                continue
            for a in range(2):
                for b in range(2):
                    qab = Qstab[Xs, Xt, a, b]
                    # Zero-probability terms contribute nothing. Testing qab
                    # directly avoids the 0/0 (NaN weight) that arises when
                    # a marginal is zero as well.
                    if qab > 0:
                        Info[Xs, Xt] += qab * log(
                            qab / (Qsa[Xs, a] * Qsa[Xt, b]))

    # STEP 3d: conditionals p(Xt=b | Xs=a), needed for the CPDs in step 5
    # NOTE(review): a value a that never occurs with positive weight still
    # yields 0/0 here, exactly as before -- decide on a fallback if needed.
    Qcond_stab = np.zeros(shape=(V, V, 2, 2))
    for Xs in range(V):
        for Xt in range(V):
            if Xs == Xt:
                continue
            for a in range(2):
                row_total = sum(Nstab[Xs, Xt, a, :])
                for b in range(2):
                    Qcond_stab[Xs, Xt, a, b] = Nstab[Xs, Xt, a, b] / row_total

    # STEP 4: maximum spanning tree of the mutual-information graph
    g = Graph(V)
    for Xs in range(V):
        for Xt in range(V):
            if Xs == Xt:
                continue
            g.addEdge(Xs, Xt, Info[Xs, Xt])
    mst = sorted(g.maximum_spanning_tree(), key=lambda x: x[0])

    # STEP 5: root the tree at vertex 0 and read off parents and CPDs
    topology_array = np.array([np.nan for _ in range(V)])
    theta_array = np.array([None for _ in range(V)])  # object array of CPDs
    root = 0
    theta_array[0] = Qsa[root, :]

    adjacency = {}
    for u, v, _w in mst:
        adjacency.setdefault(u, []).append(v)
        adjacency.setdefault(v, []).append(u)

    # Iterative traversal from the root: in a tree every vertex has exactly
    # one parent, so the result does not depend on the visiting order.
    visited = {root}
    stack = [root]
    while stack:
        curr = stack.pop()
        for child in adjacency.get(curr, []):
            if child in visited:
                continue
            visited.add(child)
            theta_array[child] = Qcond_stab[curr, child]
            topology_array[child] = curr
            stack.append(child)

    new_tree = Tree()
    new_tree.load_tree_from_direct_arrays(topology_array, theta_array)
    return new_tree


def em_algorithm(seed_val, samples, num_clusters, max_num_iter=10, tm=None):
    """
    Fit a mixture of trees to the samples with the EM algorithm.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values (0/1, used as array indices). Type: numpy array.
                Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations; all of them are run (no early stopping). Type: int
    :param tm: Optional initial mixture. When None, a TreeMixture is simulated from seed_val.
                The mixture is updated in place.
    :return: loglikelihood_list: Value of tm_likelihood after each EM iteration. Type: numpy array.
                Dimensions: (max_num_iter, )
    :return: tm: The fitted mixture (pi and clusters replaced by the estimates).
    """
    # Set the seed
    np.random.seed(seed_val)

    N = len(samples)
    K = num_clusters
    V = samples.shape[1]
    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=V)
        tm.simulate_pi(seed_val=seed_val)
        tm.simulate_trees(seed_val=seed_val)
    log_hoods = []

    for iteration in range(max_num_iter):

        # STEP 1: responsibilities r[n, k] proportional to pi[k] * p(x_n | tree k)
        R = np.zeros(shape=(N, K))
        for n in range(N):
            for k in range(K):
                hood = tree_sample_likelihood(tm.clusters[k], samples[n])
                R[n, k] = tm.pi[k] * hood
        R = normalize(R, axis=1, norm='l1')

        # STEP 2: new mixing weights are the mean responsibilities
        new_pi = np.zeros(shape=(K))
        for k in range(K):
            new_pi[k] = sum(R[:, k]) / N
        tm.pi = new_pi

        # STEPS 3-5: re-estimate the tree of every cluster
        for k in range(K):
            tm.clusters[k] = _em_fit_cluster_tree(samples, R[:, k], V)

        log_hoods.append(tm_likelihood(tm, samples, N, num_clusters))

    loglikelihood_list = np.array(log_hoods)

    return loglikelihood_list, tm
Example #5
0
def case3():
    """Simulate a 2-cluster, 5-node tree mixture, draw 30 samples and save it.

    The mixing weights are simulated without an explicit seed; trees and
    samples use seed 123. The mixture is written to "q2_4/case3.pkl".
    """
    seed = 123
    mixture = TreeMixture(2, 5)
    mixture.simulate_pi()
    mixture.simulate_trees(seed_val=seed)
    mixture.sample_mixtures(30, seed_val=seed)
    mixture.save_mixture("q2_4/case3.pkl")
Example #6
0
def case1():
    """Simulate a 7-cluster, 4-node tree mixture, draw 100 samples and save it.

    The mixing weights are simulated without an explicit seed; trees and
    samples use seed 123. The mixture is written to "q2_4/case1.pkl".
    """
    seed = 123
    mixture = TreeMixture(7, 4)
    mixture.simulate_pi()
    mixture.simulate_trees(seed_val=seed)
    mixture.sample_mixtures(100, seed_val=seed)
    mixture.save_mixture("q2_4/case1.pkl")
Example #7
0
def main():
    """Run sieving EM on a hard-coded dataset, plot it and compare to the truth.

    Loads the samples of the selected case, fits a mixture with ``sieving``,
    plots the (log-)likelihood per iteration and, when a file with the real
    mixture is configured, prints the Robinson-Foulds distance between every
    real/inferred tree pair and the likelihood of the real mixture.
    """
    print("Hello World!")

    seed_val = 123

    # Dataset selection. Other available pairs (samples / real mixture):
    #   "q2_4/q2_4_tree_mixture.pkl_samples.txt" / "q2_4/q2_4_tree_mixture.pkl"
    #   "q2_4/case1.pkl_samples.txt"             / "q2_4/case1.pkl"
    #   "q2_4/case2.pkl_samples.txt"             / "q2_4/case2.pkl"
    sample_filename = "q2_4/case3.pkl_samples.txt"
    real_values_filename = "q2_4/case3.pkl"

    num_clusters = 2  # must be changed to match the selected case

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)

    loglikelihood, my_tm = sieving(seed_val,
                                   samples,
                                   num_clusters=num_clusters)

    # Left panel: likelihood, right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    # Nothing to compare against when no real mixture is configured.
    if real_values_filename == "":
        return

    real = TreeMixture(0, 0)
    real.load_mixture(real_values_filename)

    print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
    tns = dendropy.TaxonNamespace()

    real_trees = [cluster.newick for cluster in real.clusters]
    my_trees = [cluster.newick for cluster in my_tm.clusters]
    print(my_trees)

    # Distance of every real tree to every inferred tree (LaTeX table rows).
    for i, real_tree in enumerate(real_trees):
        real_den = dendropy.Tree.get(data=real_tree,
                                     schema="newick",
                                     taxon_namespace=tns)
        for j, my_tree in enumerate(my_trees):
            my_den = dendropy.Tree.get(data=my_tree,
                                       schema="newick",
                                       taxon_namespace=tns)
            rf_distance = dendropy.calculate.treecompare.symmetric_difference(
                real_den, my_den)
            print("RF distance: $<", i, j, ">$\t=", rf_distance, "\\\\")

    print("4.2. Make the likelihood comparison.\n")
    real_log_hood = tm_likelihood(real, samples, len(samples), num_clusters)
    print("Real: ", real_log_hood)
    print("Infered: ", loglikelihood)
def _em_run_from_seed(seedno, samples, num_clusters, max_num_iter):
    """Run max_num_iter EM iterations starting from the mixture simulated with seedno.

    :param seedno: Seed used for numpy and for simulating the initial TreeMixture
    :param samples: Observed x values. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters
    :param max_num_iter: Number of EM iterations (all are run)
    :return: (loglikelihood, topology_list, theta_list) after the last iteration
    """
    num_nodes = samples.shape[1]

    np.random.seed(seedno)
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seedno)
    tm.simulate_trees(seedno)
    tm.sample_mixtures(num_samples=samples.shape[0], seed_val=seedno)

    topology_list = []
    theta_list = []
    for k in range(num_clusters):
        topology_list.append(tm.clusters[k].get_topology_array())
        theta_list.append(tm.clusters[k].get_theta_array())
    # Both arrays are updated in place by the iterations below.
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)

    loglikelihood = np.zeros(max_num_iter)
    pi = tm.pi

    for it in range(max_num_iter):
        # 1: compute responsibilities
        resp = responsibilities(num_clusters, samples, theta_list,
                                topology_list, pi)

        # 2: set pi'[k] = sum_n r[n, k] / sum(r)
        pi = np.zeros(num_clusters)
        pi_newdenom = np.sum(resp)
        for k in range(num_clusters):
            pi[k] = np.sum(resp[:, k]) / pi_newdenom

        # 3: quantities for the mutual information between x[s] and x[t]
        N_ind1, q_denom1 = q_parts1(num_clusters, samples, resp)
        N_ind0, q_denom0 = q_parts0(num_clusters, samples, resp)

        # 4: set Tau'[k] as maximum spanning tree in G[k]. One graph is
        # built per cluster, so any number of clusters is supported (the
        # previous fixed lists of four graphs failed for more than four).
        weights = np.zeros((num_clusters, num_nodes, num_nodes))
        for k in range(num_clusters):
            graph = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(num_nodes):
                    weights[k, s, t] = I_Info(k, s, t, N_ind1, N_ind0,
                                              q_denom0, q_denom1,
                                              num_clusters, samples)
                    graph.addEdge(s, t, weights[k, s, t])
            mst_edges = graph.maximum_spanning_tree()

            # networkx converts the spanning tree into a topology: a BFS
            # from vertex 0 yields (parent, child) pairs.
            tree_graph = nx.Graph()
            for u_of_edge, v_of_edge, _weight in mst_edges:
                tree_graph.add_edge(u_of_edge=u_of_edge, v_of_edge=v_of_edge)
            for parent, child in nx.bfs_edges(G=tree_graph, source=0):
                topology_list[k][child] = parent

        # 5: set Theta'[k]: marginal for the root (vertex 0), conditionals
        # given the new parent for every other vertex
        for k in range(num_clusters):
            theta_list[k][0][:] = [
                q0(k, 0, 0, N_ind0, q_denom0),
                q0(k, 0, 1, N_ind0, q_denom0)
            ]
            for s in range(1, num_nodes):
                for a in range(0, 2):
                    for b in range(0, 2):
                        theta_list[k][s][a][b] = q_parts1cond(
                            s, int(topology_list[k][s]), a, b, samples,
                            resp[:, k])

        # 6: calculate log-likelihood
        loglikelihood[it] = log_likelihood(num_clusters, samples, theta_list,
                                           topology_list, pi)

    return loglikelihood, topology_list, theta_list


def em_algorithm(seed_val,
                 samples,
                 num_clusters,
                 max_num_iter=30,
                 num_seeds=100,
                 score_iter=25):
    """
    EM for a mixture of trees with seed sieving.

    EM is run once for every seed in range(num_seeds); the seed with the
    highest log-likelihood at iteration ``score_iter`` is then used for a
    final run whose results are returned.

    :param seed_val: Unused; kept for interface compatibility (the seeds tried are 0..num_seeds-1).
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations per run; must be at least 1. Type: int
    :param num_seeds: Number of seeds to try. Type: int
    :param score_iter: Iteration index whose log-likelihood ranks the seeds. It is clamped to the
                last iteration, so runs shorter than score_iter + 1 iterations no longer fail. Type: int
    :return: loglikelihood: Log-likelihood of each EM iteration of the final run. Type: numpy array.
                Dimensions: (max_num_iter, )
    :return: topology_list: Tree topologies of the final run. Type: numpy array.
    :return: theta_list: Tree CPDs of the final run. Type: numpy array.
    """
    # -inf instead of a large negative constant, so any finite score wins.
    best_likelihood = float('-inf')
    best_seed = 0
    score_idx = min(score_iter, max_num_iter - 1)

    for seedno in range(num_seeds):
        loglikelihood, _, _ = _em_run_from_seed(seedno, samples, num_clusters,
                                                max_num_iter)
        if best_likelihood < loglikelihood[score_idx]:
            print(best_likelihood, ">", loglikelihood[score_idx])
            best_likelihood = loglikelihood[score_idx]
            best_seed = seedno

    print("seed val = ", best_seed)

    # Repeat the algorithm with the best seed found.
    loglikelihood, topology_list, theta_list = _em_run_from_seed(
        best_seed, samples, num_clusters, max_num_iter)

    print("topology_list = ", topology_list)
    print(loglikelihood)
    return loglikelihood, np.array(topology_list), theta_list
Example #9
0
def main():
    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        'sample_filename',
        type=str,
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        'output_filename',
        type=str,
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('num_clusters',
                        type=int,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")

    args = parser.parse_args()
    print("\tArguments are: ", args)

    print("\n1. Load samples from txt file.\n")

    samples = np.loadtxt(args.sample_filename, delimiter="\t", dtype=np.int32)

    sample_filename = args.sample_filename

    customData = False
    if sample_filename == "data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt":

        pi_file = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_20node_20sample_4clusters.pkl_tree_3_theta.npy'

    elif sample_filename == "data/q_2_5_tm_10node_50sample_4clusters.pkl_samples.txt":

        pi_file = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_10node_50sample_4clusters.pkl_tree_3_theta.npy'

    elif sample_filename == "data/q_2_5_tm_10node_20sample_4clusters.pkl_samples.txt":

        pi_file = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_pi.npy'
        tree0 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_0_topology.npy'
        tree1 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_1_topology.npy'
        tree2 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_2_topology.npy'
        tree3 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_3_topology.npy'
        theta0 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_0_theta.npy'
        theta1 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_1_theta.npy'
        theta2 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_2_theta.npy'
        theta3 = 'data/q_2_5_tm_10node_20sample_4clusters.pkl_tree_3_theta.npy'

    else:
        print(
            'Testing with Custom File. Please provide true mixture for likelihood and RF Comparisiom'
        )
        customData = True

    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    sieving = np.random.randint(20, 999, size=10)
    good_seed = sieving[0]
    bestFit = -np.Infinity
    for sieve in sieving:
        loglikelihood, topology_array, theta_array = em_algorithm(
            sieve, samples, args.num_clusters, 10)
        # print(loglikelihood)
        thisFit = loglikelihood[-1]
        if (thisFit > bestFit):
            bestFit = thisFit
            good_seed = sieve
    loglikelihood, topology_array, theta_array = em_algorithm(
        good_seed, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    num_clusters = args.num_clusters
    save_results(loglikelihood, topology_array, theta_array,
                 args.output_filename)

    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])

    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")

    plt.subplot(122)
    plt.title(sample_filename)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "" or True:
        if customData == False:
            print("\tComparing the results with real values...")

            print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
            # TODO: Do RF Comparison
            t0 = Tree()
            t1 = Tree()
            t2 = Tree()
            t3 = Tree()
            true_pi = np.load(pi_file)
            t0.load_tree_from_arrays(tree0, theta0)
            t1.load_tree_from_arrays(tree1, theta1)
            t2.load_tree_from_arrays(tree2, theta2)
            t3.load_tree_from_arrays(tree3, theta3)
            tns = dendropy.TaxonNamespace()
            t0_rf = dendropy.Tree.get(data=t0.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t1_rf = dendropy.Tree.get(data=t1.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t2_rf = dendropy.Tree.get(data=t2.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t3_rf = dendropy.Tree.get(data=t3.get_tree_newick(),
                                      schema="newick",
                                      taxon_namespace=tns)
            t0_infer = Tree()
            t1_infer = Tree()
            t2_infer = Tree()
            t3_infer = Tree()
            t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                  theta_array[0])
            t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                  theta_array[1])
            t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                  theta_array[2])
            t3_infer.load_tree_from_direct_arrays(topology_array[3],
                                                  theta_array[3])
            t0_infer_rf = dendropy.Tree.get(data=t0_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t1_infer_rf = dendropy.Tree.get(data=t1_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t2_infer_rf = dendropy.Tree.get(data=t2_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            t3_infer_rf = dendropy.Tree.get(data=t3_infer.get_tree_newick(),
                                            schema="newick",
                                            taxon_namespace=tns)
            print('File:', sample_filename)
            print('------Robinson-Foulds Distance------')
            rfTree0 = [
                RfDist(t0_infer_rf, t0_rf),
                RfDist(t0_infer_rf, t1_rf),
                RfDist(t0_infer_rf, t2_rf),
                RfDist(t0_infer_rf, t3_rf)
            ]
            rfTree1 = [
                RfDist(t1_infer_rf, t0_rf),
                RfDist(t1_infer_rf, t1_rf),
                RfDist(t1_infer_rf, t2_rf),
                RfDist(t1_infer_rf, t3_rf)
            ]
            rfTree2 = [
                RfDist(t2_infer_rf, t0_rf),
                RfDist(t2_infer_rf, t1_rf),
                RfDist(t2_infer_rf, t2_rf),
                RfDist(t2_infer_rf, t3_rf)
            ]
            rfTree3 = [
                RfDist(t3_infer_rf, t0_rf),
                RfDist(t3_infer_rf, t1_rf),
                RfDist(t3_infer_rf, t2_rf),
                RfDist(t3_infer_rf, t3_rf)
            ]
            print('------Real Trees------')
            print(t0.get_tree_newick())
            print(t1.get_tree_newick())
            print(t2.get_tree_newick())
            print(t3.get_tree_newick())
            print('------Inferred Trees------')
            print(t0_infer.get_tree_newick())
            print(t1_infer.get_tree_newick())
            print(t2_infer.get_tree_newick())
            print(t3_infer.get_tree_newick())
            print()
            print('RF Distance of Inferred Tree 0 with each Tree(true):',
                  rfTree0)
            print('RF Distance of Inferred Tree 1 with each Tree(true):',
                  rfTree1)
            print('RF Distance of Inferred Tree 2 with each Tree(true):',
                  rfTree2)
            print('RF Distance of Inferred Tree 3 with each Tree(true):',
                  rfTree3)
            print('------Robinson-Foulds Distance------')
            print("\t4.2. Make the likelihood comparison.\n")

            # TODO: Do Likelihood Comparison

            print('Log Likelihood of real mixture: ' + str(
                truelikelihood([t0, t1, t2, t3], samples, num_samples,
                               num_clusters, true_pi)))
            print('Log Likelihood of inferred mixture: ' +
                  str(loglikelihood[-1]))
        else:
            print('Testing with Custom Data')
            tns = dendropy.TaxonNamespace()
            if sample_filename == "newData/Data1/Dataset1.pkl_samples.txt":
                num_clusters = 5
                num_nodes = 30
                tm = TreeMixture(num_clusters, num_nodes)
                seed_val = 12
                tm.simulate_pi(seed_val=seed_val)
                tm.simulate_trees(seed_val)
                seed_val = 12
                num_samples = 20
                tm.sample_mixtures(num_samples, seed_val=seed_val)
                t0_rf = dendropy.Tree.get(
                    data=tm.clusters[0].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_rf = dendropy.Tree.get(
                    data=tm.clusters[1].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_rf = dendropy.Tree.get(
                    data=tm.clusters[2].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t3_rf = dendropy.Tree.get(
                    data=tm.clusters[3].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t4_rf = dendropy.Tree.get(
                    data=tm.clusters[4].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t0_infer = Tree()
                t1_infer = Tree()
                t2_infer = Tree()
                t3_infer = Tree()
                t4_infer = Tree()
                t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                      theta_array[0])
                t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                      theta_array[1])
                t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                      theta_array[2])
                t3_infer.load_tree_from_direct_arrays(topology_array[3],
                                                      theta_array[3])
                t4_infer.load_tree_from_direct_arrays(topology_array[4],
                                                      theta_array[4])
                t0_infer_rf = dendropy.Tree.get(
                    data=t0_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_infer_rf = dendropy.Tree.get(
                    data=t1_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_infer_rf = dendropy.Tree.get(
                    data=t2_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t3_infer_rf = dendropy.Tree.get(
                    data=t3_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t4_infer_rf = dendropy.Tree.get(
                    data=t4_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                print('File:', sample_filename)
                print('------Robinson-Foulds Distance------')
                rfTree0 = [
                    RfDist(t0_infer_rf, t0_rf),
                    RfDist(t0_infer_rf, t1_rf),
                    RfDist(t0_infer_rf, t2_rf),
                    RfDist(t0_infer_rf, t3_rf)
                ]
                rfTree1 = [
                    RfDist(t1_infer_rf, t0_rf),
                    RfDist(t1_infer_rf, t1_rf),
                    RfDist(t1_infer_rf, t2_rf),
                    RfDist(t1_infer_rf, t3_rf)
                ]
                rfTree2 = [
                    RfDist(t2_infer_rf, t0_rf),
                    RfDist(t2_infer_rf, t1_rf),
                    RfDist(t2_infer_rf, t2_rf),
                    RfDist(t2_infer_rf, t3_rf)
                ]
                rfTree3 = [
                    RfDist(t3_infer_rf, t0_rf),
                    RfDist(t3_infer_rf, t1_rf),
                    RfDist(t3_infer_rf, t2_rf),
                    RfDist(t3_infer_rf, t3_rf)
                ]
                rfTree4 = [
                    RfDist(t4_infer_rf, t0_rf),
                    RfDist(t4_infer_rf, t1_rf),
                    RfDist(t4_infer_rf, t2_rf),
                    RfDist(t4_infer_rf, t4_rf)
                ]
                print('------Real Trees------')
                print(tm.clusters[0].get_tree_newick())
                print(tm.clusters[1].get_tree_newick())
                print(tm.clusters[2].get_tree_newick())
                print(tm.clusters[3].get_tree_newick())
                print(tm.clusters[4].get_tree_newick())
                print('------Inferred Trees------')
                print(t0_infer.get_tree_newick())
                print(t1_infer.get_tree_newick())
                print(t2_infer.get_tree_newick())
                print(t3_infer.get_tree_newick())
                print(t4_infer.get_tree_newick())
                print()
                print('RF Distance of Inferred Tree 0 with each Tree(true):',
                      rfTree0)
                print('RF Distance of Inferred Tree 1 with each Tree(true):',
                      rfTree1)
                print('RF Distance of Inferred Tree 2 with each Tree(true):',
                      rfTree2)
                print('RF Distance of Inferred Tree 3 with each Tree(true):',
                      rfTree3)
                print('RF Distance of Inferred Tree 4 with each Tree(true):',
                      rfTree4)
                print('------Robinson-Foulds Distance------')
                print("\t4.2. Make the likelihood comparison.\n")
                print('Log Likelihood of real mixture: ' + str(
                    truelikelihood([
                        tm.clusters[0], tm.clusters[1], tm.clusters[2],
                        tm.clusters[3], tm.clusters[4]
                    ], samples, num_samples, num_clusters, tm.pi)))
                print('Log Likelihood of inferred mixture: ' +
                      str(loglikelihood[-1]))
            elif sample_filename == "newData/Data2/Dataset2.pkl_samples.txt":
                num_clusters = 3
                num_nodes = 50
                tm = TreeMixture(num_clusters, num_nodes)
                seed_val = 123
                tm.simulate_pi(seed_val=seed_val)
                tm.simulate_trees(seed_val)
                seed_val = 12
                num_samples = 100
                tm.sample_mixtures(num_samples, seed_val=seed_val)
                t0_rf = dendropy.Tree.get(
                    data=tm.clusters[0].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_rf = dendropy.Tree.get(
                    data=tm.clusters[1].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_rf = dendropy.Tree.get(
                    data=tm.clusters[2].get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t0_infer = Tree()
                t1_infer = Tree()
                t2_infer = Tree()
                t0_infer.load_tree_from_direct_arrays(topology_array[0],
                                                      theta_array[0])
                t1_infer.load_tree_from_direct_arrays(topology_array[1],
                                                      theta_array[1])
                t2_infer.load_tree_from_direct_arrays(topology_array[2],
                                                      theta_array[2])
                t0_infer_rf = dendropy.Tree.get(
                    data=t0_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t1_infer_rf = dendropy.Tree.get(
                    data=t1_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                t2_infer_rf = dendropy.Tree.get(
                    data=t2_infer.get_tree_newick(),
                    schema="newick",
                    taxon_namespace=tns)
                print('File:', sample_filename)
                print('------Robinson-Foulds Distance------')
                rfTree0 = [
                    RfDist(t0_infer_rf, t0_rf),
                    RfDist(t0_infer_rf, t1_rf),
                    RfDist(t0_infer_rf, t2_rf)
                ]
                rfTree1 = [
                    RfDist(t1_infer_rf, t0_rf),
                    RfDist(t1_infer_rf, t1_rf),
                    RfDist(t1_infer_rf, t2_rf)
                ]
                rfTree2 = [
                    RfDist(t2_infer_rf, t0_rf),
                    RfDist(t2_infer_rf, t1_rf),
                    RfDist(t2_infer_rf, t2_rf)
                ]
                print('------Real Trees------')
                print(tm.clusters[0].get_tree_newick())
                print(tm.clusters[1].get_tree_newick())
                print(tm.clusters[2].get_tree_newick())
                print('------Inferred Trees------')
                print(t0_infer.get_tree_newick())
                print(t1_infer.get_tree_newick())
                print(t2_infer.get_tree_newick())

                print()
                print('RF Distance of Inferred Tree 0 with each Tree(true):',
                      rfTree0)
                print('RF Distance of Inferred Tree 1 with each Tree(true):',
                      rfTree1)
                print('RF Distance of Inferred Tree 2 with each Tree(true):',
                      rfTree2)
                print('------Robinson-Foulds Distance------')
                print("\t4.2. Make the likelihood comparison.\n")
                print('Log Likelihood of real mixture: ' + str(
                    truelikelihood(
                        [tm.clusters[0], tm.clusters[1], tm.clusters[2]],
                        samples, num_samples, num_clusters, tm.pi)))
                print('Log Likelihood of inferred mixture: ' +
                      str(loglikelihood[-1]))
Example #10
0
def em_algorithm(seed_val,
                 samples,
                 num_clusters,
                 max_num_iter=100,
                 num_sieving=10,
                 sieving_max_num_iter=10):
    """
    Run EM for a mixture of trees, using sieving to choose the starting point.

    ``num_sieving`` random initialisations are each run through ``em_helper``
    for ``sieving_max_num_iter`` iterations. The initialisation whose last
    log-likelihood is highest is then rebuilt from its seed and run for
    ``max_num_iter`` iterations.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Maximum number of EM iterations of the final run. Type: int
    :param num_sieving: Number of random initialisations to try. Type: int
    :param sieving_max_num_iter: EM iterations spent on each candidate. Type: int
    :return: loglikelihood: Log-likelihoods reported by ``em_helper`` for the final run. Type: numpy array.
    :return: topology_list: Topology array of every tree of the final mixture. Type: numpy array.
    :return: theta_list: CPDs of every tree of the final mixture. Type: list
                (deliberately not converted to a numpy array).
    :return: tm: The final mixture object as returned by ``em_helper``.
    :raises ValueError: If ``num_sieving`` is smaller than 1.
    """
    print("Running EM algorithm...")

    if num_sieving < 1:
        # Without candidates there is no best seed to pick below.
        raise ValueError("num_sieving must be at least 1")

    num_nodes = np.size(samples, 1)

    # Derive one seed per sieving candidate from the user supplied seed.
    np.random.seed(seed_val)
    seeds = np.random.randint(0, 100000000, num_sieving)
    last_loglikelihoods = []
    for seed in seeds:
        np.random.seed(seed)
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
        tm.simulate_pi(seed_val=seed)
        tm.simulate_trees(seed_val=seed)
        tm_loglikelihood, tm = em_helper(tm,
                                         samples,
                                         num_clusters,
                                         max_num_iter=sieving_max_num_iter)
        # Only the last log-likelihood is needed to rank the candidates.
        last_loglikelihoods.append(tm_loglikelihood[-1])

    print("=> Sieving finished")
    seed = seeds[last_loglikelihoods.index(max(last_loglikelihoods))]

    # Rebuild the winning initialisation under the same global random state
    # as its sieving run, so the full run starts from that same candidate.
    np.random.seed(seed)
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    loglikelihood, tm = em_helper(tm,
                                  samples,
                                  num_clusters,
                                  max_num_iter=max_num_iter)

    print("=> EM finished")
    topology_list = []
    theta_list = []
    for t in tm.clusters:
        topology_list.append(t.get_topology_array())
        theta_list.append(t.get_theta_array())
    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    # theta_list is intentionally left as a plain Python list.

    return loglikelihood, topology_list, theta_list, tm
Example #11
0
def main():
    """
    Demonstrate the question 2.5 flow end to end.

    Steps: parse the command line, build the ground-truth mixture (simulated
    when ``--if_simulate`` is true, otherwise loaded from
    ``--real_values_filename``), run ``em_algorithm`` on its samples, print
    and plot the result, and finally print a table of Robinson-Foulds
    distances plus the log-likelihood of the inferred and true mixtures.

    :return: None. Results are printed and plotted.
    """

    def _str_to_bool(text):
        """Parse a command-line boolean.

        ``type=bool`` would turn every non-empty string, including
        "False", into True.
        """
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got %r' % text)

    # Code to process command line arguments
    parser = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    parser.add_argument(
        '--sample_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl_samples.txt',
        help=
        'Specify the name of the sample file (i.e data/example_samples.txt)')
    parser.add_argument(
        '--real_values_filename',
        type=str,
        default='data/q_2_5_tm_20node_20sample_4clusters.pkl',
        help=
        'Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    parser.add_argument(
        '--output_filename',
        type=str,
        default='q_2_5_tm_20node_20sample_4clusters_result.txt',
        help=
        'Specify the name of the output file (i.e data/example_results.txt)')
    parser.add_argument('--num_nodes',
                        type=int,
                        default=10,
                        help='Specify the number of nodes of trees (i.e 10)')
    parser.add_argument('--num_clusters',
                        type=int,
                        default=10,
                        help='Specify the number of clusters (i.e 3)')
    parser.add_argument(
        '--seed_val',
        type=int,
        default=123,
        help='Specify the seed value for reproducibility (i.e 42)')
    parser.add_argument(
        '--if_simulate',
        type=_str_to_bool,
        default=True,
        help='Specify whether the sampling is enabled (i.e False)')
    parser.add_argument(
        '--num_samples',
        type=int,
        default=50,
        help='Specify the number of samples if sampling is enabled (i.e 1000)')
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")

    args = parser.parse_args()
    print("\tArguments are: ", args)

    if args.if_simulate:
        print("\n1. Make new tree and sample.\n")
        tm_truth = TreeMixture(num_clusters=args.num_clusters,
                               num_nodes=args.num_nodes)
        tm_truth.simulate_pi(seed_val=args.seed_val)
        tm_truth.simulate_trees(seed_val=args.seed_val)
        tm_truth.sample_mixtures(args.num_samples, seed_val=args.seed_val)
    else:
        print("\n1. Load true tree from file.\n")
        tm_truth = TreeMixture(0, 0)
        tm_truth.load_mixture(args.real_values_filename)
    print("Load samples.")
    samples = tm_truth.samples
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array, tm = em_algorithm(
        args.seed_val, samples, num_clusters=args.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    # save_results(loglikelihood, topology_array, theta_array, args.output_filename)
    for i in range(args.num_clusters):
        print("\n\tCluster: ", i)
        print("\tTopology: \t", topology_array[i])
        print("\tTheta: \t", theta_array[i])

    plt.figure(figsize=(8, 3))
    plt.subplot(121)
    plt.plot(np.exp(loglikelihood), label='Estimated')
    plt.ylabel("Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.subplot(122)
    plt.plot(loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if args.real_values_filename != "":
        print(
            "\n=> Compare trees and print Robinson-Foulds (RF) distance (result v.s truth):\n"
        )
        N = len(samples)
        K = tm_truth.num_clusters
        tns = dendropy.TaxonNamespace()
        # The table is printed as LaTeX rows: one row per inferred tree,
        # one column per true tree.
        print("\\hline")
        for k in range(K):
            print(k, end=" & ")
            # The inferred tree does not depend on j, so parse it once per row.
            t_0 = tm.clusters[k]
            t_0.get_tree_newick()
            t_0 = dendropy.Tree.get(data=t_0.newick,
                                    schema="newick",
                                    taxon_namespace=tns)
            for j in range(K):
                t_t = tm_truth.clusters[j]
                t_t.get_tree_newick()
                t_t = dendropy.Tree.get(data=t_t.newick,
                                        schema="newick",
                                        taxon_namespace=tns)
                print(dendropy.calculate.treecompare.symmetric_difference(
                    t_0, t_t),
                      end=" & ")
            print("\\\\")

        print("\n=> Compare log-likelihood (result v.s truth):\n")
        # posterior[n, k] accumulates the probability of sample n under true
        # tree k; prior[n] is the pi-weighted sum over the trees.
        posterior = np.ones((N, K))
        prior = np.ones(N)
        for n, x in enumerate(samples):
            for k, tree in enumerate(tm_truth.clusters):
                # Breadth-first walk over the tree, multiplying in one CPD
                # entry per node.
                visit_list = [tree.root]
                while len(visit_list) != 0:
                    cur_node = visit_list[0]
                    visit_list = visit_list[1:]
                    visit_list = visit_list + cur_node.descendants
                    if cur_node.ancestor is None:
                        # Root: categorical indexed by the node's own value.
                        posterior[n, k] *= cur_node.cat[x[int(cur_node.name)]]
                    else:
                        # Non-root: indexed by parent value, then own value.
                        posterior[n, k] *= cur_node.cat[x[int(
                            cur_node.ancestor.name)]][x[int(cur_node.name)]]
            prior[n] *= np.sum(posterior[n] * tm_truth.pi)
        loglikelihood_truth = np.sum(np.log(prior))
        print("%f : %f" % (loglikelihood[-1], loglikelihood_truth))
Example #12
0
def main():
    """
    Run the question 2.5 template flow on samples read from a text file.

    The positional arguments are the sample file, the output file and the
    number of clusters; ``--seed_val`` and ``--real_values_filename`` are
    optional. After running ``em_algorithm`` the results are saved, printed
    and plotted. When a real-values file is given, the Robinson-Foulds
    distance and the log-likelihoods of the true and inferred mixtures are
    printed as well.

    :return: None.
    """
    # Command line interface.
    cli = argparse.ArgumentParser(
        description='EM algorithm for likelihood of a tree GM.')
    cli.add_argument(
        'sample_filename',
        type=str,
        help='Specify the name of the sample file (i.e data/example_samples.txt)'
    )
    cli.add_argument(
        'output_filename',
        type=str,
        help='Specify the name of the output file (i.e data/example_results.txt)'
    )
    cli.add_argument('num_clusters',
                     type=int,
                     help='Specify the number of clusters (i.e 3)')
    cli.add_argument(
        '--seed_val',
        type=int,
        default=42,
        help='Specify the seed value for reproducibility (i.e 42)')
    cli.add_argument(
        '--real_values_filename',
        type=str,
        default="",
        help='Specify the name of the real values file (i.e data/example_tree_mixture.pkl)'
    )
    # You can add more default parameters if you want.

    print("Hello World!")
    print(
        "This file demonstrates the flow of function templates of question 2.5."
    )

    print("\n0. Load the parameters from command line.\n")
    opts = cli.parse_args()
    print("\tArguments are: ", opts)

    print("\n1. Load samples from txt file.\n")
    samples = np.loadtxt(opts.sample_filename, delimiter="\t", dtype=np.int32)
    num_samples, num_nodes = samples.shape
    print("\tnum_samples: ", num_samples, "\tnum_nodes: ", num_nodes)
    print("\tSamples: \n", samples)

    print("\n2. Run EM Algorithm.\n")
    loglikelihood, topology_array, theta_array = em_algorithm(
        opts.seed_val, samples, num_clusters=opts.num_clusters)

    print("\n3. Save, print and plot the results.\n")
    save_results(loglikelihood, topology_array, theta_array,
                 opts.output_filename)

    for cluster_idx in range(opts.num_clusters):
        print("\n\tCluster: ", cluster_idx)
        print("\tTopology: \t", topology_array[cluster_idx])
        print("\tTheta: \t", theta_array[cluster_idx])

    # Left panel: likelihood, right panel: log-likelihood.
    plt.figure(figsize=(8, 3))
    panels = (
        (121, np.exp(loglikelihood), "Likelihood of Mixture"),
        (122, loglikelihood, "Log-Likelihood of Mixture"),
    )
    for position, curve, y_label in panels:
        plt.subplot(position)
        plt.plot(curve, label='Estimated')
        plt.ylabel(y_label)
        plt.xlabel("Iterations")
    plt.legend(loc=(1.04, 0))
    plt.show()

    print("\n4. Retrieve real results and compare.\n")
    if opts.real_values_filename == "":
        # Nothing to compare against.
        return

    print("\tComparing the results with real values...")
    truth_mixture = TreeMixture(opts.num_clusters, num_nodes)
    truth_mixture.load_mixture(opts.real_values_filename)

    learned_mixture = TreeMixture(opts.num_clusters, num_nodes)
    learned_mixture.load_mixture(opts.output_filename)

    print("\t4.1. Make the Robinson-Foulds distance analysis.\n")
    rf_total = compute_tree_mix_diff(truth_mixture, learned_mixture)
    print(f"Total Robinson-Foulds distance: {rf_total!s}")

    print("\t4.2. Make the likelihood comparison.\n")
    truth_lik = truth_mixture.likelihood_dataset(samples)
    learned_lik = learned_mixture.likelihood_dataset(samples)
    print(f"Log-Likelihood of actual tree: {truth_lik!s}"
          f", inferred tree: {learned_lik!s}")
Example #13
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter, tm=None):
    """
    Run ``max_num_iter`` EM iterations for a mixture of trees.

    :param seed_val: Accepted for interface compatibility; not used by this
                implementation (a fresh mixture is simulated with seed ``None``).
    :param samples: Observed values, indexed as ``samples[sample, node]``.
    :param num_clusters: Number of trees in the mixture. Type: int
    :param max_num_iter: Number of EM iterations to run. Type: int
    :param tm: Optional mixture to continue from; a new one is simulated when ``None``.
    :return: loglikelihood: One log-likelihood per iteration. Type: numpy array.
    :return: topology_list: Topology array of every tree. Type: numpy array.
    :return: theta_list: Theta array of every tree. Type: numpy array.
    :return: tm: The updated mixture (its ``loglikelihood`` list is extended as well).
    """
    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]

    if tm is None:
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
        tm.simulate_pi(None)
        tm.simulate_trees(None)

    ll_history = []
    for _ in range(max_num_iter):
        # 1. Weighted likelihood of every sample under every tree.
        rows = []
        for n in range(num_samples):
            row = []
            for k in range(num_clusters):
                row.append(
                    sample_likelihood(tm.clusters[k], samples[n, :],
                                      tm.pi[k]))
            rows.append(row)
        weighted = np.array(rows)

        # Normalising each row gives the responsibilities.
        per_sample_total = np.sum(weighted, axis=1)
        resp_matrix = weighted / np.reshape(per_sample_total,
                                            (num_samples, 1))

        # Log-likelihood of the data under the current mixture.
        current_ll = np.sum(np.log(np.sum(weighted, axis=1)), axis=None)
        ll_history.append(current_ll)
        tm.loglikelihood.append(current_ll)

        # 2. New mixing weights.
        tm.pi = np.sum(resp_matrix, axis=0) / num_samples

        # 3. Refit every tree from its column of responsibilities.
        vertices = list(range(num_nodes))
        for k in range(num_clusters):
            current_tree = tm.clusters[k]
            resp_k = resp_matrix[:, k]

            # Pairwise mutual information (outer index t, inner index s).
            mi_rows = []
            for t_idx in vertices:
                mi_rows.append([
                    mutual_information(resp_k, samples, s_idx, t_idx)
                    for s_idx in vertices
                ])
            mi_matrix = np.asarray(mi_rows)

            # Graph -> maximum spanning tree -> node order rooted at node 0.
            mst = maximum_spanning_tree(
                create_graph(num_nodes, resp_k, samples, mi_matrix, vertices))
            root_name = 0
            ordered_nodes, _unused_mi_sum = create_ordered_nodes(
                mst, root_name)

            # New topology and CPDs, loaded into the existing tree object.
            new_topology, new_theta = create_tree_attributes1(
                ordered_nodes, root_name, samples, resp_k, num_nodes)
            current_tree.load_tree_from_direct_arrays(new_topology, new_theta)

    # Collect the final parameters of every tree.
    topology_list = []
    theta_list = []
    for k in range(num_clusters):
        fitted_tree = tm.clusters[k]
        topology_list.append(fitted_tree.get_topology_array())
        theta_list.append(fitted_tree.get_theta_array())

    return (np.array(ll_history), np.array(topology_list),
            np.array(theta_list), tm)
Example #14
0
def main(
        directory='/Users/filipbergentoft/Desktop/Github/DD2434/Assignment 2/2_4/'
):
    """
    Fit a tree mixture to the q2_4 samples with sieving and compare to the truth.

    The samples are loaded and shuffled, ``sieving`` picks the best mixture,
    and the result is compared with the mixture loaded from the real-values
    file: likelihoods, mixing weights, the ``RF_comparison`` result and the
    tree topologies are printed, and the log-likelihood curve is plotted.

    :param directory: Directory prefix (with trailing separator) under which
                the ``data/q2_4/`` files are looked up. Defaults to the path
                the script was originally written for.
    :return: None.
    """
    num_clusters = 3
    sample_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl_samples.txt"
    real_values_filename = directory + "data/q2_4/q2_4_tree_mixture.pkl"

    samples = np.loadtxt(sample_filename, delimiter="\t", dtype=np.int32)
    np.random.shuffle(samples)

    best_tm = sieving(n_first_mixtures=50,
                      n_second_mixtures=10,
                      n_first_iterations=10,
                      n_second_iterations=100,
                      samples=samples,
                      num_clusters=num_clusters)

    real_tm = TreeMixture(num_clusters=3, num_nodes=5)
    real_tm.load_mixture(real_values_filename)

    print('best tree', mixture_likelihood(best_tm, samples))
    print('best tree', best_tm.pi)
    print('real tree', mixture_likelihood(real_tm, samples))
    print('real tree', real_tm.pi)

    print(RF_comparison(best_tm, real_tm))
    # Print the topologies of the loaded ground truth. The earlier version
    # printed an unrelated, freshly simulated mixture under this label.
    for tree in real_tm.clusters:
        print('Real tree topology')
        print(tree.get_topology_array())

    for tree in best_tm.clusters:
        print('Inferred tree topology')
        print(tree.get_topology_array())

    sns.set_style('darkgrid')
    plt.plot(best_tm.loglikelihood, label='Estimated')
    plt.ylabel("Log-Likelihood of Mixture")
    plt.xlabel("Iterations")
    plt.legend()
    plt.show()
Example #15
0
def em_algorithm(seed_val, samples, num_clusters, max_num_iter=100):
    """
    Run EM for a mixture of binary-valued trees.

    Each iteration computes the responsibilities of every tree for every
    sample, updates the mixing weights, and rebuilds every tree as the
    maximum spanning tree of the responsibility-weighted mutual information
    graph, with CPDs re-estimated from the weighted counts.

    :param seed_val: Seed value for reproducibility. Type: int
    :param samples: Observed x values. Type: numpy array. Dimensions: (num_samples, num_nodes)
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Number of EM iterations that are run. Type: int
    :return: loglikelihood: Array of log-likelihood of each EM iteration. Type: numpy array.
                Dimensions: (max_num_iter, )
    :return: topology_list: Tree topologies of the final mixture. Type: numpy array.
                One entry per cluster.
    :return: theta_list: Tree CPDs of the final mixture. Type: numpy array.
                One entry per cluster.

    Only the trees of the final mixture are returned; earlier iterations are
    not accumulated into the result.
    """

    # Set the seed
    np.random.seed(seed_val)

    print("Running EM algorithm...")

    from Kruskal_v1 import Graph
    import sys

    num_samples = samples.shape[0]
    num_nodes = samples.shape[1]

    # Random starting mixture.
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed_val)
    tm.simulate_trees(seed_val=seed_val)

    # Smallest positive normalised float; keeps responsibilities non-zero.
    eps = sys.float_info.min
    loglikelihood = []
    for _ in range(max_num_iter):
        # ---- E-step: r[i, j] = pi_j * p(sample_i | tree_j) ----
        r = np.ones((num_samples, num_clusters))
        for i, sample in enumerate(samples):
            for j, tree in enumerate(tm.clusters):
                visitedNodes = [tree.root]
                r[i, j] *= tm.pi[j]
                # Breadth-first walk, one CPD factor per node.
                while len(visitedNodes) != 0:
                    presentNode = visitedNodes[0]
                    visitedNodes = visitedNodes[1:]
                    if len(presentNode.descendants) != 0:
                        visitedNodes = visitedNodes + presentNode.descendants
                    if presentNode.ancestor is None:  # root node
                        r[i,
                          j] *= presentNode.cat[sample[int(presentNode.name)]]
                    else:
                        r[i, j] *= presentNode.cat[sample[int(
                            presentNode.ancestor.name)]][sample[int(
                                presentNode.name)]]

        r += eps
        rn = np.sum(r, axis=1).reshape(num_samples, 1)
        r /= rn
        loglikelihood.append(np.sum(np.log(rn)))

        # ---- M-step: mixing weights ----
        tm.pi = np.sum(r, axis=0) / num_samples
        den = np.sum(r, axis=0)

        # Responsibility-weighted pairwise joint distribution
        # q_k(X_s = a, X_t = b), indexed [s, t, a, b, k].
        NominatorQk = np.zeros((num_nodes, num_nodes, 2, 2, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        matched_index = np.where(
                            (samples[:, (s, t)] == [a, b]).all(1))[0]
                        NominatorQk[s, t, a,
                                    b] = np.sum(r[matched_index], axis=0) / den

        # Responsibility-weighted marginal q_k(X_s = a), indexed [s, a, k].
        DenominatorQk = np.zeros((num_nodes, 2, num_clusters))
        for s in range(num_nodes):
            for a in range(2):
                matched_index = np.where((samples[:, s] == a))
                DenominatorQk[s, a] = np.sum(r[matched_index], axis=0) / den

        # Mutual information between every node pair, per cluster.
        Iqst = np.zeros((num_nodes, num_nodes, num_clusters))
        for s in range(num_nodes):
            for t in range(num_nodes):
                for a in range(2):
                    for b in range(2):
                        # Value combinations with zero mass contribute nothing.
                        if (np.all(NominatorQk[s, t, a, b, :] > 0)):
                            Iqst[s, t] += NominatorQk[s, t, a, b] * np.log(
                                (NominatorQk[s, t, a, b] /
                                 (DenominatorQk[s, a])) / DenominatorQk[t, b])

        # ---- M-step: one new tree per cluster ----
        for k in range(num_clusters):
            g = Graph(num_nodes)
            for s in range(num_nodes):
                for t in range(s + 1, num_nodes):
                    g.addEdge(s, t, Iqst[s, t, k])

            # Keep only the two endpoint columns of each spanning-tree edge.
            mst_edges = np.array(g.maximum_spanning_tree())[:, [0, 1]]

            # Orient the undirected edges away from node 0 (the root);
            # topology_array[child] holds the parent of child.
            topology_array = np.zeros(num_nodes)
            topology_array[0] = np.nan
            visitedNodes = [0]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]
                child_edges = np.array(np.where(mst_edges == [presentNode])).T
                for ind in child_edges:
                    # The other endpoint of the matching edge is the child.
                    child = mst_edges[ind[0]][1 - ind[1]]
                    topology_array[int(child)] = presentNode
                    visitedNodes.append(child)
                if np.size(child_edges) != 0:
                    # Drop consumed edges so they are not walked back up.
                    mst_edges = np.delete(mst_edges, child_edges[:, 0], 0)

            new_tree = Tree()
            new_tree.load_tree_from_direct_arrays(topology_array)
            new_tree.alpha = [1.0] * 2
            new_tree.k = 2

            # Fill in the CPD of every node of the new tree.
            visitedNodes = [new_tree.root]
            while len(visitedNodes) != 0:
                presentNode = visitedNodes[0]
                visitedNodes = visitedNodes[1:]

                if len(presentNode.descendants) != 0:
                    visitedNodes = visitedNodes + presentNode.descendants

                if presentNode.ancestor is None:
                    presentNode.cat = DenominatorQk[int(presentNode.name), :,
                                                    k].tolist()
                else:
                    joint = NominatorQk[int(presentNode.ancestor.name),
                                        int(presentNode.name), :, :, k]
                    # Normalise each parent-value row into a conditional
                    # distribution; new arrays are built so the shared joint
                    # table is not modified.
                    presentNode.cat = [
                        joint[0] / np.sum(joint[0]),
                        joint[1] / np.sum(joint[1])
                    ]

            tm.clusters[k] = new_tree

    # Report only the final mixture, one entry per cluster.
    topology_list = []
    theta_list = []
    for tree in tm.clusters:
        topology_list.append(tree.get_topology_array())
        theta_list.append(tree.get_theta_array())
    loglikelihood = np.array(loglikelihood)
    topology_list = np.array(topology_list)
    theta_list = np.array(theta_list)
    return loglikelihood, topology_list, theta_list
Example #16
0
from Tree import TreeMixture
import Exercise2_5
import argparse

# This file has the purpose of creating new samples for testing exercise 5

# Command-line interface: four required positional integers, in this order:
# seed, samples, nodes, clusters.
parser = argparse.ArgumentParser()
for _arg_name, _arg_help in (
    ("seed", "Introduce the seed to generate trees"),
    ("samples", "Introduce the number of samples"),
    ("nodes", "Introduce the number of nodes"),
    ("clusters", "Introduce the number of clusters"),
):
    parser.add_argument(_arg_name, help=_arg_help, type=int)
args = parser.parse_args()

# Echo the chosen configuration before doing any work.
print(
    "Generating tree with seed:", args.seed,
    "\tsamples:", args.samples,
    "\tnodes:", args.nodes,
    "\tclusters:", args.clusters,
)

# Build the mixture; the same seed drives the mixing weights, the trees and
# the sampling step.
tm = TreeMixture(num_clusters=args.clusters, num_nodes=args.nodes)
tm.simulate_pi(seed_val=args.seed)
tm.simulate_trees(seed_val=args.seed)
tm.sample_mixtures(num_samples=args.samples, seed_val=args.seed)

# The output file name encodes the node, sample and cluster counts.
path = 'data/q_2_5_tm_{}node_{}sample_{}clusters.pkl'.format(
    args.nodes, args.samples, args.clusters)
tm.save_mixture(path, True)
Example #17
0
def _seeded_tree_mixture(seed, num_clusters, num_nodes):
    """Build a TreeMixture initialised from ``seed`` and collect its arrays.

    :param seed: Seed used for numpy's global generator and passed to the
        mixture's ``simulate_pi`` / ``simulate_trees``. Type: int
    :param num_clusters: Number of clusters in the mixture. Type: int
    :param num_nodes: Number of nodes per tree. Type: int
    :return: ``(tm, topology_list, theta_list)`` where the two lists hold, per
        cluster, the results of ``get_topology_array()`` and
        ``get_theta_array()``.
    """
    np.random.seed(seed)
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    topology_list = [
        tm.clusters[k].get_topology_array() for k in range(num_clusters)
    ]
    theta_list = [
        tm.clusters[k].get_theta_array() for k in range(num_clusters)
    ]
    return tm, topology_list, theta_list


def em_algorithm(seed_val,
                 samples,
                 num_clusters,
                 max_num_iter=100,
                 sieving=100,
                 sieving_iter=10):
    """Run EM with sieving over consecutive seeds.

    Every seed in ``seed_val .. seed_val + sieving - 1`` is used to initialise
    a mixture that is run for ``sieving_iter`` iterations; the seed whose last
    log-likelihood is the largest is then used for the full run.

    :param seed_val: First seed to try. Type: int
    :param samples: Observed data; ``samples.shape[1]`` is used as the number
        of nodes. Type: numpy array
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Iteration budget for the final run. Type: int
    :param sieving: Number of consecutive seeds to try. Type: int
    :param sieving_iter: Iteration budget of each sieving run. Type: int
    :return: ``(loglikelihood, topology_list, theta_list)``. ``loglikelihood``
        is whatever ``computationsEM`` returns for the final run.
        NOTE(review): the two lists are the objects handed to
        ``computationsEM``; they only describe the learned model if that
        helper updates them in place - confirm against its implementation.
    """
    num_nodes = samples.shape[1]

    # Fall back to the caller's seed when no candidate yields a comparable
    # (finite, non-NaN) log-likelihood, or when sieving is disabled.
    best_seed = seed_val
    best_log = float("-inf")

    # -------------------- Sieving -------------------- #
    for offset in tqdm(range(sieving)):
        candidate_seed = seed_val + offset
        tm, topology_list, theta_list = _seeded_tree_mixture(
            candidate_seed, num_clusters, num_nodes)

        # Short run from this initialisation; only its last value is compared.
        loglikelihood = computationsEM(sieving_iter, samples, num_clusters,
                                       tm, topology_list, theta_list)
        final_log = loglikelihood[-1]
        if final_log > best_log:
            best_log = final_log
            best_seed = candidate_seed

    # -------------------- Final run -------------------- #
    # Rebuild the winning initialisation from scratch and run the full budget.
    tm, topology_list, theta_list = _seeded_tree_mixture(
        best_seed, num_clusters, num_nodes)
    loglikelihood = computationsEM(max_num_iter, samples, num_clusters, tm,
                                   topology_list, theta_list)

    return loglikelihood, topology_list, theta_list
Example #18
0
def em_algorithm(seed_val,
                 samples,
                 num_clusters,
                 max_num_iter=100,
                 num_sieving=10,
                 sieving_iter=10):
    """Run EM with sieving over randomly drawn seeds.

    ``num_sieving`` seeds are drawn from numpy's global generator after
    seeding it with ``seed_val``. Each one initialises a mixture that is run
    through ``em_helper`` for ``sieving_iter`` iterations. The seed with the
    largest last log-likelihood is then used to rebuild the mixture and run
    ``em_helper`` for ``max_num_iter`` iterations.

    :param seed_val: Seed for drawing the candidate seeds. Type: int
    :param samples: Observed data; its second dimension is used as the number
        of nodes. Type: numpy array
    :param num_clusters: Number of clusters. Type: int
    :param max_num_iter: Iteration budget for the final run. Type: int
    :param num_sieving: Number of candidate seeds to try (at least 1).
        Type: int
    :param sieving_iter: Iteration budget of each sieving run. Type: int
    :return: ``(loglikelihood, topology_list, theta_list, tm)`` - the first
        three as numpy arrays, ``tm`` as returned by ``em_helper``.
    :raises ValueError: If ``num_sieving`` is smaller than 1.
    """
    if num_sieving < 1:
        raise ValueError("num_sieving must be at least 1")

    print("Running EM algorithm...")

    num_nodes = np.size(samples, 1)

    # Sieving: draw the candidate seeds reproducibly from seed_val.
    np.random.seed(seed_val)
    seeds = np.random.randint(0, 100000000, num_sieving)
    last_loglikelihoods = []
    for seed in seeds:
        np.random.seed(seed)
        tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
        tm.simulate_pi(seed_val=seed)
        tm.simulate_trees(seed_val=seed)
        # Only the last log-likelihood is needed for the comparison; the
        # sieved mixture itself is discarded rather than kept in memory.
        tm_loglikelihood, _ = em_helper(tm,
                                        samples,
                                        num_clusters,
                                        max_num_iter=sieving_iter)
        last_loglikelihoods.append(tm_loglikelihood[-1])

    # Main procedure for EM algorithm
    print("=> Sieving finished")
    seed = seeds[last_loglikelihoods.index(max(last_loglikelihoods))]
    # Re-seed the global generator exactly as the sieving run for this seed
    # did, so the final run starts from the same state.
    # NOTE(review): only matters if TreeMixture/em_helper draw from numpy's
    # global generator - not visible from here.
    np.random.seed(seed)
    tm = TreeMixture(num_clusters=num_clusters, num_nodes=num_nodes)
    tm.simulate_pi(seed_val=seed)
    tm.simulate_trees(seed_val=seed)
    loglikelihood, tm = em_helper(tm,
                                  samples,
                                  num_clusters,
                                  max_num_iter=max_num_iter)

    print("=> EM finished")
    topology_list = np.array([t.get_topology_array() for t in tm.clusters])
    theta_list = np.array([t.get_theta_array() for t in tm.clusters])
    loglikelihood = np.array(loglikelihood)

    return loglikelihood, topology_list, theta_list, tm