def all_pairs_BIC_serial(self, iter_bic_list, em_iters, X, gmm_list):
     """
     Computes the BIC score for all pairs in a "serial" way and returns
     the pair with the best score
     """
     #print "Serial execution"
         
     l = len(iter_bic_list)
     best_merged_gmm = None
     best_BIC_score = 0.0
     merged_tuple = None
     merged_tuple_indices = None
     
     for gmm1idx in range(l):
         for gmm2idx in range(gmm1idx+1, l):
             score = 0.0
             gidx1, idx1 = iter_bic_list[gmm1idx]
             gidx2, idx2 = iter_bic_list[gmm2idx] 
             d1 = tools.get_data_from_indices(X, idx1)
             d2 = tools.get_data_from_indices(X, idx2)
             data = np.concatenate((d1, d2))
             g1 = gmm_list[gidx1]
             g2 = gmm_list[gidx2]
             new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
             
             if score > best_BIC_score: 
                 best_merged_gmm = new_gmm
                 merged_tuple = (g1, g2)
                 merged_tuple_indices = (gmm1idx, gmm2idx)
                 best_BIC_score = score
     
     return best_merged_gmm, merged_tuple, merged_tuple_indices, best_BIC_score
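These snippets are method excerpts from a larger module, so module-level imports (os, sys, time, pickle, numpy as np, the stat constants, and the project-local tools helpers) are assumed throughout. compute_distance_BIC is also defined elsewhere; the sketch below is a hypothetical reconstruction of the merge criterion it appears to implement, with merge(), train() and bic() as assumed methods on the project's GMM class, not confirmed API:

def compute_distance_BIC_sketch(g1, g2, data, em_iters):
    # Hypothetical sketch, not the project's implementation.
    merged = g1.merge(g2)   # one GMM covering both clusters' components
    merged.train(data)      # refit on the pooled data (em_iters EM steps)
    # Lower BIC is better: a positive score means the single merged model
    # explains the pooled data better than the two separate models, which
    # is why the callers treat score > 0 as "worth merging".
    score = (g1.bic(data) + g2.bic(data)) - merged.bic(data)
    return merged, score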
    def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
        """
        Computes the BIC score for all pairs by using MapReduce and returns
        the pair with the best score
        """
        
        print "Map-Reduce execution"

        from subprocess import call
        call(["mkdir", "-p", "gmm"])
        for i in range(len(iteration_bic_list)):
            gidx, didx = iteration_bic_list[i]
            # Persist each GMM to its own file so a mapper can load just
            # the pair it needs, and make the files world-readable.
            with open('gmm/' + str(i), 'wb') as f:
                pickle.dump(gmm_list[gidx], f)
            os.chmod('gmm/' + str(i), S_IRUSR | S_IWUSR | S_IXUSR |
                                      S_IRGRP | S_IXGRP |
                                      S_IROTH | S_IXOTH)
        import mrjob.util as util
        util.tar_and_gzip('gmm', 'gmm.tgz') 
        
        mr_input = []
        l = len(iteration_bic_list)
        for gmm1idx in range(l):
            for gmm2idx in range(gmm1idx+1, l):
                gidx1, didx1 = iteration_bic_list[gmm1idx]
                gidx2, didx2 = iteration_bic_list[gmm2idx] 
                an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
                mr_input.append(an_item + "\n")
        
        mr_args = ['-v', '-r', 'hadoop', '--input-protocol', 'pickle',
                   '--output-protocol', 'pickle', '--protocol', 'pickle']
        job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=mr_input)
        runner = job.make_runner()
        runner.run()
        kv_pairs = map(job.parse_output_line, runner.stream_output())
        assert len(kv_pairs) == 1
        merged_tuple_indices, best_score = kv_pairs[0][1]
    
        # Re-merge the GMM pair with the highest score *here*, otherwise the next
        # segment_majority_vote will crash (an issue with data ownership). Unless we
        # find a different workaround, we can simplify the mapper and the reducer
        # further: instead of shipping the GMM pairs and merged GMMs from the mappers
        # to the reducer, we can move just the indices and scores. The downside is
        # that this re-merging is serialized.
        ind1, ind2 = merged_tuple_indices
        gidx1, idx1 = iteration_bic_list[ind1]
        gidx2, idx2 = iteration_bic_list[ind2]
        d1 = tools.get_data_from_indices(X, idx1)
        d2 = tools.get_data_from_indices(X, idx2)
        data = np.concatenate((d1,d2))
        g1 = gmm_list[gidx1]
        g2 = gmm_list[gidx2]
        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
            
        return new_gmm, (g1, g2), merged_tuple_indices, best_score
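The reducer of AllPairsBicScoreMRJob is not shown on this page. Because every mapper in this variant yields under the constant key 1, a single reducer sees all (score, index1, index2) tuples and only needs to keep the best one. A minimal sketch, assuming mrjob's reducer signature:

def reducer(self, key, values):
    # All mappers emit under key 1, so this one reducer scans every
    # candidate pair and keeps the highest-scoring one.
    best_score, best_pair = 0.0, None
    for score, index1, index2 in values:
        if score > best_score:
            best_score, best_pair = score, (index1, index2)
    # Shape matches the driver's unpacking:
    # merged_tuple_indices, best_score = kv_pairs[0][1]
    yield 1, (best_pair, best_score)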
Example #3
 def mapper(self, key, value):
     """
     Each mapper computes the BIC score for a GMM pair
     """
     index1, index2 = key        
     g1, g2, data, em_iters = value
     try:
         new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
     except:
         # Log the failing pair before re-raising so the job fails visibly.
         sys.stderr.write("compute_distance_BIC failed on pair ({0}, {1})\n".format(index1, index2))
         raise
     data_to_yield = (score, new_gmm, g1, g2, index1, index2)
     yield 1, data_to_yield
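The matching reducer for this heavier mapper payload is likewise not shown; a plausible sketch that keeps the highest-scoring tuple and re-packs it in the shape the driver below unpacks:

def reducer(self, key, values):
    # values are the (score, new_gmm, g1, g2, index1, index2) tuples.
    score, new_gmm, g1, g2, index1, index2 = max(values, key=lambda v: v[0])
    # Shape matches: best_merged_gmm, merged_tuple, merged_tuple_indices, best_score
    yield 1, (new_gmm, (g1, g2), (index1, index2), score)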
 def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters):
     """
     Computes the BIC score for all pairs by using MapReduce and returns
     the pair with the best score
     """
     
     print "Map-Reduce execution"
     X = tools.binary_read('self_X')
     mr_input = []
     l = len(iteration_bic_list)
     for gmm1idx in range(l):
         for gmm2idx in range(gmm1idx+1, l):
             g1, idx1 = iteration_bic_list[gmm1idx]
             g2, idx2 = iteration_bic_list[gmm2idx] 
             d1 = tools.get_data_from_indices(X, idx1)
             d2 = tools.get_data_from_indices(X, idx2)
             data = np.concatenate((d1, d2))
             an_item = protocol().write((gmm1idx, gmm2idx), (g1, g2, data, em_iters))
             mr_input.append(an_item + "\n")
     
     mr_args = ['-v', '-r', 'hadoop', '--input-protocol', 'pickle',
                '--output-protocol', 'pickle', '--protocol', 'pickle']
     job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=mr_input)
     runner = job.make_runner()
     runner.run()
     kv_pairs = map(job.parse_output_line, runner.stream_output())
     assert len(kv_pairs) == 1
     best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1]
 
     # Re-merge the GMM pair with the highest score *here*, otherwise the next
     # segment_majority_vote will crash (an issue with data ownership). Unless we
     # find a different workaround, we can simplify the mapper and the reducer
     # further: instead of shipping the GMM pairs and merged GMMs from the mappers
     # to the reducer, we can move just the indices and scores. The downside is
     # that this re-merging is serialized.
     ind1, ind2 = merged_tuple_indices
     g1, idx1 = iteration_bic_list[ind1]
     g2, idx2 = iteration_bic_list[ind2]
     d1 = tools.get_data_from_indices(X, idx1)
     d2 = tools.get_data_from_indices(X, idx2)
     data = np.concatenate((d1,d2))
     new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
         
     return new_gmm, (g1, g2), merged_tuple_indices, best_score
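protocol() is never defined in these snippets. Given the '--input-protocol pickle' arguments, it is plausibly mrjob's pickle protocol; one assumed binding:

from mrjob.protocol import PickleProtocol as protocol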
    def mapper(self, key, value):
        """
        Each mapper computes the BIC score for a GMM pair
        """
        
        overall = t = time.time()
        
        index1, index2 = key        
        didx1, didx2, em_iters = value

        t = time.time()
        d1 = tools.get_data_from_file_from_indices('self_X', didx1)
        d2 = tools.get_data_from_file_from_indices('self_X', didx2)
        sys.stderr.write("get_data_from_file_from_indices: {0}\n".format(time.time()-t))
        data = np.concatenate((d1, d2))
        
        t = time.time()
        util.unarchive('gmm.tgz', 'gmm')
        with open('gmm/' + str(index1), 'rb') as f1:
            g1 = pickle.load(f1)
        with open('gmm/' + str(index2), 'rb') as f2:
            g2 = pickle.load(f2)
        sys.stderr.write("read iter_gmm_list: {0}\n".format(time.time()-t))
        t = time.time()
        try:
            new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
        except:
            # Log the failing pair before re-raising so the job fails visibly.
            sys.stderr.write("compute_distance_BIC failed on pair ({0}, {1})\n".format(index1, index2))
            raise
        # Ship only the score and the pair indices; the driver re-merges
        # the winning pair itself (see the comment in the driver above).
        data_to_yield = (score, index1, index2)
        sys.stderr.write("compute_distance_BIC: {0}\n".format(time.time()-t))
        sys.stderr.write("total BIC time: {0}\n".format(time.time()-overall))
        yield 1, data_to_yield
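For util.unarchive('gmm.tgz', 'gmm') to succeed inside a task, the archive built by the driver must be shipped with the job. How that happens is not shown here; one plausible configuration, assuming a mrjob version that supports the --file upload option:

# Hypothetical driver arguments; --file copies gmm.tgz into each task's
# working directory, where the mapper can unarchive it locally.
mr_args = ['-v', '-r', 'hadoop', '--file', 'gmm.tgz',
           '--input-protocol', 'pickle',
           '--output-protocol', 'pickle', '--protocol', 'pickle']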
Example #7
    def do_bic_agglomeration(self, gmm_list):
        # Get the events, divide them into an initial k clusters and train each GMM on a cluster
        per_cluster = self.N / self.init_num_clusters
        init_training = zip(
            gmm_list, np.vsplit(self.X, range(per_cluster, self.N,
                                              per_cluster)))
        for g, x in init_training:
            g.train(x)

        # Perform hierarchical agglomeration based on BIC scores
        best_BIC_score = 1.0
        while (best_BIC_score > 0 and len(gmm_list) > 1):
            num_clusters = len(gmm_list)
            # Resegment data based on likelihood scoring
            likelihoods = gmm_list[0].score(self.X)
            for g in gmm_list[1:]:
                likelihoods = np.column_stack((likelihoods, g.score(self.X)))
            most_likely = likelihoods.argmax(axis=1)
            # Across 2.5 s of observations (250 frames, assuming a 10 ms frame step),
            # vote on which cluster they should be associated with
            iter_training = {}
            for i in range(250, self.N, 250):
                votes = np.zeros(num_clusters)
                for j in range(i - 250, i):
                    votes[most_likely[j]] += 1
                iter_training.setdefault(gmm_list[votes.argmax()],
                                         []).append(self.X[i - 250:i, :])
            votes = np.zeros(num_clusters)
            for j in range((self.N / 250) * 250, self.N):
                votes[most_likely[j]] += 1
            iter_training.setdefault(gmm_list[votes.argmax()], []).append(
                self.X[(self.N / 250) * 250:self.N, :])
            # Retrain the GMMs on the clusters for which they were voted most likely and
            # make a list of candidates for merging
            iter_bic_list = []
            for g, data_list in iter_training.iteritems():
                cluster_data = data_list[0]
                for d in data_list[1:]:
                    cluster_data = np.concatenate((cluster_data, d))
                cluster_data = np.ascontiguousarray(cluster_data)
                g.train(cluster_data)
                iter_bic_list.append((g, cluster_data))

            # Keep any GMMs that lost all votes in candidate list for merging
            for g in gmm_list:
                if g not in iter_training:
                    iter_bic_list.append((g, None))

            # Score all pairs of GMMs using BIC
            best_merged_gmm = None
            best_BIC_score = 0.0
            merged_tuple = None
            for gmm1idx in range(len(iter_bic_list)):
                for gmm2idx in range(gmm1idx + 1, len(iter_bic_list)):
                    g1, d1 = iter_bic_list[gmm1idx]
                    g2, d2 = iter_bic_list[gmm2idx]
                    score = 0.0
                    if d1 is not None or d2 is not None:
                        if d1 is not None and d2 is not None:
                            new_gmm, score = compute_distance_BIC(
                                g1, g2,
                                np.ascontiguousarray(np.concatenate((d1, d2))))
                        elif d1 is not None:
                            new_gmm, score = compute_distance_BIC(g1, g2, d1)
                        else:
                            new_gmm, score = compute_distance_BIC(g1, g2, d2)
                    if score > best_BIC_score:
                        best_merged_gmm = new_gmm
                        merged_tuple = (g1, g2)
                        best_BIC_score = score

            # Merge the winning candidate pair if it's desirable to do so
            if best_BIC_score > 0.0:
                gmm_list.remove(merged_tuple[0])
                gmm_list.remove(merged_tuple[1])
                gmm_list.append(best_merged_gmm)

        return [g.M for g in gmm_list]
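To make the initial split concrete, here is the arithmetic with hypothetical numbers (N = 1000 frames, init_num_clusters = 4):

per_cluster = 1000 / 4                             # 250 (Python 2 integer division)
boundaries = range(per_cluster, 1000, per_cluster) # [250, 500, 750]
# np.vsplit(self.X, boundaries) then yields four 250-frame blocks,
# one per initial GMM; the later 250-frame voting windows are the
# "2.5 secs of observations" mentioned in the comment above.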