def all_pairs_BIC_serial(self, iter_bic_list, em_iters, X, gmm_list):
     """
     Computes the BIC score for all pairs in a "serial" way and returns
     the pair with the best score
     """
     #print "Serial execution"
         
     l = len(iter_bic_list)
     best_merged_gmm = None
     best_BIC_score = 0.0
     merged_tuple = None
     merged_tuple_indices = None
     
     for gmm1idx in range(l):
         for gmm2idx in range(gmm1idx+1, l):
             score = 0.0
             gidx1, idx1 = iter_bic_list[gmm1idx]
             gidx2, idx2 = iter_bic_list[gmm2idx] 
             d1 = tools.get_data_from_indices(X, idx1)
             d2 = tools.get_data_from_indices(X, idx2)
             data = np.concatenate((d1, d2))
             g1 = gmm_list[gidx1]
             g2 = gmm_list[gidx2]
             new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
             
             if score > best_BIC_score: 
                 best_merged_gmm = new_gmm
                 merged_tuple = (g1, g2)
                 merged_tuple_indices = (gmm1idx, gmm2idx)
                 best_BIC_score = score
     
     return best_merged_gmm, merged_tuple, merged_tuple_indices, best_BIC_score
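
# ----------------------------------------------------------------------
# For reference: a minimal, hypothetical sketch of the classic delta-BIC
# merge criterion that compute_distance_BIC (called above) is assumed to
# approximate.  This is NOT the project's implementation (which receives the
# already-pooled data); the signature, the GMM constructor and the
# train()/score() calls below are assumptions based on how the gmm_list
# entries are used elsewhere in this file.
def delta_bic_sketch(g1, d1, g2, d2, em_iters):
    pooled = np.concatenate((d1, d2))
    merged = GMM(g1.M + g2.M, g1.D)                # keep the total component count
    merged.train(pooled, max_em_iters=em_iters)
    # Log-likelihood of the pooled data under the single merged model ...
    merged_ll = merged.score(pooled).sum()
    # ... versus each parent scored on its own segment.
    separate_ll = g1.score(d1).sum() + g2.score(d2).sum()
    # With the total number of parameters held constant, the BIC penalty terms
    # cancel and the score reduces to the likelihood gain of merging; a positive
    # value favours the merge, matching the `score > best_BIC_score` test above.
    return merged, merged_ll - separate_ll
# --- end of sketch ---
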
    def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
        """
        Computes the BIC score for all pairs by using MapReduce and returns
        the pair with the best score
        """
        
        print "Map-Reduce execution"
#        iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
#        pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
#        os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
#                                 S_IRGRP | S_IXGRP |           \
#                                 S_IROTH | S_IXOTH             )
        
        from subprocess import call
        call(["mkdir", "-p", "gmm"])
        for i in range(len(iteration_bic_list)):
            gidx, didx = iteration_bic_list[i]
            # Dump each GMM to gmm/<i> and make the file readable by the MR workers.
            pickle.dump(gmm_list[gidx], open('gmm/'+str(i), 'w'))
            os.chmod('gmm/'+str(i), S_IRUSR | S_IWUSR | S_IXUSR | \
                                    S_IRGRP | S_IXGRP |           \
                                    S_IROTH | S_IXOTH             )
        import mrjob.util as util
        util.tar_and_gzip('gmm', 'gmm.tgz') 
        
        input = []
        l = len(iteration_bic_list)
        for gmm1idx in range(l):
            for gmm2idx in range(gmm1idx+1, l):
                gidx1, didx1 = iteration_bic_list[gmm1idx]
                gidx2, didx2 = iteration_bic_list[gmm2idx] 
                an_item = protocol().write((gmm1idx,gmm2idx),(didx1, didx2, em_iters))
                input.append(an_item+"\n")     
        
        mr_args = ['-v', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']
        job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
        runner = job.make_runner()
        runner.run()
        kv_pairs = map(job.parse_output_line, runner.stream_output())
        assert len(kv_pairs) == 1
        merged_tuple_indices, best_score = kv_pairs[0][1]
    
        # Re-merge the GMM pair with the highest score *here*; otherwise the next
        # segment_majority_vote call will crash (data-ownership issue). Unless a
        # different workaround turns up, this also lets us simplify the mapper and
        # the reducer further: instead of shipping the GMM pairs and the merged GMMs
        # from the mappers to the reducer, we only need to ship indices and scores.
        # Note, however, that this re-merging step is serialized.
        ind1, ind2 = merged_tuple_indices
        gidx1, idx1 = iteration_bic_list[ind1]
        gidx2, idx2 = iteration_bic_list[ind2]
        d1 = tools.get_data_from_indices(X, idx1)
        d2 = tools.get_data_from_indices(X, idx2)
        data = np.concatenate((d1,d2))
        g1 = gmm_list[gidx1]
        g2 = gmm_list[gidx2]
        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
            
        return new_gmm, (g1, g2), merged_tuple_indices, best_score
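
# ----------------------------------------------------------------------
# Hypothetical sketch of the job driven above.  The real AllPairsBicScoreMRJob
# is defined elsewhere; this only shows a shape that is consistent with how its
# output is consumed above (exactly one key/value pair whose value is
# (merged_tuple_indices, best_score)) and with the pickled 'gmm/<idx>' files
# prepared by the driver (shipped to the workers via the gmm.tgz archive).
# The 'self_X' dump, and the pickle/np/tools/compute_distance_BIC names, are
# assumed to be available as in the rest of this module.
from mrjob.job import MRJob

class AllPairsBicScoreMRJobSketch(MRJob):

    def mapper(self, pair_indices, payload):
        # pair_indices == (gmm1idx, gmm2idx), payload == (didx1, didx2, em_iters)
        didx1, didx2, em_iters = payload
        g1 = pickle.load(open('gmm/' + str(pair_indices[0]), 'r'))
        g2 = pickle.load(open('gmm/' + str(pair_indices[1]), 'r'))
        X = tools.binary_read('self_X')
        data = np.concatenate((tools.get_data_from_indices(X, didx1),
                               tools.get_data_from_indices(X, didx2)))
        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
        # Funnel every scored pair to a single reducer key so one reducer
        # can pick the best merge.
        yield 'best', (score, pair_indices)

    def reducer(self, key, scored_pairs):
        best_score, best_indices = max(scored_pairs)
        yield key, (best_indices, best_score)
# --- end of sketch ---
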
    def segment_map(self, iter_item):
        gp, data_indices = iter_item
        g, p = gp
        cluster_data = tools.get_data_from_indices(self.X, data_indices)

        g.train(cluster_data, max_em_iters=self.em_iters)
        return (g, p, data_indices)
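
# ----------------------------------------------------------------------
# Hypothetical companion to segment_map above: the segment_reduce step that the
# functional (non-Hadoop) path in segment_majority_vote folds over with reduce().
# The accumulator layout (iter_bic_dict, iter_bic_list) and the
# (g, p, data_indices) items match how segment_map's output is consumed there;
# the project's real reducer may differ.
def segment_reduce_sketch(accumulator, mapped_item):
    iter_bic_dict, iter_bic_list = accumulator
    g, p, data_indices = mapped_item          # g is already retrained by segment_map
    iter_bic_list.append((p, data_indices))   # same (gmm index, data indices) pairs as the serial path
    iter_bic_dict[p] = data_indices
    return iter_bic_dict, iter_bic_list
# --- end of sketch ---
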
 def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters):
     """
     Computes the BIC score for all pairs by using MapReduce and returns
     the pair with the best score
     """
     
     print "Map-Reduce execution"
     X = tools.binary_read('self_X')
     input = []
     l = len(iteration_bic_list)
     for gmm1idx in range(l):
         for gmm2idx in range(gmm1idx+1, l):
             g1, idx1 = iteration_bic_list[gmm1idx]
             g2, idx2 = iteration_bic_list[gmm2idx] 
             d1 = tools.get_data_from_indices(X, idx1)
             d2 = tools.get_data_from_indices(X, idx2)
             data = np.concatenate((d1, d2))
             an_item = protocol().write((gmm1idx,gmm2idx),(g1, g2, data, em_iters))
             input.append(an_item+"\n")     
     
     mr_args = ['-v', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle']
     job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
     runner = job.make_runner()
     runner.run()
     kv_pairs = map(job.parse_output_line, runner.stream_output())
     assert len(kv_pairs) == 1
     best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1]
 
     # Re-merge the GMM pair with the highest score *here*; otherwise the next
     # segment_majority_vote call will crash (data-ownership issue). Unless a
     # different workaround turns up, this also lets us simplify the mapper and
     # the reducer further: instead of shipping the GMM pairs and the merged GMMs
     # from the mappers to the reducer, we only need to ship indices and scores.
     # Note, however, that this re-merging step is serialized.
     ind1, ind2 = merged_tuple_indices
     g1, idx1 = iteration_bic_list[ind1]
     g2, idx2 = iteration_bic_list[ind2]
     d1 = tools.get_data_from_indices(X, idx1)
     d2 = tools.get_data_from_indices(X, idx2)
     data = np.concatenate((d1,d2))
     new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
         
     return new_gmm, (g1, g2), merged_tuple_indices, best_score
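
# The protocol() calls above are assumed to refer to an mrjob pickle protocol,
# e.g. imported roughly as below elsewhere in this module; each write(key, value)
# then yields one serialized line that the sandboxed job reads from stdin.
from mrjob.protocol import PickleProtocol as protocol
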
Example #5
 def reducer(self, gmm_id, indices):
     gmm_id = int(gmm_id)
     gmm_list = pickle.load(open('self_gmmlist', 'r'))
     em_iter = pickle.load(open('self_em_iter', 'r'))
     X = tools.binary_read('self_X')
     
     data_indices = list(indices)
     cluster_data = tools.get_data_from_indices(X, data_indices)
     gmm_list[gmm_id].train(cluster_data, max_em_iters=em_iter)
     yield (gmm_id, data_indices), gmm_list[gmm_id]
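
# Hypothetical mapper that would pair with the reducer above: each input item is
# assumed to be (chunk_of_most_likely, (start, end)), i.e. the same map_input
# built in segment_majority_vote below.  The majority-voted GMM index becomes the
# key, so the reducer can gather every interval owned by that GMM and retrain it
# on the concatenated data.  The project's real mapper may differ.
def vote_mapper_sketch(self, _, chunk_item):
    chunk, interval = chunk_item
    majority_gmm = int(stats.mode(np.array(chunk))[0][0])
    yield majority_gmm, interval
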
Example #6
    def segment_majority_vote(self, interval_size, em_iters):
        
        cloud_flag = False  # False: score the clusters serially; True: go through the map/reduce helpers
        hadoop = True       # within the map/reduce path: True runs the Hadoop MRJob, False a local map()/reduce()
        self.em_iters = em_iters
        #print "In segment majority vote"
        # Resegment data based on likelihood scoring
        score_time = time.time()
        num_clusters = len(self.gmm_list)
        if cloud_flag == False:
            likelihoods = self.gmm_list[0].score(self.X)
            for g in self.gmm_list[1:]:
                likelihoods = np.column_stack((likelihoods, g.score(self.X)))
    
            if num_clusters == 1:
                most_likely = np.zeros(len(self.X))
            else:
                most_likely = likelihoods.argmax(axis=1)            
        else:
            if num_clusters == 1:
                most_likely = np.zeros(len(self.X))
            elif hadoop == False:
                map_res = map(self.MRhelper.score_map, self.gmm_list)
                most_likely = reduce(self.MRhelper.score_reduce, map_res).argmax(axis=1) #likelihoods.argmax
            else:
                lst = self.MRhelper.score_using_mapreduce(self.gmm_list)
                likelihoods = lst[0]
                for l in lst[1:]:
                    likelihoods = np.column_stack((likelihoods, l))
                most_likely = likelihoods.argmax(axis=1)
                
        self.ftime.write("Score: {0}\n".format(time.time() - score_time))
        cloud_flag = True   # the segmentation step below always goes through the map/reduce path
        segment_time = time.time()
        if cloud_flag == False:
            # Across 2.5 secs of observations, vote on which cluster they should be associated with
            iter_training = {}
            iter_indices = {}
            for i in range(interval_size, self.N, interval_size):
    
                arr = np.array(most_likely[i-interval_size:i])
                max_gmm = int(stats.mode(arr)[0][0])
                iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append(self.X[i-interval_size:i,:])
                iter_indices.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append((i-interval_size, i))
    
            arr = np.array(most_likely[(self.N/interval_size)*interval_size:self.N])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).append(self.X[(self.N/interval_size)*interval_size:self.N,:])            
            iter_indices.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append((((self.N)/interval_size)*interval_size, self.N))
            

            # for each gmm, append all the segments and retrain
            iter_bic_dict = {}
            iter_bic_list = [] 
#            for gp, data_list in iter_training.iteritems():
#                g = gp[0]
#                p = gp[1]
#                cluster_data =  data_list[0]
#    
#                for d in data_list[1:]:
#                    cluster_data = np.concatenate((cluster_data, d))
##                
##                if self.compare_data_list(cluster_data, iter_bic_dict2[p]) == False:
##                    sys.exit()
##                cluster_data = iter_bic_dict2[p]
#                g.train(cluster_data, max_em_iters=em_iters)
#    
#                iter_bic_list.append((g,cluster_data))
#                iter_bic_dict[p] = cluster_data
                
            for gp, data_indices in iter_indices.iteritems():
                g, p = gp
                cluster_data = tools.get_data_from_indices(self.X, data_indices)
                    
                g.train(cluster_data, max_em_iters=em_iters)
                iter_bic_list.append((p, data_indices))
                iter_bic_dict[p] = data_indices
                
                
        elif hadoop == False:
            # Across 2.5 secs of observations, vote on which cluster they should be associated with
            iter_training = {}
            map_input = zip(np.hsplit(np.array(most_likely), range(interval_size, len(most_likely), interval_size)),
                            map(lambda x: (x, x+interval_size), range(0, len(most_likely), interval_size)))
            map_res = map(self.MRhelper.vote_map, map_input)
            map_res.insert(0, iter_training)
            iter_training = reduce(self.MRhelper.vote_reduce, map_res)
            
            # for each gmm, append all the segments and retrain
            iter_bic_dict = {}
            iter_bic_list = []   
            map_res = map(self.MRhelper.segment_map, iter_training.iteritems())
            map_res.insert(0, (iter_bic_dict, iter_bic_list))
            iter_bic_dict, iter_bic_list = reduce(self.MRhelper.segment_reduce, map_res)
        else:
            map_input = zip(np.hsplit(np.array(most_likely), range(interval_size, len(most_likely), interval_size)),
                            map(lambda x: (x, x+interval_size), range(0, len(most_likely), interval_size)))
            iter_bic_dict, iter_bic_list = self.MRhelper.segment_using_mapreduce(self.gmm_list, map_input, em_iters)

        
        self.ftime.write("Segment: {0}\n".format(time.time() - segment_time))

        return iter_bic_dict, iter_bic_list, most_likely
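
# ----------------------------------------------------------------------
# Hypothetical outer loop tying the pieces above together (not part of the
# original file): resegment, score every GMM pair, and keep merging while the
# best BIC gain stays positive.  Method names follow the code above; the object
# holding them and the cluster bookkeeping are simplified assumptions.
def cluster_sketch(diarizer, interval_size, em_iters):
    most_likely = None
    while len(diarizer.gmm_list) > 1:
        _, iter_bic_list, most_likely = diarizer.segment_majority_vote(interval_size, em_iters)
        best_gmm, merged_pair, merged_idx, best_score = diarizer.all_pairs_BIC_serial(
            iter_bic_list, em_iters, diarizer.X, diarizer.gmm_list)
        if best_score <= 0.0:
            break               # no merge improves the BIC any further
        # Replace the two merged parents with the newly trained merged GMM.
        diarizer.gmm_list = [g for g in diarizer.gmm_list if g not in merged_pair]
        diarizer.gmm_list.append(best_gmm)
    return diarizer.gmm_list, most_likely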