def all_pairs_BIC_serial(self, iter_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs in a "serial" way and
    returns the pair with the best score.
    """
    #print "Serial execution"
    l = len(iter_bic_list)
    best_merged_gmm = None
    best_BIC_score = 0.0
    merged_tuple = None
    merged_tuple_indices = None

    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx+1, l):
            score = 0.0
            gidx1, idx1 = iter_bic_list[gmm1idx]
            gidx2, idx2 = iter_bic_list[gmm2idx]
            d1 = tools.get_data_from_indices(X, idx1)
            d2 = tools.get_data_from_indices(X, idx2)
            data = np.concatenate((d1, d2))
            g1 = gmm_list[gidx1]
            g2 = gmm_list[gidx2]
            new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
            if score > best_BIC_score:
                best_merged_gmm = new_gmm
                merged_tuple = (g1, g2)
                merged_tuple_indices = (gmm1idx, gmm2idx)
                best_BIC_score = score

    return best_merged_gmm, merged_tuple, merged_tuple_indices, best_BIC_score

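# Sketch: compute_distance_BIC() is called above but defined elsewhere in the
# repo. The helper below shows one plausible implementation of the BIC merge
# criterion, using scikit-learn's GaussianMixture as a stand-in for the
# project's own GMM class; the name, the (d1, d2) signature, and the sklearn
# models for g1/g2 are assumptions, not the repo's actual code.
def compute_distance_BIC_sketch(g1, g2, d1, d2, em_iters):
    import numpy as np
    from sklearn.mixture import GaussianMixture

    data = np.concatenate((d1, d2))
    # Hypothesis "merge": one GMM, with as many components as the pair
    # combined, explains the pooled data.
    merged = GaussianMixture(n_components=g1.n_components + g2.n_components,
                             max_iter=em_iters).fit(data)
    # Hypothesis "keep separate": each model is scored on its own data.
    bic_separate = g1.bic(d1) + g2.bic(d2)
    bic_merged = merged.bic(data)
    # Lower BIC is better in sklearn, so a positive score favours merging,
    # which matches the "score > best_BIC_score" test above.
    return merged, bic_separate - bic_merged
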
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs by using MapReduce and
    returns the pair with the best score.
    """
    print "Map-Reduce execution"

#        iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
#        pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
#        os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
#                                  S_IRGRP | S_IXGRP | \
#                                  S_IROTH | S_IXOTH)
    from subprocess import call
    call(["mkdir", "-p", "gmm"])
    for i in range(0, len(iteration_bic_list)):
        gidx, didx = iteration_bic_list[i]
        pickle.dump(gmm_list[gidx], open('gmm/'+str(i), 'w'))
        # Make each dumped model file readable by the workers.
        os.chmod('gmm/'+str(i), S_IRUSR | S_IWUSR | S_IXUSR | \
                                S_IRGRP | S_IXGRP | \
                                S_IROTH | S_IXOTH)

    import mrjob.util as util
    util.tar_and_gzip('gmm', 'gmm.tgz')

    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx+1, l):
            gidx1, didx1 = iteration_bic_list[gmm1idx]
            gidx2, didx2 = iteration_bic_list[gmm2idx]
            an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
            input.append(an_item+"\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (an issue with data ownership). If we
    # don't find a different workaround, we can simplify the mapper and the
    # reducer further: instead of moving the GMM pairs and merged GMMs from
    # the mappers to the reducer, we can move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    gidx1, idx1 = iteration_bic_list[ind1]
    gidx2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    g1 = gmm_list[gidx1]
    g2 = gmm_list[gidx2]
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)

    return new_gmm, (g1, g2), merged_tuple_indices, best_score

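# Sketch: AllPairsBicScoreMRJob is defined elsewhere in the repo. A minimal job
# honouring the contract used above -- input key (gmm1idx, gmm2idx), input
# value (didx1, didx2, em_iters), and a single output pair whose value is
# ((ind1, ind2), best_score) -- could look roughly like this. It assumes the
# gmm.tgz archive is unpacked to gmm/ on the workers and that 'self_X' is
# readable there (as in the reducer further down); tools and
# compute_distance_BIC are the repo's own helpers used above.
import pickle
import numpy as np
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol

class AllPairsBicScoreMRJobSketch(MRJob):
    INPUT_PROTOCOL = PickleProtocol
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleProtocol

    def mapper(self, pair_indices, value):
        didx1, didx2, em_iters = value
        gmm1idx, gmm2idx = pair_indices
        g1 = pickle.load(open('gmm/' + str(gmm1idx), 'r'))
        g2 = pickle.load(open('gmm/' + str(gmm2idx), 'r'))
        X = tools.binary_read('self_X')
        data = np.concatenate((tools.get_data_from_indices(X, didx1),
                               tools.get_data_from_indices(X, didx2)))
        _, score = compute_distance_BIC(g1, g2, data, em_iters)
        # A constant key routes every candidate pair to a single reducer.
        yield 'best', (score, pair_indices)

    def reducer(self, key, scored_pairs):
        best_score, best_indices = max(scored_pairs)
        yield key, (best_indices, best_score)
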
def segment_map(self, iter_item):
    gp, data_indices = iter_item
    g = gp[0]
    p = gp[1]
    cluster_data = tools.get_data_from_indices(self.X, data_indices)
    g.train(cluster_data, max_em_iters=self.em_iters)
    return (g, p, data_indices)

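# Sketch: segment_map() is folded with reduce(self.MRhelper.segment_reduce, ...)
# in segment_majority_vote below, with (iter_bic_dict, iter_bic_list) as the
# accumulator. A reducer compatible with that call and with segment_map's
# (g, p, data_indices) output could look like this; the name and exact shape
# are assumptions, and the repo's segment_reduce may differ. Since segment_map
# has already retrained g in place, only the indices need to be recorded.
def segment_reduce_sketch(accum, mapped_item):
    iter_bic_dict, iter_bic_list = accum
    g, p, data_indices = mapped_item
    iter_bic_list.append((p, data_indices))
    iter_bic_dict[p] = data_indices
    return iter_bic_dict, iter_bic_list
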
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters):
    """
    Computes the BIC score for all pairs by using MapReduce and
    returns the pair with the best score.
    """
    print "Map-Reduce execution"

    X = tools.binary_read('self_X')
    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx+1, l):
            g1, idx1 = iteration_bic_list[gmm1idx]
            g2, idx2 = iteration_bic_list[gmm2idx]
            d1 = tools.get_data_from_indices(X, idx1)
            d2 = tools.get_data_from_indices(X, idx2)
            data = np.concatenate((d1, d2))
            an_item = protocol().write((gmm1idx, gmm2idx), (g1, g2, data, em_iters))
            input.append(an_item+"\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (an issue with data ownership). If we
    # don't find a different workaround, we can simplify the mapper and the
    # reducer further: instead of moving the GMM pairs and merged GMMs from
    # the mappers to the reducer, we can move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    g1, idx1 = iteration_bic_list[ind1]
    g2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)

    return new_gmm, (g1, g2), merged_tuple_indices, best_score

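# Sketch: this second variant ships (g1, g2, data, em_iters) through the pickle
# protocol instead of distributing gmm.tgz and reading 'self_X' on the workers,
# so its mapper can work purely on its input value. A mapper/reducer pair
# matching the 4-tuple unpacked from kv_pairs[0][1] above might look roughly
# like this (again an assumption; the repo's AllPairsBicScoreMRJob is defined
# elsewhere).
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol

class AllPairsBicScoreDataShippingSketch(MRJob):
    INPUT_PROTOCOL = PickleProtocol
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleProtocol

    def mapper(self, pair_indices, value):
        g1, g2, data, em_iters = value
        new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
        yield 'best', (score, new_gmm, (g1, g2), pair_indices)

    def reducer(self, key, candidates):
        score, new_gmm, pair, pair_indices = max(candidates, key=lambda c: c[0])
        yield key, (new_gmm, pair, pair_indices, score)
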
def reducer(self, gmm_id, indices):
    gmm_id = int(gmm_id)
    gmm_list = pickle.load(open('self_gmmlist', 'r'))
    em_iter = pickle.load(open('self_em_iter', 'r'))
    X = tools.binary_read('self_X')
    data_indices = []
    for i in indices:
        data_indices.append(i)
    cluster_data = tools.get_data_from_indices(X, data_indices)
    gmm_list[gmm_id].train(cluster_data, max_em_iters=em_iter)
    yield (gmm_id, data_indices), gmm_list[gmm_id]

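# Sketch: the reducer above expects each key to be a GMM index and each value
# stream to be the (start, end) intervals voted to that GMM. A mapper feeding
# it, doing the same per-interval majority vote as the serial loop in
# segment_majority_vote below, could look like this; the name and exact input
# layout are assumptions.
def mapper_sketch(self, key, value):
    import numpy as np
    from scipy import stats
    labels, interval = value            # chunk of most_likely, (start, end)
    max_gmm = int(stats.mode(np.array(labels))[0][0])
    yield max_gmm, interval
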
def segment_majority_vote(self, interval_size, em_iters):
    cloud_flag = False
    hadoop = True
    self.em_iters = em_iters
    #print "In segment majority vote"

    # Resegment data based on likelihood scoring
    score_time = time.time()
    num_clusters = len(self.gmm_list)
    if cloud_flag == False:
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))
        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = likelihoods.argmax(axis=1)
    else:
        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        elif hadoop == False:
            map_res = map(self.MRhelper.score_map, self.gmm_list)
            most_likely = reduce(self.MRhelper.score_reduce, map_res).argmax(axis=1)  #likelihoods.argmax
        else:
            lst = self.MRhelper.score_using_mapreduce(self.gmm_list)
            likelihoods = lst[0]
            for l in lst[1:]:
                likelihoods = np.column_stack((likelihoods, l))
            most_likely = likelihoods.argmax(axis=1)
    self.ftime.write("Score: {0}\n".format(time.time() - score_time))

    cloud_flag = True
    segment_time = time.time()
    if cloud_flag == False:
        # Across 2.5 secs of observations, vote on which cluster they should be associated with
        iter_training = {}
        iter_indices = {}
        for i in range(interval_size, self.N, interval_size):
            arr = np.array(most_likely[(range(i-interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append(self.X[i-interval_size:i, :])
            iter_indices.setdefault((self.gmm_list[max_gmm], max_gmm), []).append((i-interval_size, i))

        arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append(self.X[(self.N/interval_size)*interval_size:self.N, :])
        iter_indices.setdefault((self.gmm_list[max_gmm], max_gmm), []).append(((self.N/interval_size)*interval_size, self.N))

        # for each gmm, append all the segments and retrain
        iter_bic_dict = {}
        iter_bic_list = []

#            for gp, data_list in iter_training.iteritems():
#                g = gp[0]
#                p = gp[1]
#                cluster_data = data_list[0]
#
#                for d in data_list[1:]:
#                    cluster_data = np.concatenate((cluster_data, d))
##
##                if self.compare_data_list(cluster_data, iter_bic_dict2[p]) == False:
##                    sys.exit()
##                cluster_data = iter_bic_dict2[p]
#                g.train(cluster_data, max_em_iters=em_iters)
#
#                iter_bic_list.append((g, cluster_data))
#                iter_bic_dict[p] = cluster_data

        for gp, data_indices in iter_indices.iteritems():
            g, p = gp
            cluster_data = tools.get_data_from_indices(self.X, data_indices)
            g.train(cluster_data, max_em_iters=em_iters)
            iter_bic_list.append((p, data_indices))
            iter_bic_dict[p] = data_indices

    elif hadoop == False:
        # Across 2.5 secs of observations, vote on which cluster they should be associated with
        iter_training = {}
        map_input = zip(np.hsplit(np.array(most_likely), range(interval_size, len(most_likely), interval_size)),
                        map(lambda(x): (x, x+interval_size), range(0, len(most_likely), interval_size)))
        map_res = map(self.MRhelper.vote_map, map_input)
        map_res.insert(0, iter_training)
        iter_training = reduce(self.MRhelper.vote_reduce, map_res)

        # for each gmm, append all the segments and retrain
        iter_bic_dict = {}
        iter_bic_list = []
        map_res = map(self.MRhelper.segment_map, iter_training.iteritems())
        map_res.insert(0, (iter_bic_dict, iter_bic_list))
        iter_bic_dict, iter_bic_list = reduce(self.MRhelper.segment_reduce, map_res)

    else:
        map_input = zip(np.hsplit(np.array(most_likely), range(interval_size, len(most_likely), interval_size)),
                        map(lambda(x): (x, x+interval_size), range(0, len(most_likely), interval_size)))
        iter_bic_dict, iter_bic_list = self.MRhelper.segment_using_mapreduce(self.gmm_list, map_input, em_iters)

    self.ftime.write("Segment: {0}\n".format(time.time() - segment_time))
    return iter_bic_dict, iter_bic_list, most_likely

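# Sketch: the non-Hadoop cloud path above relies on self.MRhelper.score_map and
# self.MRhelper.score_reduce, which are defined elsewhere. Helpers that would
# reproduce the serial likelihood matrix (one column of per-frame scores per
# GMM, column_stacked together before argmax) could look like this; the names
# and the assumption that the helper object carries self.X are mine, not the
# repo's.
def score_map_sketch(self, g):
    # One column of per-frame likelihood scores for a single GMM.
    return g.score(self.X)

def score_reduce_sketch(self, left, right):
    # Fold step: accumulate the per-GMM columns into the likelihood matrix.
    import numpy as np
    return np.column_stack((left, right))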