def all_pairs_BIC_serial(self, iter_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs in a "serial" way and returns
    the pair with the best score.

    iter_bic_list -- list of (gmm_index, data_indices) tuples: gmm_index
                     selects a model in gmm_list; data_indices selects that
                     model's observations in X (via tools.get_data_from_indices)
    em_iters      -- EM iteration count forwarded to compute_distance_BIC
    X             -- full observation set referenced by the index lists
    gmm_list      -- candidate GMMs

    Returns (best_merged_gmm, (g1, g2), (idx1, idx2), best_BIC_score).
    When no pair scores above 0 the first three results stay None.
    """
    l = len(iter_bic_list)
    best_merged_gmm = None
    best_BIC_score = 0.0
    merged_tuple = None
    merged_tuple_indices = None
    for gmm1idx in range(l):
        # The first member of the pair is invariant across the inner loop,
        # so fetch its model and data once instead of once per pair.
        gidx1, idx1 = iter_bic_list[gmm1idx]
        d1 = tools.get_data_from_indices(X, idx1)
        g1 = gmm_list[gidx1]
        for gmm2idx in range(gmm1idx+1, l):
            gidx2, idx2 = iter_bic_list[gmm2idx]
            d2 = tools.get_data_from_indices(X, idx2)
            data = np.concatenate((d1, d2))
            g2 = gmm_list[gidx2]
            new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
            if score > best_BIC_score:
                best_merged_gmm = new_gmm
                merged_tuple = (g1, g2)
                merged_tuple_indices = (gmm1idx, gmm2idx)
                best_BIC_score = score
    return best_merged_gmm, merged_tuple, merged_tuple_indices, best_BIC_score
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list): """ Computes the BIC score for all pairs by using MapReduce and returns the pair with the best score """ print "Map-Reduce execution" # iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list) # pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w')) # os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \ # S_IRGRP | S_IXGRP | \ # S_IROTH | S_IXOTH ) from subprocess import call call(["mkdir", "-p", "gmm"]) for i in range (0, len(iteration_bic_list)): gidx, didx = iteration_bic_list[i] pickle.dump(gmm_list[gidx], open('gmm/'+str(i), 'w')) os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \ S_IRGRP | S_IXGRP | \ S_IROTH | S_IXOTH ) import mrjob.util as util util.tar_and_gzip('gmm', 'gmm.tgz') input = [] l = len(iteration_bic_list) for gmm1idx in range(l): for gmm2idx in range(gmm1idx+1, l): gidx1, didx1 = iteration_bic_list[gmm1idx] gidx2, didx2 = iteration_bic_list[gmm2idx] an_item = protocol().write((gmm1idx,gmm2idx),(didx1, didx2, em_iters)) input.append(an_item+"\n") mr_args = ['-v', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle'] job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input) runner = job.make_runner() runner.run() kv_pairs = map(job.parse_output_line, runner.stream_output()) assert len(kv_pairs) == 1 merged_tuple_indices, best_score = kv_pairs[0][1] # Re-merge the GMM pair with the highest score *here*, otherwise the next # segment_majority_vote will crash (issue with data ownership). If we don't # find a different workaround, we can simplify more the mapper and the reducer. # Essentially, we can avoid moving from mappers to the reducer the GMM pairs and # merged GMMs. Instead, we can move just indices and scores. # However, this re-merging is serialized... 
ind1, ind2 = merged_tuple_indices gidx1, idx1 = iteration_bic_list[ind1] gidx2, idx2 = iteration_bic_list[ind2] d1 = tools.get_data_from_indices(X, idx1) d2 = tools.get_data_from_indices(X, idx2) data = np.concatenate((d1,d2)) g1 = gmm_list[gidx1] g2 = gmm_list[gidx2] new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters) return new_gmm, (g1, g2), merged_tuple_indices, best_score
def mapper(self, key, value):
    """
    Each mapper computes the BIC score for one GMM pair.

    key   -- (index1, index2): the pair's positions in the driver's list
    value -- (g1, g2, data, em_iters): the two GMMs, their combined
             observation data, and the EM iteration count

    Yields (1, (score, merged_gmm, g1, g2, index1, index2)) under a
    constant key so a single reducer sees every pair's result.
    """
    index1, index2 = key
    g1, g2, data, em_iters = value
    # The original wrapped this call in a try/except that only re-raised,
    # with dead "new_gmm = g1; score = 0" initializers; any exception still
    # propagates, so the wrapper is removed without changing behavior.
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    data_to_yield = (score, new_gmm, g1, g2, index1, index2)
    yield 1, data_to_yield
def mapper(self, key, value):
    """
    Each mapper computes the BIC score for a GMM pair.

    key   -- (index1, index2) positions of the pair in the driver's list
    value -- (g1, g2, data, em_iters) payload for compute_distance_BIC

    Yields (1, (score, merged_gmm, g1, g2, index1, index2)); the constant
    key funnels all pairs to one reducer.
    """
    index1, index2 = key
    g1, g2, data, em_iters = value
    # Removed a no-op try/except (it only re-raised) and the dead
    # "new_gmm = g1; score = 0" fallbacks it shadowed.
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    data_to_yield = (score, new_gmm, g1, g2, index1, index2)
    yield 1, data_to_yield
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters): """ Computes the BIC score for all pairs by using MapReduce and returns the pair with the best score """ print "Map-Reduce execution" X = tools.binary_read('self_X') input = [] l = len(iteration_bic_list) for gmm1idx in range(l): for gmm2idx in range(gmm1idx+1, l): g1, idx1 = iteration_bic_list[gmm1idx] g2, idx2 = iteration_bic_list[gmm2idx] d1 = tools.get_data_from_indices(X, idx1) d2 = tools.get_data_from_indices(X, idx2) data = np.concatenate((d1, d2)) an_item = protocol().write((gmm1idx,gmm2idx),(g1, g2, data, em_iters)) input.append(an_item+"\n") mr_args = ['-v', '-r', 'hadoop','--input-protocol', 'pickle','--output-protocol','pickle','--protocol','pickle'] job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input) runner = job.make_runner() runner.run() kv_pairs = map(job.parse_output_line, runner.stream_output()) assert len(kv_pairs) == 1 best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1] # Re-merge the GMM pair with the highest score *here*, otherwise the next # segment_majority_vote will crash (issue with data ownership). If we don't # find a different workaround, we can simplify more the mapper and the reducer. # Essentially, we can avoid moving from mappers to the reducer the GMM pairs and # merged GMMs. Instead, we can move just indices and scores. # However, this re-merging is serialized... ind1, ind2 = merged_tuple_indices g1, idx1 = iteration_bic_list[ind1] g2, idx2 = iteration_bic_list[ind2] d1 = tools.get_data_from_indices(X, idx1) d2 = tools.get_data_from_indices(X, idx2) data = np.concatenate((d1,d2)) new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters) return new_gmm, (g1, g2), merged_tuple_indices, best_score
def mapper(self, key, value):
    """
    Each mapper computes the BIC score for a GMM pair.

    key   -- (index1, index2): pair positions in the driver's list; also
             the pickled models' file names inside gmm.tgz
    value -- (didx1, didx2, em_iters): per-model data indices into the
             shared 'self_X' file plus the EM iteration count

    Yields (1, (score, index1, index2)) so one reducer sees every pair.
    Per-phase wall-clock timings are written to stderr.
    """
    overall = time.time()
    index1, index2 = key
    didx1, didx2, em_iters = value

    # Read only this pair's observations straight from the shared file
    # (avoids loading all of self_X into memory per task).
    t = time.time()
    d1 = tools.get_data_from_file_from_indices('self_X', didx1)
    d2 = tools.get_data_from_file_from_indices('self_X', didx2)
    sys.stderr.write("get_data_from_file_from_indices: {0}\n".format(time.time()-t))
    data = np.concatenate((d1, d2))

    # Unpack the shipped archive and load the two pickled models.
    t = time.time()
    util.unarchive('gmm.tgz', 'gmm')
    g1 = pickle.load(open('gmm/'+str(index1), 'r'))
    g2 = pickle.load(open('gmm/'+str(index2), 'r'))
    sys.stderr.write("read iter_gmm_list: {0}\n".format(time.time()-t))

    # Removed a no-op try/except (it only re-raised) and the dead
    # "new_gmm = g1; score = 0" fallbacks; failures still fail the task.
    t = time.time()
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    data_to_yield = (score, index1, index2)
    sys.stderr.write("compute_distance_BIC: {0}\n".format(time.time()-t))
    sys.stderr.write("total BIC time: {0}\n".format(time.time()-overall))
    yield 1, data_to_yield
def do_bic_agglomeration(self, gmm_list):
    """
    Hierarchically agglomerate the GMMs in gmm_list using BIC scores.

    Starts by training each GMM on an equal slice of self.X, then
    repeatedly resegments, votes per 250-frame window, retrains, and
    merges the best-scoring pair until no merge improves the BIC score.

    Mutates gmm_list in place (merged pairs are removed, the merged model
    appended). Returns the list of mixture sizes [g.M for g in gmm_list].

    NOTE(review): assumes 100 frames/sec so 250 frames = 2.5 s, per the
    original comment -- confirm against the feature extraction step.
    """
    # Get the events, divide them into an initial k clusters and train each GMM on a cluster
    per_cluster = self.N / self.init_num_clusters
    init_training = zip(
        gmm_list,
        np.vsplit(self.X, range(per_cluster, self.N, per_cluster)))
    for g, x in init_training:
        g.train(x)

    # Perform hierarchical agglomeration based on BIC scores.
    # 1.0 is a sentinel that guarantees at least one iteration.
    best_BIC_score = 1.0
    while (best_BIC_score > 0 and len(gmm_list) > 1):
        num_clusters = len(gmm_list)
        # Resegment data based on likelihood scoring
        likelihoods = gmm_list[0].score(self.X)
        for g in gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))
        most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with
        iter_training = {}
        for i in range(250, self.N, 250):
            votes = np.zeros(num_clusters)
            for j in range(i - 250, i):
                votes[most_likely[j]] += 1
            iter_training.setdefault(gmm_list[votes.argmax()],
                                     []).append(self.X[i - 250:i, :])
        # Tail window: the final partial block of fewer than 250 frames.
        votes = np.zeros(num_clusters)
        for j in range((self.N / 250) * 250, self.N):
            votes[most_likely[j]] += 1
        iter_training.setdefault(gmm_list[votes.argmax()], []).append(
            self.X[(self.N / 250) * 250:self.N, :])

        # Retrain the GMMs on the clusters for which they were voted most likely and
        # make a list of candidates for merging
        iter_bic_list = []
        for g, data_list in iter_training.iteritems():
            # Single concatenate instead of the original pairwise loop,
            # which re-copied the accumulated array per chunk (O(n^2)).
            cluster_data = np.ascontiguousarray(np.concatenate(data_list))
            g.train(cluster_data)
            iter_bic_list.append((g, cluster_data))

        # Keep any GMMs that lost all votes in candidate list for merging
        for g in gmm_list:
            # Dict membership instead of ".keys()" (which builds a list
            # per test under Python 2) -- same semantics, O(1).
            if g not in iter_training:
                iter_bic_list.append((g, None))

        # Score all pairs of GMMs using BIC
        best_merged_gmm = None
        best_BIC_score = 0.0
        merged_tuple = None
        for gmm1idx in range(len(iter_bic_list)):
            for gmm2idx in range(gmm1idx + 1, len(iter_bic_list)):
                g1, d1 = iter_bic_list[gmm1idx]
                g2, d2 = iter_bic_list[gmm2idx]
                score = 0.0
                # Pairs where both members lost all votes have no data to
                # score on and keep score == 0.0 (never the best).
                if d1 is not None or d2 is not None:
                    if d1 is not None and d2 is not None:
                        new_gmm, score = compute_distance_BIC(
                            g1, g2,
                            np.ascontiguousarray(np.concatenate((d1, d2))))
                    elif d1 is not None:
                        new_gmm, score = compute_distance_BIC(g1, g2, d1)
                    else:
                        new_gmm, score = compute_distance_BIC(g1, g2, d2)
                if score > best_BIC_score:
                    best_merged_gmm = new_gmm
                    merged_tuple = (g1, g2)
                    best_BIC_score = score

        # Merge the winning candidate pair if it's desirable to do so
        if best_BIC_score > 0.0:
            gmm_list.remove(merged_tuple[0])
            gmm_list.remove(merged_tuple[1])
            gmm_list.append(best_merged_gmm)

    return [g.M for g in gmm_list]
def do_bic_agglomeration(self, gmm_list):
    """
    Hierarchically agglomerate the GMMs in gmm_list using BIC scores.

    Trains each GMM on an equal slice of self.X, then loops: resegment by
    per-frame likelihood, vote per 250-frame window, retrain the winners,
    BIC-score all candidate pairs, and merge the best pair while its score
    stays positive. Mutates gmm_list in place; returns [g.M for g in gmm_list].
    """
    # Get the events, divide them into an initial k clusters and train each GMM on a cluster
    per_cluster = self.N/self.init_num_clusters
    init_training = zip(gmm_list,np.vsplit(self.X, range(per_cluster, self.N, per_cluster)))
    for g, x in init_training:
        g.train(x)

    # Perform hierarchical agglomeration based on BIC scores
    # (1.0 is a sentinel value so the loop body runs at least once)
    best_BIC_score = 1.0
    while (best_BIC_score > 0 and len(gmm_list) > 1):
        num_clusters = len(gmm_list)
        # Resegment data based on likelihood scoring
        likelihoods = gmm_list[0].score(self.X)
        for g in gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))
        most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with
        # (250 frames per window; presumably 100 frames/sec -- confirm upstream)
        iter_training = {}
        for i in range(250, self.N, 250):
            votes = np.zeros(num_clusters)
            for j in range(i-250, i):
                votes[most_likely[j]] += 1
            iter_training.setdefault(gmm_list[votes.argmax()],[]).append(self.X[i-250:i,:])
        # Handle the final partial window of fewer than 250 frames.
        votes = np.zeros(num_clusters)
        for j in range((self.N/250)*250, self.N):
            votes[most_likely[j]] += 1
        iter_training.setdefault(gmm_list[votes.argmax()],[]).append(self.X[(self.N/250)*250:self.N,:])

        # Retrain the GMMs on the clusters for which they were voted most likely and
        # make a list of candidates for merging
        iter_bic_list = []
        for g, data_list in iter_training.iteritems():
            cluster_data = data_list[0]
            for d in data_list[1:]:
                cluster_data = np.concatenate((cluster_data, d))
            cluster_data = np.ascontiguousarray(cluster_data)
            g.train(cluster_data)
            iter_bic_list.append((g,cluster_data))

        # Keep any GMMs that lost all votes in candidate list for merging
        for g in gmm_list:
            if g not in iter_training.keys():
                iter_bic_list.append((g,None))

        # Score all pairs of GMMs using BIC
        best_merged_gmm = None
        best_BIC_score = 0.0
        merged_tuple = None
        for gmm1idx in range(len(iter_bic_list)):
            for gmm2idx in range(gmm1idx+1, len(iter_bic_list)):
                g1, d1 = iter_bic_list[gmm1idx]
                g2, d2 = iter_bic_list[gmm2idx]
                score = 0.0
                # Pairs where both candidates have no data keep score == 0.0
                # and can never become the best pair.
                if d1 is not None or d2 is not None:
                    if d1 is not None and d2 is not None:
                        new_gmm, score = compute_distance_BIC(g1, g2, np.ascontiguousarray(np.concatenate((d1, d2))))
                    elif d1 is not None:
                        new_gmm, score = compute_distance_BIC(g1, g2, d1)
                    else:
                        new_gmm, score = compute_distance_BIC(g1, g2, d2)
                if score > best_BIC_score:
                    best_merged_gmm = new_gmm
                    merged_tuple = (g1, g2)
                    best_BIC_score = score

        # Merge the winning candidate pair if it's desirable to do so
        if best_BIC_score > 0.0:
            gmm_list.remove(merged_tuple[0])
            gmm_list.remove(merged_tuple[1])
            gmm_list.append(best_merged_gmm)

    return [ g.M for g in gmm_list]