def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs by using MapReduce and returns
    the pair with the best score
    """
    print "Map-Reduce execution"

#    iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
#    pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
#    os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
#                              S_IRGRP | S_IXGRP | \
#                              S_IROTH | S_IXOTH)

    # Pickle each candidate GMM into its own file under gmm/ so the whole set
    # can be shipped to the Hadoop workers as a single archive.
    from subprocess import call
    call(["mkdir", "-p", "gmm"])
    for i in range(len(iteration_bic_list)):
        gidx, didx = iteration_bic_list[i]
        pickle.dump(gmm_list[gidx], open('gmm/' + str(i), 'w'))
        # Make the pickled GMM file readable by the workers.
        os.chmod('gmm/' + str(i), S_IRUSR | S_IWUSR | S_IXUSR |
                                  S_IRGRP | S_IXGRP |
                                  S_IROTH | S_IXOTH)
    import mrjob.util as util
    util.tar_and_gzip('gmm', 'gmm.tgz')

    # One input line per GMM pair; only the pair's indices into
    # iteration_bic_list and the EM iteration count are shipped inline,
    # the GMMs themselves travel in gmm.tgz.
    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx + 1, l):
            gidx1, didx1 = iteration_bic_list[gmm1idx]
            gidx2, didx2 = iteration_bic_list[gmm2idx]
            an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
            input.append(an_item + "\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (issue with data ownership). If we don't
    # find a different workaround, we can simplify the mapper and the reducer
    # further: instead of moving the GMM pairs and merged GMMs from the mappers
    # to the reducer, we could move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    gidx1, idx1 = iteration_bic_list[ind1]
    gidx2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    g1 = gmm_list[gidx1]
    g2 = gmm_list[gidx2]
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    return new_gmm, (g1, g2), merged_tuple_indices, best_score
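

# --- Hypothetical sketch (not the project's actual AllPairsBicScoreMRJob) ---
# The real job class is defined elsewhere and is not shown in this file; this
# sketch only illustrates the shape the driver above assumes: every mapper
# emits one (pair_indices, score) record under a constant key and a single
# reducer keeps the argmax, so `assert len(kv_pairs) == 1` holds and
# kv_pairs[0][1] unpacks as (merged_tuple_indices, best_score).
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol


class AllPairsBicScoreMRJobSketch(MRJob):
    # Class-attribute equivalent of the '--input-protocol / --output-protocol /
    # --protocol pickle' flags passed by the driver above.
    INPUT_PROTOCOL = PickleProtocol
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleProtocol

    def mapper(self, pair_indices, payload):
        didx1, didx2, em_iters = payload
        # The real mapper would unpack gmm.tgz, reload the two pickled GMMs
        # for this pair, merge them, run em_iters EM iterations, and emit the
        # resulting BIC score; a constant placeholder stands in for that here.
        score = 0.0
        # Constant key so one reducer sees every pair's score.
        yield 0, (pair_indices, score)

    def reducer(self, key, pair_scores):
        # Keep only the best-scoring pair: exactly one output record.
        yield key, max(pair_scores, key=lambda pair_score: pair_score[1])

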
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters):
    """
    Computes the BIC score for all pairs by using MapReduce and returns
    the pair with the best score
    """
    print "Map-Reduce execution"
    X = tools.binary_read('self_X')

    # One input line per GMM pair. In this variant the two GMMs and the
    # concatenated observation data for the pair are pickled directly into
    # each input line instead of being shipped as a separate archive.
    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx + 1, l):
            g1, idx1 = iteration_bic_list[gmm1idx]
            g2, idx2 = iteration_bic_list[gmm2idx]
            d1 = tools.get_data_from_indices(X, idx1)
            d2 = tools.get_data_from_indices(X, idx2)
            data = np.concatenate((d1, d2))
            an_item = protocol().write((gmm1idx, gmm2idx), (g1, g2, data, em_iters))
            input.append(an_item + "\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (issue with data ownership). If we don't
    # find a different workaround, we can simplify the mapper and the reducer
    # further: instead of moving the GMM pairs and merged GMMs from the mappers
    # to the reducer, we could move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    g1, idx1 = iteration_bic_list[ind1]
    g2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    return new_gmm, (g1, g2), merged_tuple_indices, best_score
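

# --- Hypothetical sketch: round trip of the pickle protocol used above ---
# Both drivers serialize each work item with protocol().write(key, value) and
# decode job output with job.parse_output_line. Assuming `protocol` resolves
# to mrjob's PickleProtocol (suggested by the '--protocol pickle' flags, but
# not shown in this file), the encoding round-trips like this:
def _pickle_protocol_round_trip_demo():
    import numpy as np
    from mrjob.protocol import PickleProtocol

    key = (0, 1)                               # pair of indices into iteration_bic_list
    value = ('g1', 'g2', np.zeros((4, 2)), 5)  # stand-ins for (g1, g2, data, em_iters)

    line = PickleProtocol().write(key, value)  # one line-safe pickled (key, value) record
    decoded_key, decoded_value = PickleProtocol().read(line)

    assert decoded_key == key
    assert decoded_value[3] == 5
    assert np.array_equal(decoded_value[2], value[2])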