def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters, X, gmm_list):
    """
    Computes the BIC score for all pairs by using MapReduce and returns
    the pair with the best score
    """
    print "Map-Reduce execution"

#    iter_gmm_list = map(lambda(gidx, didx): gmm_list[gidx], iteration_bic_list)
#    pickle.dump(iter_gmm_list, open('iter_gmm_list', 'w'))
#    os.chmod("iter_gmm_list", S_IRUSR | S_IWUSR | S_IXUSR | \
#                              S_IRGRP | S_IXGRP | \
#                              S_IROTH | S_IXOTH)

    # Pickle each candidate GMM into its own file under gmm/ so the whole set
    # can be shipped to the Hadoop workers as a single archive.
    from subprocess import call
    call(["mkdir", "-p", "gmm"])
    for i in range(len(iteration_bic_list)):
        gidx, didx = iteration_bic_list[i]
        pickle.dump(gmm_list[gidx], open('gmm/' + str(i), 'w'))
        # Make the pickled GMM file readable by the workers.
        os.chmod('gmm/' + str(i), S_IRUSR | S_IWUSR | S_IXUSR |
                                  S_IRGRP | S_IXGRP |
                                  S_IROTH | S_IXOTH)
    import mrjob.util as util
    util.tar_and_gzip('gmm', 'gmm.tgz')

    # One input line per GMM pair; only the pair's indices into
    # iteration_bic_list and the EM iteration count are shipped inline,
    # the GMMs themselves travel in gmm.tgz.
    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx + 1, l):
            gidx1, didx1 = iteration_bic_list[gmm1idx]
            gidx2, didx2 = iteration_bic_list[gmm2idx]
            an_item = protocol().write((gmm1idx, gmm2idx), (didx1, didx2, em_iters))
            input.append(an_item + "\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (issue with data ownership). If we don't
    # find a different workaround, we can simplify the mapper and the reducer
    # further: instead of moving the GMM pairs and merged GMMs from the mappers
    # to the reducer, we could move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    gidx1, idx1 = iteration_bic_list[ind1]
    gidx2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    g1 = gmm_list[gidx1]
    g2 = gmm_list[gidx2]
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    return new_gmm, (g1, g2), merged_tuple_indices, best_score
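

# --- Hypothetical sketch (not the project's actual AllPairsBicScoreMRJob) ---
# The real job class is defined elsewhere and is not shown in this file; this
# sketch only illustrates the shape the driver above assumes: every mapper
# emits one (pair_indices, score) record under a constant key and a single
# reducer keeps the argmax, so `assert len(kv_pairs) == 1` holds and
# kv_pairs[0][1] unpacks as (merged_tuple_indices, best_score).
from mrjob.job import MRJob
from mrjob.protocol import PickleProtocol


class AllPairsBicScoreMRJobSketch(MRJob):
    # Class-attribute equivalent of the '--input-protocol / --output-protocol /
    # --protocol pickle' flags passed by the driver above.
    INPUT_PROTOCOL = PickleProtocol
    INTERNAL_PROTOCOL = PickleProtocol
    OUTPUT_PROTOCOL = PickleProtocol

    def mapper(self, pair_indices, payload):
        didx1, didx2, em_iters = payload
        # The real mapper would unpack gmm.tgz, reload the two pickled GMMs
        # for this pair, merge them, run em_iters EM iterations, and emit the
        # resulting BIC score; a constant placeholder stands in for that here.
        score = 0.0
        # Constant key so one reducer sees every pair's score.
        yield 0, (pair_indices, score)

    def reducer(self, key, pair_scores):
        # Keep only the best-scoring pair: exactly one output record.
        yield key, max(pair_scores, key=lambda pair_score: pair_score[1])

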
def all_pairs_BIC_using_mapreduce(self, iteration_bic_list, em_iters):
    """
    Computes the BIC score for all pairs by using MapReduce and returns
    the pair with the best score
    """
    print "Map-Reduce execution"
    X = tools.binary_read('self_X')

    # One input line per GMM pair. In this variant the two GMMs and the
    # concatenated observation data for the pair are pickled directly into
    # each input line instead of being shipped as a separate archive.
    input = []
    l = len(iteration_bic_list)
    for gmm1idx in range(l):
        for gmm2idx in range(gmm1idx + 1, l):
            g1, idx1 = iteration_bic_list[gmm1idx]
            g2, idx2 = iteration_bic_list[gmm2idx]
            d1 = tools.get_data_from_indices(X, idx1)
            d2 = tools.get_data_from_indices(X, idx2)
            data = np.concatenate((d1, d2))
            an_item = protocol().write((gmm1idx, gmm2idx), (g1, g2, data, em_iters))
            input.append(an_item + "\n")

    mr_args = ['-v', '-r', 'hadoop',
               '--input-protocol', 'pickle',
               '--output-protocol', 'pickle',
               '--protocol', 'pickle']
    job = AllPairsBicScoreMRJob(args=mr_args).sandbox(stdin=input)
    runner = job.make_runner()
    runner.run()
    kv_pairs = map(job.parse_output_line, runner.stream_output())
    assert len(kv_pairs) == 1
    best_merged_gmm, merged_tuple, merged_tuple_indices, best_score = kv_pairs[0][1]

    # Re-merge the GMM pair with the highest score *here*, otherwise the next
    # segment_majority_vote will crash (issue with data ownership). If we don't
    # find a different workaround, we can simplify the mapper and the reducer
    # further: instead of moving the GMM pairs and merged GMMs from the mappers
    # to the reducer, we could move just indices and scores.
    # However, this re-merging is serialized...
    ind1, ind2 = merged_tuple_indices
    g1, idx1 = iteration_bic_list[ind1]
    g2, idx2 = iteration_bic_list[ind2]
    d1 = tools.get_data_from_indices(X, idx1)
    d2 = tools.get_data_from_indices(X, idx2)
    data = np.concatenate((d1, d2))
    new_gmm, score = compute_distance_BIC(g1, g2, data, em_iters)
    return new_gmm, (g1, g2), merged_tuple_indices, best_score
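

# --- Hypothetical sketch: round trip of the pickle protocol used above ---
# Both drivers serialize each work item with protocol().write(key, value) and
# decode job output with job.parse_output_line. Assuming `protocol` resolves
# to mrjob's PickleProtocol (suggested by the '--protocol pickle' flags, but
# not shown in this file), the encoding round-trips like this:
def _pickle_protocol_round_trip_demo():
    import numpy as np
    from mrjob.protocol import PickleProtocol

    key = (0, 1)                               # pair of indices into iteration_bic_list
    value = ('g1', 'g2', np.zeros((4, 2)), 5)  # stand-ins for (g1, g2, data, em_iters)

    line = PickleProtocol().write(key, value)  # one line-safe pickled (key, value) record
    decoded_key, decoded_value = PickleProtocol().read(line)

    assert decoded_key == key
    assert decoded_value[3] == 5
    assert np.array_equal(decoded_value[2], value[2])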