コード例 #1
0
    def run_mr(self,
               prefix,
               input_data,
               input_format=parallel.LineInput(),
               mapper=parallel.IdentityMapper(),
               reducer=parallel.IdentityReducer(),
               output_format=parallel.LevelDBOutput(),
               num_shards=5):
        """Run a map-reduce over ``input_data`` under ``prefix`` and load its output.

        Anything already present at ``prefix`` is removed first. The input is
        then materialised with ``self.make_files`` under ``<prefix>/input`` and
        ``parallel.mapreduce`` is run with ``<prefix>/output`` as the output
        prefix.

        Note that the default ``input_format``, ``mapper``, ``reducer`` and
        ``output_format`` objects are created once, when the method is
        defined, and are therefore shared by every call that relies on them.

        Args:
            prefix: Working path for this run; it is deleted before use.
            input_data: Data handed to ``self.make_files``.
            input_format: Input format passed to ``self.make_files``.
            mapper: Mapper passed to ``parallel.mapreduce``.
            reducer: Reducer passed to ``parallel.mapreduce``.
            output_format: Output format passed to ``parallel.mapreduce``; it
                also selects how the result is read back.
            num_shards: Shard count passed to ``parallel.mapreduce``.

        Returns:
            * ``parallel.LevelDBOutput``: the sorted list of items obtained by
              iterating ``parallel.ShardedDB.open(output_prefix)``.
            * ``parallel.JSONOutput``: the parsed JSON document.
            * ``parallel.JSONLineOutput``: a list with one parsed JSON value
              per line of the output file.
            * any other output format: ``None``.
        """
        # Imported locally so this method does not depend on a file-level
        # import that may not exist.
        import shutil

        # Clear out any previous run without going through a shell: building
        # an `rm -rf "<prefix>"` command breaks (or injects) when the prefix
        # contains quotes, `$` or backticks.
        if os.path.isdir(prefix) and not os.path.islink(prefix):
            shutil.rmtree(prefix, ignore_errors=True)
        elif os.path.lexists(prefix):
            # A plain file or a symlink: remove the entry itself.
            os.remove(prefix)

        source = self.make_files(os.path.join(prefix, 'input'), input_data,
                                 input_format)
        output_prefix = os.path.join(prefix, 'output')

        parallel.mapreduce(source,
                           mapper=mapper,
                           reducer=reducer,
                           output_format=output_format,
                           output_prefix=output_prefix,
                           num_shards=num_shards)

        # The isinstance checks are kept in their original order in case the
        # output format classes are related by inheritance.
        if isinstance(output_format, parallel.LevelDBOutput):
            return sorted(parallel.ShardedDB.open(output_prefix))

        if isinstance(output_format, parallel.JSONOutput):
            # Use a context manager so the file handle is closed promptly.
            with open(output_prefix) as output_f:
                return json.load(output_f)

        if isinstance(output_format, parallel.JSONLineOutput):
            with open(output_prefix, 'r') as input_f:
                return [json.loads(line) for line in input_f]

        # Unrecognised output format: nothing is read back.
        return None
コード例 #2
0
 def mapreduce_inputs(self):
     """Return the map-reduce input collection for this object.

     The collection is built by ``parallel.Collection.from_glob`` from
     ``self.batch`` (presumably a glob pattern -- confirm against the
     caller), read with a fresh ``parallel.LineInput``.
     """
     build_collection = parallel.Collection.from_glob
     return build_collection(self.batch, parallel.LineInput())