def run_mr(self,
           prefix,
           input_data,
           input_format=parallel.LineInput(),
           mapper=parallel.IdentityMapper(),
           reducer=parallel.IdentityReducer(),
           output_format=parallel.LevelDBOutput(),
           num_shards=5):
    """Run a map-reduce job rooted at ``prefix`` and return its decoded output.

    Any existing directory tree at ``prefix`` is deleted first. The input
    source is obtained from ``self.make_files`` (called with
    ``<prefix>/input``, ``input_data`` and ``input_format``) and the job
    writes to ``<prefix>/output``.

    Args:
        prefix: Working directory for the job; removed before the run.
        input_data: Data handed to ``self.make_files``.
        input_format: Input format handed to ``self.make_files``.
        mapper: Mapper passed to ``parallel.mapreduce``.
        reducer: Reducer passed to ``parallel.mapreduce``.
        output_format: Output format passed to ``parallel.mapreduce``; it
            also selects how the result is read back (see Returns).
        num_shards: Shard count passed to ``parallel.mapreduce``.

    Returns:
        * ``parallel.LevelDBOutput``: the sorted list of items yielded by
          ``parallel.ShardedDB.open(output_prefix)``.
        * ``parallel.JSONOutput``: the JSON document decoded from the
          output file.
        * ``parallel.JSONLineOutput``: a list with one decoded JSON value
          per line of the output file.
        * any other output format: ``None``.
    """
    # Imported locally so this method does not depend on a module-level
    # import that may not exist in the file.
    import shutil

    # Remove leftovers from a previous run without going through a shell:
    # interpolating the path into an ``rm -rf`` command line misbehaves for
    # paths containing quotes, ``$`` or backticks. Errors (e.g. the path not
    # existing yet) are ignored, as the unchecked shell call did.
    shutil.rmtree(prefix, ignore_errors=True)

    source = self.make_files(os.path.join(prefix, 'input'), input_data, input_format)
    output_prefix = os.path.join(prefix, 'output')

    parallel.mapreduce(source,
                       mapper=mapper,
                       reducer=reducer,
                       output_format=output_format,
                       output_prefix=output_prefix,
                       num_shards=num_shards)

    if isinstance(output_format, parallel.LevelDBOutput):
        return sorted(parallel.ShardedDB.open(output_prefix))

    if isinstance(output_format, parallel.JSONOutput):
        # Context manager so the file handle is closed deterministically.
        with open(output_prefix) as output_f:
            return json.load(output_f)

    if isinstance(output_format, parallel.JSONLineOutput):
        with open(output_prefix, 'r') as output_f:
            return [json.loads(line) for line in output_f]

    # Unrecognised output format: nothing is read back.
    return None
def mapreduce_inputs(self):
    """Return the map-reduce input collection for ``self.batch``.

    The collection is built by ``parallel.Collection.from_glob`` from
    ``self.batch`` (presumably a glob pattern -- confirm against the code
    that sets it) using a fresh ``parallel.LineInput`` as the input format.
    """
    build_collection = parallel.Collection.from_glob
    batch_glob = self.batch
    line_format = parallel.LineInput()
    return build_collection(batch_glob, line_format)