def run_mr(self,
           prefix,
           input_data,
           input_format=parallel.LineInput(),
           mapper=parallel.IdentityMapper(),
           reducer=parallel.IdentityReducer(),
           output_format=parallel.LevelDBOutput(),
           num_shards=5):
    """Run a mapreduce under `prefix` and return its output, decoded.

    Args:
        prefix: Working path. It is removed with `rm -rf` before the run;
            input goes under `<prefix>/input`, output under `<prefix>/output`.
        input_data: Passed through to `self.make_files` along with
            `input_format` to build the input source.
        input_format: Input format handed to `self.make_files`.
        mapper: Mapper passed to `parallel.mapreduce`.
        reducer: Reducer passed to `parallel.mapreduce`.
        output_format: Output format passed to `parallel.mapreduce`; it also
            selects how the output is read back (see Returns).
        num_shards: Shard count passed to `parallel.mapreduce`.

    Returns:
        * `LevelDBOutput`: the sorted entries of the sharded DB opened at the
          output prefix.
        * `JSONOutput`: the object parsed from the output file.
        * `JSONLineOutput`: a list with one parsed JSON value per line.
        * any other output format: None.
    """
    # Remove whatever an earlier run left behind so stale output is never
    # read back as if it were this run's result.
    os.system('rm -rf "%s"' % prefix)

    source = self.make_files(os.path.join(prefix, 'input'), input_data,
                             input_format)
    output_prefix = os.path.join(prefix, 'output')

    parallel.mapreduce(source,
                       mapper=mapper,
                       reducer=reducer,
                       output_format=output_format,
                       output_prefix=output_prefix,
                       num_shards=num_shards)

    # The isinstance checks are kept in their original order; the first
    # matching output format decides how the result is loaded.
    if isinstance(output_format, parallel.LevelDBOutput):
        return sorted(parallel.ShardedDB.open(output_prefix))

    if isinstance(output_format, parallel.JSONOutput):
        # Use a context manager so the handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(output_prefix) as input_f:
            return json.load(input_f)

    if isinstance(output_format, parallel.JSONLineOutput):
        with open(output_prefix, 'r') as input_f:
            return [json.loads(line) for line in input_f]

    # Unrecognized output format: nothing is read back.
    return None
def run(self):
    """Run an identity mapreduce from this task's input to its output.

    The input path is handed to `parallel.Collection.from_glob` with a
    `JSONLineInput` format; the result is written under the output path
    using a single shard.
    """
    input_collection = parallel.Collection.from_glob(
        self.input().path,
        parallel.JSONLineInput())
    identity_mapper = parallel.IdentityMapper()
    identity_reducer = parallel.IdentityReducer()
    destination = self.output().path

    parallel.mapreduce(input_collection,
                       mapper=identity_mapper,
                       reducer=identity_reducer,
                       output_prefix=destination,
                       num_shards=1)
def test_identity(self):
    """Identity map/reduce over ten empty files, checked by filename.

    After the run, the sorted entries of the sharded DB at
    `/tmp/test-identity/` must have each source filename as key, in order,
    with an empty string as value.
    """
    # The glob also matches the output location itself, so this clears both
    # the old input files and any previous result.
    os.system('rm -rf /tmp/test-identity*')

    source_files = []
    for index in range(10):
        path = '/tmp/test-identity-%d' % index
        os.system('touch "%s"' % path)
        source_files.append(path)

    collection = parallel.Collection(source_files, parallel.FilenameInput)
    parallel.mapreduce(collection,
                       parallel.IdentityMapper(),
                       parallel.IdentityReducer(),
                       '/tmp/test-identity',
                       2)

    results = sorted(parallel.ShardedDB.open('/tmp/test-identity/'))

    # Index explicitly (rather than iterating `results`) so that a short
    # result list fails loudly with an IndexError.
    for index in range(10):
        entry = results[index]
        key, value = entry
        assert key == '/tmp/test-identity-%d' % index, entry
        assert value == ''
def test_sum(self):
    """Sum-reduce ten identical files and check the per-key totals.

    Each of the ten input files contains the integers 0..99, one per line.
    After an identity map and a `SumReducer`, the sharded DB at
    `/tmp/test-sum/` must contain every `str(i)` as a key, with value
    `str(i * 10.0)`.
    """
    # The glob also matches the output location, so old results are removed
    # together with old input files.
    os.system('rm -rf /tmp/test-sum*')

    source_files = ['/tmp/test-sum-%d' % i for i in range(10)]

    # Every file gets the same contents, so build them once. The trailing
    # newline reproduces what the former `print >> f, ...` statement wrote;
    # `f.write` behaves the same on Python 2 and also works on Python 3,
    # where the print-chevron form raises TypeError.
    contents = '\n'.join(str(i) for i in range(100)) + '\n'
    for filename in source_files:
        with open(filename, 'w') as f:
            f.write(contents)

    source = parallel.Collection(source_files, parallel.LineInput)
    parallel.mapreduce(source,
                       parallel.IdentityMapper(),
                       parallel.SumReducer(),
                       '/tmp/test-sum',
                       5)

    results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))

    for i in range(100):
        key = str(i)
        # Use the TestCase assertion (as assertEqual below does) so the check
        # is not stripped under `python -O` and reports the missing key.
        self.assertIn(key, results, key)
        # Expected value is the float total rendered as a string, e.g. '30.0'.
        self.assertEqual(results[key], str(i * 10.0))