def test_sum(self):
  # 10 files with 100 lines each
  results = self.run_mr(
    '/tmp/test-sum',
    ['\n'.join([str(i) for i in range(100)]) for i in range(10)],
    reducer=parallel.SumReducer())
  results = dict(results)
  for i in range(100):
    assert str(i) in results, str(i)
    value = results[str(i)]
    self.assertEqual(value, i * 10.0)
def test_sum(self):
  # 10 files with 100 lines each
  results = self.run_mr(
    '/tmp/test-sum',
    ['\n'.join([str(i) for i in range(100)]) for i in range(10)],
    reducer=parallel.SumReducer())
  results = set(dict(results).values())
  for i in range(100):
    assert i * 10 in results
    results.remove(i * 10)
  assert len(results) == 0, 'Unexpected output: %s' % results
def test_sum(self):
  os.system('rm -rf /tmp/test-sum*')
  source_files = ['/tmp/test-sum-%d' % i for i in range(10)]
  for filename in source_files:
    with open(filename, 'w') as f:
      # Each source file holds the numbers 0..99, one per line.
      # (Python 3 print; the original used the Python 2 `print >> f` form.)
      print('\n'.join([str(i) for i in range(100)]), file=f)
  source = parallel.Collection(source_files, parallel.LineInput)
  parallel.mapreduce(source,
                     parallel.IdentityMapper(),
                     parallel.SumReducer(),
                     '/tmp/test-sum', 5)
  results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))
  for i in range(100):
    assert str(i) in results, str(i)
    value = results[str(i)]
    self.assertEqual(value, str(i * 10.0))
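# For context: the SumReducer used throughout these tests sums the values
# collected for each key. A rough sketch, assuming openfda's Reducer interface
# (reduce(self, key, values, output)); the project's real implementation may
# differ in detail:
class SumReducer(parallel.Reducer):
  def reduce(self, key, values, output):
    # Values arrive as strings from LineInput, hence the float() coercion;
    # this is also why the tests above expect float sums like i * 10.0.
    output.put(key, sum(float(v) for v in values))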
def test_csv_line_split(self):
  dir = '/tmp/openfda-test-csv-line-split'
  level_db_prefix = os.path.join(dir, 'leveldb')
  file = 'csv_split.csv'
  os.system('rm -rf "%s"' % level_db_prefix)
  os.makedirs(dir, exist_ok=True)
  shutil.copyfile(
    os.path.join(dirname(os.path.abspath(__file__)), 'data/%s' % file),
    os.path.join(dir, file))

  col = parallel.Collection.from_glob(
    os.path.join(dir, file),
    parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE, delimiter='|',
                               fixed_splits=3))
  splits = list(col)
  assert len(splits) == 3

  # Check every split's start and end. A start must fall at the beginning of
  # a line; an end must fall after a line break or at EOF.
  assert splits[0].start_pos == 0
  assert splits[0].end_pos == 81
  assert splits[1].start_pos == 81
  assert splits[1].end_pos == 169
  assert splits[2].start_pos == 169
  assert splits[2].end_pos == 196

  # Run M/R with these splits and ensure the result is as expected.
  parallel.mapreduce(col,
                     mapper=CsvMapper(),
                     reducer=parallel.SumReducer(),
                     map_workers=len(splits),
                     output_format=parallel.LevelDBOutput(),
                     output_prefix=level_db_prefix,
                     num_shards=len(splits))

  result = sorted(list(parallel.ShardedDB.open(level_db_prefix)))
  print(result)
  # If we got the sum right, we know all the splits were processed correctly.
  assert result[0][1] == 401 + 402 + 403 + 404 + 405 + 407 + 408 + 409 + 410
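# CsvMapper is referenced above but not defined in this section. A minimal
# sketch of what such a mapper might look like -- assuming openfda's
# parallel.Mapper interface (map(self, key, value, output)) and assuming each
# value arrives as a parsed CSV row whose second column holds the integer
# being summed; both the name binding and the column layout are illustrative
# assumptions, not the project's actual implementation:
class CsvMapper(parallel.Mapper):
  def map(self, key, value, output):
    # Emit each row's numeric column under a single key so that
    # parallel.SumReducer folds the rows from all splits into one total.
    output.add('total', int(value[1]))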