Example #1
    def test_sum(self):
        # 10 files with 100 lines each
        results = self.run_mr(
            '/tmp/test-sum',
            ['\n'.join([str(i) for i in range(100)]) for i in range(10)],
            reducer=parallel.SumReducer())

        results = dict(results)
        for i in range(100):
            assert str(i) in results, str(i)
            value = results[str(i)]
            self.assertEqual(value, i * 10.0)
Example #2
    def test_sum(self):
        # 10 files with 100 lines each
        results = self.run_mr(
            '/tmp/test-sum',
            ['\n'.join([str(i) for i in range(100)]) for i in range(10)],
            reducer=parallel.SumReducer())

        results = set(dict(results).values())
        for i in range(100):
            assert i * 10 in results
            results.remove(i * 10)
        assert len(results) == 0, 'Unexpected output: %s' % results
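
Examples #1 and #2 both rely on a run_mr test helper that is not shown on this page. The sketch below is a plausible reconstruction, modeled on the explicit pipeline in Example #3: only the call signature (an output directory, a list of file contents, and a reducer argument) comes from the examples above; the body is an assumption, not the project's actual helper.

    def run_mr(self, output_dir, inputs, reducer):
        # Hypothetical helper: write each input string to its own file.
        os.system('rm -rf %s*' % output_dir)
        source_files = []
        for i, text in enumerate(inputs):
            source_file = '%s-input-%d' % (output_dir, i)
            with open(source_file, 'w') as f:
                f.write(text)
            source_files.append(source_file)

        # Identity-map each line, reduce, and return the resulting
        # (key, value) pairs from the sharded output database.
        source = parallel.Collection(source_files, parallel.LineInput)
        parallel.mapreduce(source, parallel.IdentityMapper(), reducer,
                           output_dir, 5)
        return list(parallel.ShardedDB.open(output_dir))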
Example #3
    def test_sum(self):
        os.system('rm -rf /tmp/test-sum*')
        # Write 10 identical input files, each containing the lines 0..99.
        source_files = ['/tmp/test-sum-%d' % i for i in range(10)]
        for filename in source_files:
            with open(filename, 'w') as f:
                print('\n'.join([str(i) for i in range(100)]), file=f)

        # Identity-map each line, then sum the values for each key across
        # 5 output shards.
        source = parallel.Collection(source_files, parallel.LineInput)
        parallel.mapreduce(source, parallel.IdentityMapper(),
                           parallel.SumReducer(), '/tmp/test-sum', 5)

        # Each key appears once in all 10 files, so its summed value is i * 10.
        results = dict(parallel.ShardedDB.open('/tmp/test-sum/'))
        for i in range(100):
            assert str(i) in results, str(i)
            value = results[str(i)]
            self.assertEqual(value, str(i * 10.0))
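
For reference, the SumReducer used throughout these tests only needs to add up the values collected for each key. Below is a minimal sketch, assuming the library's reducers subclass a parallel.Reducer base with a reduce(self, key, values, output) method and emit via output.put; none of that API is shown on this page, so treat it as an assumption.

class SumReducer(parallel.Reducer):
    # Minimal sketch: sum the (string) values for a key as floats and
    # emit the total.
    def reduce(self, key, values, output):
        output.put(key, sum(float(v) for v in values))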
Example #4
    def test_csv_line_split(self):
        data_dir = '/tmp/openfda-test-csv-line-split'
        level_db_prefix = os.path.join(data_dir, 'leveldb')
        filename = 'csv_split.csv'

        os.system('rm -rf "%s"' % level_db_prefix)
        os.makedirs(data_dir, exist_ok=True)
        shutil.copyfile(
            os.path.join(dirname(os.path.abspath(__file__)), 'data/%s' % filename),
            os.path.join(data_dir, filename))

        col = parallel.Collection.from_glob(
            os.path.join(data_dir, filename),
            parallel.CSVSplitLineInput(quoting=csv.QUOTE_NONE,
                                       delimiter='|',
                                       fixed_splits=3))
        splits = list(col)
        assert len(splits) == 3
        # Check each split's boundaries: a split must start at the beginning
        # of a line and end just past a line break (or at EOF).
        assert splits[0].start_pos == 0
        assert splits[0].end_pos == 81
        assert splits[1].start_pos == 81
        assert splits[1].end_pos == 169
        assert splits[2].start_pos == 169
        assert splits[2].end_pos == 196

        # Run M/R with these splits and ensure the result is as expected
        parallel.mapreduce(col,
                           mapper=CsvMapper(),
                           reducer=parallel.SumReducer(),
                           map_workers=len(splits),
                           output_format=parallel.LevelDBOutput(),
                           output_prefix=level_db_prefix,
                           num_shards=len(splits))

        result = sorted(parallel.ShardedDB.open(level_db_prefix))
        print(result)
        # If the total is correct, every split was processed exactly once.
        assert result[0][1] == 401 + 402 + 403 + 404 + 405 + 407 + 408 + 409 + 410
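
The CsvMapper used above is also not shown. Given the '|'-delimited input and the single summed total in the final assertion, a plausible sketch is a mapper that emits each row's numeric column under one fixed key, so that SumReducer folds every row from every split into a single value. The parallel.Mapper base class, the output.add call, the 'sum' key, and the column index are all assumptions for illustration, not the test's actual code.

class CsvMapper(parallel.Mapper):
    # Plausible sketch: emit each row's numeric field under one shared key
    # so the reducer collapses all three splits into a single total.
    def map(self, key, value, output):
        fields = value.split('|')
        output.add('sum', int(fields[1]))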