Beispiel #1
0
    def test_one_file(self):
        # A single two-line input file, run on the spark runner with
        # --emulate-map-input-file, should yield one entry keyed by the
        # file's file:// URI with a count of 2.
        input_path = self.makefile('two_lines', b'line\nother line\n')
        expected = {'file://' + input_path: 2}

        args = ['-r', 'spark', '--emulate-map-input-file', input_path]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            # read the output while the runner is still open
            pairs = job.parse_output(runner.cat_output())
            counts = dict(pairs)

            self.assertEqual(counts, expected)
Beispiel #2
0
    def test_emulate_map_input_file_in_conf(self):
        # Turn on emulate_map_input_file through the patched mrjob.conf
        # rather than via a command-line switch.
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        input_path = self.makefile('two_lines', b'line\nother line\n')
        expected = {'file://' + input_path: 2}

        args = ['-r', 'spark', input_path]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            # read the output while the runner is still open
            pairs = job.parse_output(runner.cat_output())
            counts = dict(pairs)

            self.assertEqual(counts, expected)
Beispiel #3
0
    def test_override_emulate_map_input_file_in_conf(self):
        # The patched mrjob.conf enables emulate_map_input_file, but the
        # job is launched with --no-emulate-map-input-file.
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        input_path = self.makefile('two_lines', b'line\nother line\n')

        # without emulate_map_input_file, there is no input file path
        expected = {None: 2}

        args = ['-r', 'spark', '--no-emulate-map-input-file', input_path]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            # read the output while the runner is still open
            pairs = job.parse_output(runner.cat_output())
            counts = dict(pairs)

            self.assertEqual(counts, expected)
Beispiel #4
0
    def test_input_dir(self):
        # Pass a directory holding two files as the job input; each file
        # should be reported separately under its own file:// URI.
        input_dir = self.makedirs('input')

        two_path = self.makefile('input/two_lines', b'line 1\nline 2\n')
        three_path = self.makefile('input/three_lines', b'A\nBB\nCCC\n')

        expected = {
            'file://' + two_path: 2,
            'file://' + three_path: 3,
        }

        args = ['-r', 'spark', '--emulate-map-input-file', input_dir]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            # read the output while the runner is still open
            pairs = job.parse_output(runner.cat_output())
            counts = dict(pairs)

            self.assertEqual(counts, expected)