Exemple #1
0
    def test_one_file(self):
        """Count lines of a single input file with --emulate-map-input-file.

        The job's output is expected to be keyed by the ``file://`` URI
        of the one input file, with its line count (2) as the value.
        """
        input_path = self.makefile('two_lines', b'line\nother line\n')
        expected = {'file://' + input_path: 2}

        args = ['-r', 'spark', '--emulate-map-input-file', input_path]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            # output must be read while the runner is still open
            counts = dict(job.parse_output(runner.cat_output()))

            self.assertEqual(counts, expected)
Exemple #2
0
    def test_emulate_map_input_file_in_conf(self):
        """Enable emulate_map_input_file via the patched conf, not a switch.

        With the option set to True for the spark runner in the conf, the
        output is expected to be keyed by the input file's ``file://`` URI.
        """
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        input_path = self.makefile('two_lines', b'line\nother line\n')
        expected = {'file://' + input_path: 2}

        # no --emulate-map-input-file switch here; the conf supplies it
        job = MRCountLinesByFile(['-r', 'spark', input_path])

        with job.make_runner() as runner:
            runner.run()

            counts = dict(job.parse_output(runner.cat_output()))

            self.assertEqual(counts, expected)
Exemple #3
0
    def test_override_emulate_map_input_file_in_conf(self):
        """Check --no-emulate-map-input-file wins over the conf setting.

        The conf turns emulate_map_input_file on for the spark runner, but
        the command line turns it back off, so both lines are expected to
        be counted under the key ``None``.
        """
        conf = {'runners': {'spark': {'emulate_map_input_file': True}}}
        self.start(mrjob_conf_patcher(conf))

        input_path = self.makefile('two_lines', b'line\nother line\n')

        args = ['-r', 'spark', '--no-emulate-map-input-file', input_path]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            counts = dict(job.parse_output(runner.cat_output()))

            # without emulate_map_input_file, there is no input file path
            self.assertEqual(counts, {None: 2})
Exemple #4
0
    def test_input_dir(self):
        """Pass a directory as input and count lines for each file in it.

        Each of the two files created under the input directory is
        expected to appear in the output under its own ``file://`` URI.
        """
        input_dir = self.makedirs('input')

        path_of_two = self.makefile('input/two_lines', b'line 1\nline 2\n')
        path_of_three = self.makefile('input/three_lines', b'A\nBB\nCCC\n')

        expected = {
            'file://' + path_of_two: 2,
            'file://' + path_of_three: 3,
        }

        # the directory itself, not the individual files, is the input
        args = ['-r', 'spark', '--emulate-map-input-file', input_dir]
        job = MRCountLinesByFile(args)

        with job.make_runner() as runner:
            runner.run()

            counts = dict(job.parse_output(runner.cat_output()))

            self.assertEqual(counts, expected)
Exemple #5
0
    def test_files(self):
        """Run the job over three files and compare per-file line counts.

        The expected output has entries for the cat and dog files only;
        the empty file is passed as input but has no expected entry.
        """
        cat_file = self.makefile('cats.txt', b'cats are the best')
        dog_file = self.makefile('dogs.txt', b'woof woof woof\nwoof woof')
        empty_file = self.makefile('empty.txt')

        job = MRCountLinesByFile([cat_file, dog_file, empty_file])
        actual = run_job(job)

        expected = {
            'file://' + cat_file: 1,
            'file://' + dog_file: 2,
        }

        self.assertEqual(actual, expected)
Exemple #6
0
 def test_empty(self):
     """Run the job with an empty argument list and expect empty output."""
     job = MRCountLinesByFile([])
     result = run_job(job)

     self.assertEqual(result, {})