Example #1
0
    def test_multi_step_counters(self):
        stdin = BytesIO(b"foo\nbar\n")

        mr_job = MRCountingJob(["-r", "local", "-"])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            self.assertEqual(
                runner.counters(),
                [{"group": {"counter_name": 2}}, {"group": {"counter_name": 2}}, {"group": {"counter_name": 2}}],
            )
Example #2
0
    def test_multi_step_counters(self):
        stdin = StringIO('foo\nbar\n')

        mr_job = MRCountingJob(['-r', 'local', '-'])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            self.assertEqual(runner.counters(),
                             [{'group': {'counter_name': 2}},
                              {'group': {'counter_name': 2}},
                              {'group': {'counter_name': 2}}])
Example #3
0
    def test_multi_step_counters(self):
        # read from STDIN, a regular file, and a .gz
        stdin = StringIO('foo\nbar\n')

        mr_job = MRCountingJob(['-c', self.mrjob_conf_path, '-'])
        mr_job.sandbox(stdin=stdin)

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            assert_equal(runner._counters, [{'group': {'counter_name': 2}},
                                            {'group': {'counter_name': 2}},
                                            {'group': {'counter_name': 2}}])
Example #4
0
    def test_local_output_dir_and_step_output_dir(self):
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        output_dir = self.makedirs('output')
        step_output_dir = self.makedirs('step_output')

        # this has three steps, which lets us test step numbering
        job = MRCountingJob([
            '-r', 'local',
            '--output-dir', output_dir,
            '--step-output-dir', step_output_dir,
            input1_path, input2_path])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])
            self.assertEqual([uri[:8] for uri in input_uris_0],
                             ['file:///', 'file:///'])

            output_uri_0 = runner._step_output_uri(0)
            self.assertEqual(output_uri_0,
                             to_uri(os.path.join(step_output_dir, '0000')))

            input_uris_1 = runner._step_input_uris(1)
            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1,
                             to_uri(os.path.join(step_output_dir, '0001')))

            input_uris_2 = runner._step_input_uris(2)
            self.assertEqual(input_uris_2, [output_uri_1])

            output_uri_2 = runner._step_output_uri(2)
            self.assertEqual(output_uri_2, to_uri(output_dir))
Example #5
0
    def test_multi_step_counters(self):
        stdin = StringIO('foo\nbar\n')

        mr_job = MRCountingJob(['-r', 'local', '-'])
        mr_job.sandbox(stdin=stdin)

        with mr_job.make_runner() as runner:
            runner.run()

            self.assertEqual(runner.counters(), [{
                'group': {
                    'counter_name': 2
                }
            }, {
                'group': {
                    'counter_name': 2
                }
            }, {
                'group': {
                    'counter_name': 2
                }
            }])
Example #6
0
    def test_local_output_dir_and_step_output_dir(self):
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        output_dir = self.makedirs('output')
        step_output_dir = self.makedirs('step_output')

        # this has three steps, which lets us test step numbering
        job = MRCountingJob([
            '-r', 'local', '--output-dir', output_dir, '--step-output-dir',
            step_output_dir, input1_path, input2_path
        ])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])
            self.assertEqual([uri[:8] for uri in input_uris_0],
                             ['file:///', 'file:///'])

            output_uri_0 = runner._step_output_uri(0)
            self.assertEqual(output_uri_0,
                             to_uri(os.path.join(step_output_dir, '0000')))

            input_uris_1 = runner._step_input_uris(1)
            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1,
                             to_uri(os.path.join(step_output_dir, '0001')))

            input_uris_2 = runner._step_input_uris(2)
            self.assertEqual(input_uris_2, [output_uri_1])

            output_uri_2 = runner._step_output_uri(2)
            self.assertEqual(output_uri_2, to_uri(output_dir))
Example #7
0
    def test_output_dir_and_step_output_dir(self):
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        # this has three steps, which lets us test step numbering
        job = MRCountingJob([
            '-r', 'hadoop',
            '--hadoop-bin', 'false',  # shouldn't run; just in case
            '--output-dir', 'hdfs:///tmp/output',
            '--step-output-dir', 'hdfs://tmp/step-output',
            input1_path, input2_path])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            runner._add_job_files_for_upload()

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])

            output_uri_0 = runner._step_output_uri(0)
            self.assertEqual(output_uri_0, 'hdfs://tmp/step-output/0000')

            input_uris_1 = runner._step_input_uris(1)
            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1, 'hdfs://tmp/step-output/0001')

            input_uris_2 = runner._step_input_uris(2)
            self.assertEqual(input_uris_2, [output_uri_1])

            output_uri_2 = runner._step_output_uri(2)
            self.assertEqual(output_uri_2, 'hdfs:///tmp/output')
Example #8
0
    def test_output_dir_and_step_output_dir(self):
        input1_path = self.makefile('input1')
        input2_path = self.makefile('input2')

        # this has three steps, which lets us test step numbering
        job = MRCountingJob([
            '-r', 'hadoop',
            '--hadoop-bin', 'false',  # shouldn't run; just in case
            '--output-dir', 'hdfs:///tmp/output',
            '--step-output-dir', 'hdfs://tmp/step-output',
            input1_path, input2_path])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertEqual(runner._num_steps(), 3)

            runner._add_job_files_for_upload()

            input_uris_0 = runner._step_input_uris(0)
            self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                             ['input1', 'input2'])

            output_uri_0 = runner._step_output_uri(0)
            self.assertEqual(output_uri_0, 'hdfs://tmp/step-output/0000')

            input_uris_1 = runner._step_input_uris(1)
            self.assertEqual(input_uris_1, [output_uri_0])

            output_uri_1 = runner._step_output_uri(1)
            self.assertEqual(output_uri_1, 'hdfs://tmp/step-output/0001')

            input_uris_2 = runner._step_input_uris(2)
            self.assertEqual(input_uris_2, [output_uri_1])

            output_uri_2 = runner._step_output_uri(2)
            self.assertEqual(output_uri_2, 'hdfs:///tmp/output')