Example no. 1
    def test_partitioner(self):
        partitioner = 'org.apache.hadoop.mapreduce.Partitioner'
        job = MRWordCount(['--partitioner', partitioner])

        with job.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0),
                             ['-partitioner', partitioner])
Example no. 2
    def test_environment_variables_021(self):
        job = MRWordCount(['-r', 'local', '--hadoop-version', '0.21'])
        with job.make_runner() as runner:
            simulated_jobconf = runner._simulate_jobconf_for_step(
                0, 'mapper', 0, '/tmp/foo')
            self.assertIn('mapreduce.job.cache.local.archives',
                          simulated_jobconf)
            self.assertNotIn('mapred.cache.localArchives', simulated_jobconf)
Example no. 3
    def test_environment_variables_version_agnostic(self):
        job = MRWordCount(['-r', 'local'])
        with job.make_runner() as runner:
            simulated_jobconf = runner._simulate_jobconf_for_step(
                'mapper', 0, 0)
            self.assertIn('mapred.cache.localArchives', simulated_jobconf)
            self.assertIn('mapreduce.job.cache.local.archives',
                          simulated_jobconf)
Example no. 4
    def test_empty_jobconf_values(self):
        # value of None means to omit that jobconf
        job = MRWordCount()
        # no way to pass in None from the command line
        job.JOBCONF = {'foo': '', 'bar': None}

        with job.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0), ['-D', 'foo='])
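The assertion above implies a simple filter when building -D arguments: a None value drops the key entirely, while an empty string still produces foo=. A minimal sketch of that rule (jobconf_to_args is a hypothetical name):

    def jobconf_to_args(jobconf):
        # None means omit the key; '' still yields 'key='
        args = []
        for key, value in sorted(jobconf.items()):
            if value is None:
                continue
            args.extend(['-D', '%s=%s' % (key, value)])
        return args

    # jobconf_to_args({'foo': '', 'bar': None}) -> ['-D', 'foo=']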
Example no. 5
    def test_default(self):
        job = MRWordCount(['-r', 'dataproc'])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertFalse(self.mock_Popen.called)
Example no. 6
    def test_environment_variables_hadoop_2(self):
        job = MRWordCount(['-r', 'local', '--hadoop-version', '2.7.2'])
        with job.make_runner() as runner:
            simulated_jobconf = runner._simulate_jobconf_for_step(
                'mapper', 0, 0)
            self.assertIn(
                'mapreduce.job.cache.local.archives', simulated_jobconf)
            self.assertNotIn(
                'mapred.cache.localArchives', simulated_jobconf)
Example no. 7
    def test_persistent_cluster(self):
        mr_job = MRWordCount(['-r', 'dataproc', '--max-hours-idle', '0.01'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()
            self.assertRanIdleTimeoutScriptWith(runner, {
                'mrjob-max-secs-idle': '36',
            })
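The expected '36' is plain unit conversion, 0.01 hours times 3600 seconds per hour; the '360' asserted in test_default below likewise corresponds to a default of 0.1 idle hours:

    max_secs_idle = str(int(0.01 * 3600))  # -> '36'; default 0.1 h -> '360'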
Example no. 8
    def test_empty(self):
        job = MRWordCount(['-r', 'hadoop'])
        job.sandbox()

        with job.make_runner() as runner:
            runner._add_job_files_for_upload()
            args = runner._args_for_streaming_step(0)

            self.assertNotIn('-libjars', args)
Example no. 9
    def test_default(self):
        mr_job = MRWordCount(['-r', 'dataproc'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()
            self.assertRanIdleTimeoutScriptWith(runner, {
                'mrjob-max-secs-idle': '360',
            })
Example no. 10
    def test_environment_variables_version_agnostic(self):
        job = MRWordCount(['-r', 'local'])
        with job.make_runner() as runner:
            simulated_jobconf = runner._simulate_jobconf_for_step(
                0, 'mapper', 0, '/tmp/foo')
            self.assertIn(
                'mapred.cache.localArchives', simulated_jobconf)
            self.assertIn(
                'mapreduce.job.cache.local.archives', simulated_jobconf)
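Taken together, the _simulate_jobconf_for_step examples above pin down a naming rule: pinning Hadoop 0.21 or 2.x yields only the newer mapreduce.* property names, while leaving the version unset yields both the old and the new spellings. A sketch of that rule, with hypothetical helper names and the one alias pair the assertions show:

    # old property name -> its post-0.21 replacement
    JOBCONF_ALIASES = {
        'mapred.cache.localArchives': 'mapreduce.job.cache.local.archives',
    }

    def simulated_names(old_name, hadoop_version=None):
        new_name = JOBCONF_ALIASES[old_name]
        if hadoop_version is None:
            return [old_name, new_name]  # version-agnostic: emit both
        return [new_name]  # 0.21 and 2.x: new name only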
Example no. 11
    def test_disable_check_input_paths(self):
        missing_data = os.path.join(self.tmp_dir, 'data')

        job = MRWordCount(['--no-check-input-paths', missing_data])

        self.start(patch('mrjob.inline.InlineMRJobRunner._run',
                   side_effect=StopIteration))

        with job.make_runner() as runner:
            self.assertRaises(StopIteration, runner.run)
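The self.start(patch(...)) call assumes a small TestCase helper that starts a patcher and automatically undoes it; a common way to write it (a sketch, not necessarily the suite's exact source):

    def start(self, patcher):
        """Start a mock.patch() patcher and stop it again at cleanup."""
        mock_obj = patcher.start()
        self.addCleanup(patcher.stop)
        return mock_obj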
Example no. 12
    def test_job_passes_in_steps(self):
        job = MRWordCount([])
        job.sandbox()

        with job.make_runner() as runner:
            self.assertTrue(runner._steps)

            runner.run()

            self.assertFalse(self.log.warning.called)
Example no. 13
    def test_cmdenv(self):
        job = MRWordCount(['--cmdenv', 'FOO=bar',
                           '--cmdenv', 'BAZ=qux',
                           '--cmdenv', 'BAX=Arnold'])
        with job.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0),
                             ['-cmdenv', 'BAX=Arnold',
                              '-cmdenv', 'BAZ=qux',
                              '-cmdenv', 'FOO=bar',
                              ])
Example no. 14
    def test_configuration_translation(self):
        job = MRWordCount(
            ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1',
             '--hadoop-version', '0.21'])

        with job.make_runner() as runner:
            with no_handlers_for_logger('mrjob.compat'):
                self.assertEqual(
                    runner._hadoop_args_for_step(0),
                    ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                     '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'])
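The doubled -D pair comes from translating the user's jobconf key into its equivalent name for the target Hadoop version and emitting both spellings. A sketch of that expansion; translate_jobconf mirrors the translation helper in mrjob.compat, whose exact signature is assumed here:

    from mrjob.compat import translate_jobconf  # signature assumed

    def jobconf_args(jobconf, hadoop_version):
        args = []
        for key, value in sorted(jobconf.items()):
            # emit the user's key plus its translated spelling; lexical
            # order happens to put the older name first here
            names = {key, translate_jobconf(key, hadoop_version)}
            for name in sorted(names):
                args.extend(['-D', '%s=%s' % (name, value)])
        return args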
Example no. 15
    def test_missing_gcloud_bin(self):
        self.mock_Popen.side_effect = OSError(2, 'No such file or directory')

        job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        self.assertTrue(runner._give_up_on_ssh_tunnel)
Example no. 16
    def test_load_steps(self):
        job = MRWordCount()
        job.sandbox()

        with job.make_runner() as runner:
            runner._steps = None

            runner.run()

            self.assertTrue(runner._steps)
            self.assertTrue(self.log.warning.called)
Example no. 17
    def test_jobconf(self):
        jobconf_args = ['--jobconf', 'FOO=bar',
                        '--jobconf', 'BAZ=qux',
                        '--jobconf', 'BAX=Arnold']

        job = MRWordCount(jobconf_args)
        with job.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0),
                             ['-D', 'BAX=Arnold',
                              '-D', 'BAZ=qux',
                              '-D', 'FOO=bar',
                              ])
Example no. 18
    def test_error_from_gcloud_bin(self):
        self.mock_Popen.return_value.returncode = 255

        job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])

        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertGreater(self.mock_Popen.call_count, 1)
        self.assertFalse(runner._give_up_on_ssh_tunnel)
Example no. 19
    def test_configuration_translation(self):
        job = MRWordCount(
            ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1'])

        with job.make_runner() as runner:
            with no_handlers_for_logger('mrjob.runner'):
                with patch.object(runner,
                                  'get_hadoop_version', return_value='2.7.1'):
                    self.assertEqual(
                        runner._hadoop_args_for_step(0),
                        ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                         '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'
                         ])
Example no. 20
    def test_hadoop_extra_args_comes_first(self):
        job = MRWordCount(
            ['--cmdenv', 'FOO=bar',
             '--hadoop-arg', '-libjar', '--hadoop-arg', 'qux.jar',
             '--jobconf', 'baz=qux',
             '--partitioner', 'java.lang.Object'])
        job.HADOOP_INPUT_FORMAT = 'FooInputFormat'
        job.HADOOP_OUTPUT_FORMAT = 'BarOutputFormat'

        with job.make_runner() as runner:
            hadoop_args = runner._hadoop_args_for_step(0)
            self.assertEqual(hadoop_args[:2], ['-libjar', 'qux.jar'])
            self.assertEqual(len(hadoop_args), 12)
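The expected length of 12 tallies six two-token pairs. Only the position of the --hadoop-arg pass-through at the front is asserted, so the relative order of the other pairs below is an assumption:

    pairs = [
        ('-libjar', 'qux.jar'),                # --hadoop-arg pass-through, first
        ('-D', 'baz=qux'),                     # --jobconf
        ('-cmdenv', 'FOO=bar'),                # --cmdenv
        ('-inputformat', 'FooInputFormat'),    # HADOOP_INPUT_FORMAT
        ('-outputformat', 'BarOutputFormat'),  # HADOOP_OUTPUT_FORMAT
        ('-partitioner', 'java.lang.Object'),  # --partitioner
    ]
    assert sum(len(pair) for pair in pairs) == 12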
Example no. 21
    def test_one_jar(self):
        job = MRWordCount([
            '-r', 'hadoop',
            '--libjar', '/path/to/a.jar',
        ])
        job.sandbox()

        with job.make_runner() as runner:
            runner._add_job_files_for_upload()
            args = runner._args_for_streaming_step(0)

            self.assertIn('-libjars', args)
            self.assertIn('/path/to/a.jar', args)
Example no. 22
    def test_custom_gcloud_bin(self):
        job = MRWordCount([
            '-r', 'dataproc', '--ssh-tunnel', '--gcloud-bin',
            '/path/to/gcloud -v'
        ])

        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args = self.mock_Popen.call_args[0][0]

        self.assertEqual(args[:4], ['/path/to/gcloud', '-v', 'compute', 'ssh'])
Example no. 23
    def test_classic_streaming_step_without_mr_job_script(self):
        # classic MRJob mappers and reducers require a MRJob script
        steps = MRWordCount([])._steps_desc()

        self.assertRaises(ValueError,
                          LocalMRJobRunner,
                          steps=steps, stdin=BytesIO(b'one\ntwo\n'))
Example no. 24
    def test_open_ssh_tunnel(self):
        job = MRWordCount(
            ['-r', 'dataproc', '--ssh-tunnel', '--ssh-tunnel-is-open'])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args = self.mock_Popen.call_args[0][0]

        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)

        self.assertIn('-g', args)
        self.assertIn('-4', args)
Example no. 25
    def test_hadoop_output_format(self):
        output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

        # one-step job
        job1 = MRWordCount()
        # no cmd-line argument for this because it's part of job semantics
        job1.HADOOP_OUTPUT_FORMAT = output_format
        with job1.make_runner() as runner1:
            self.assertEqual(runner1._hadoop_args_for_step(0),
                             ['-outputformat', output_format])

        # multi-step job: only use -outputformat on the last step
        job2 = MRTwoStepJob()
        job2.HADOOP_OUTPUT_FORMAT = output_format
        with job2.make_runner() as runner2:
            self.assertEqual(runner2._hadoop_args_for_step(0), [])
            self.assertEqual(runner2._hadoop_args_for_step(1),
                             ['-outputformat', output_format])
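The two runners above encode the rule that -outputformat belongs only to a job's final step, just as -inputformat would belong only to the first. A minimal sketch of that gate (names hypothetical):

    def output_format_args(step_num, num_steps, output_format):
        # only the last step writes the job's final output
        if output_format and step_num == num_steps - 1:
            return ['-outputformat', output_format]
        return []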
Example no. 26
    def test_jobconf(self):
        jobconf_args = ['--jobconf', 'FOO=bar',
                        '--jobconf', 'BAZ=qux',
                        '--jobconf', 'BAX=Arnold']

        job = MRWordCount(jobconf_args)
        with job.make_runner() as runner:
            self.assertEqual(runner._hadoop_args_for_step(0),
                             ['-D', 'BAX=Arnold',
                              '-D', 'BAZ=qux',
                              '-D', 'FOO=bar',
                              ])

        job_0_18 = MRWordCount(jobconf_args + ['--hadoop-version', '0.18'])
        with job_0_18.make_runner() as runner_0_18:
            self.assertEqual(runner_0_18._hadoop_args_for_step(0),
                             ['-jobconf', 'BAX=Arnold',
                              '-jobconf', 'BAZ=qux',
                              '-jobconf', 'FOO=bar',
                              ])
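The 0.18 runner shows the version switch behind these flags: Hadoop 0.20+ streaming accepts the generic -D option, while older releases needed -jobconf. A self-contained sketch of that check (mrjob.compat ships a similar uses_generic_jobconf helper):

    def uses_generic_jobconf(version):
        # Hadoop 0.20 introduced generic options such as -D
        return tuple(int(p) for p in version.split('.')[:2]) >= (0, 20)

    def jobconf_flag(version):
        return '-D' if uses_generic_jobconf(version) else '-jobconf'

    # jobconf_flag('0.18') -> '-jobconf'; jobconf_flag('2.7.1') -> '-D'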
Example no. 27
    def test_log_messages(self):
        self.get_lines.return_value = [
            '18/04/17 22:06:15 INFO mapreduce.Job:  map 100% reduce 0%\n',
            '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
            '\tFile System Counters\n',
            '\t\tFILE: Number of bytes read=819\n',
        ]

        mr_job = MRWordCount(['-r', 'dataproc'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            runner.run()

        self.assertIn(call('  map 100% reduce 0%'),
                      self.log.info.call_args_list)

        self.assertIn(
            call('Counters: 1\n\tFile System Counters\n\t\tFILE:'
                 ' Number of bytes read=819'), self.log.info.call_args_list)
Example no. 28
    def test_input_files_and_setting_number_of_tasks(self):
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nqux\nfoo\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b'foo\n')
        input_gz.close()

        mr_job = MRWordCount(['-r', self.RUNNER,
                              '--jobconf=mapred.map.tasks=3',
                              '--jobconf=mapred.reduce.tasks=3',
                              input_path, input_gz_path])
        mr_job.sandbox()

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            for line in runner.stream_output():
                key, value = mr_job.parse_output_line(line)
                results.append((key, value))

            self.assertEqual(runner.counters()[0]['count']['combiners'], 3)

        self.assertEqual(sorted(results),
                         [(input_path, 3), (input_gz_path, 1)])
Example no. 29
    def test_input_files(self):
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nqux\nfoo\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\n')

        mr_job = MRWordCount(['-r', self.RUNNER,
                              input_path, input_gz_path])
        mr_job.sandbox()

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            results.extend(mr_job.parse_output(runner.cat_output()))

            self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

        self.assertEqual(sorted(results),
                         [('file://' + input_path, 3),
                          ('file://' + input_gz_path, 1)])
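For orientation, the assertions in the input-file tests imply a job that emits one (input file, word count) pair per file and bumps a 'count'/'combiners' counter. A minimal MRWordCount consistent with that behavior (a sketch, not the actual tests/mr_word_count.py source):

    from mrjob.compat import jobconf_from_env
    from mrjob.job import MRJob

    class MRWordCount(MRJob):

        def mapper(self, _, line):
            # key each count by the file this mapper is reading
            yield (jobconf_from_env('mapreduce.map.input.file'),
                   len(line.split()))

        def combiner(self, path, counts):
            self.increment_counter('count', 'combiners')
            yield path, sum(counts)

        def reducer(self, path, counts):
            yield path, sum(counts)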
Example no. 30
    def test_default_ssh_tunnel(self):
        job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args_tuple, kwargs = self.mock_Popen.call_args
        args = args_tuple[0]

        self.assertEqual(kwargs, dict(stdin=PIPE, stdout=PIPE, stderr=PIPE))

        self.assertEqual(args[:3], ['gcloud', 'compute', 'ssh'])

        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)

        self.assertNotIn('-g', args)
        self.assertNotIn('-4', args)

        self.mock_Popen.return_value.stdin.write.assert_called_once_with(
            b'\n\n')
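Together with test_open_ssh_tunnel and test_custom_gcloud_bin above, this pins down the shape of the tunnel command: the (possibly multi-token) gcloud binary, then compute ssh, then -L/-N/-n/-q, with -g and -4 added only when the tunnel should accept remote connections. A sketch of that construction (the forwarding spec is left as a parameter):

    def ssh_tunnel_args(gcloud_bin, forwarding_spec, open_tunnel=False):
        args = list(gcloud_bin) + ['compute', 'ssh']
        args += ['-L', forwarding_spec, '-N', '-n', '-q']
        if open_tunnel:
            args += ['-g', '-4']  # listen on all interfaces, IPv4 only
        return args

    # ssh_tunnel_args(['/path/to/gcloud', '-v'], spec)[:4]
    #     -> ['/path/to/gcloud', '-v', 'compute', 'ssh']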
Example no. 31
    def test_input_files(self):
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(b'bar\nqux\nfoo\n')

        input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
        with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
            input_gz.write(b'foo\n')

        mr_job = MRWordCount(['-r', self.RUNNER,
                              input_path, input_gz_path])
        mr_job.sandbox()

        results = []

        with mr_job.make_runner() as runner:
            runner.run()

            results.extend(mr_job.parse_output(runner.cat_output()))

            self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

        self.assertEqual(sorted(results),
                         [(input_path, 3), (input_gz_path, 1)])
Example no. 32
    def test_job_name_specified_run_twice(self):
        job_name = datetime.datetime.now().strftime('WordCount2-%Y%m%d%H%M%S')
        try:
            job = MRWordCount(['--job-name', job_name,
                               '--cleanup', 'NONE', __file__])
            with job.make_runner() as runner:
                runner.run()
            job2 = MRWordCount(['--job-name', job_name, __file__])
            with job2.make_runner() as runner2:
                runner2.run()
        except OSError:
            self.fail('Local scratch was not auto-deleted')
Example no. 33
    def test_job_closed_on_cleanup(self):
        job = MRWordCount()
        with job.make_runner() as runner:
            # do nothing
            self.assertFalse(runner._closed)
        self.assertTrue(runner._closed)
Example no. 34
    def test_can_disable_check_input_paths_in_config(self):
        job = MRWordCount()
        with mrjob_conf_patcher(
                {'runners': {'inline': {'check_input_paths': False}}}):
            with job.make_runner() as runner:
                self.assertFalse(runner._opts['check_input_paths'])
Example no. 35
    def test_check_input_paths_disabled(self):
        job = MRWordCount(['--no-check-input-paths'])
        with job.make_runner() as runner:
            self.assertFalse(runner._opts['check_input_paths'])