def test_persistent_cluster(self): mr_job = MRWordCount(["-r", "dataproc", "--max-hours-idle", "0.01"]) mr_job.sandbox() with mr_job.make_runner() as runner: runner.run() self.assertRanIdleTimeoutScriptWith(runner, {"mrjob-max-secs-idle": "36"})
def test_partitioner(self):
    partitioner = 'org.apache.hadoop.mapreduce.Partitioner'

    job = MRWordCount(['--partitioner', partitioner])
    with job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-partitioner', partitioner])
def test_default(self): mr_job = MRWordCount(["-r", "dataproc"]) mr_job.sandbox() with mr_job.make_runner() as runner: runner.run() self.assertRanIdleTimeoutScriptWith(runner, {"mrjob-max-secs-idle": "360"})
def test_environment_variables_021(self):
    job = MRWordCount(['-r', 'local', '--hadoop-version', '0.21'])
    with job.make_runner() as runner:
        simulated_jobconf = runner._simulate_jobconf_for_step(
            0, 'mapper', 0, '/tmp/foo')
        self.assertIn('mapreduce.job.cache.local.archives',
                      simulated_jobconf)
        self.assertNotIn('mapred.cache.localArchives',
                         simulated_jobconf)
def test_environment_variables_version_agnostic(self):
    job = MRWordCount(['-r', 'local'])
    with job.make_runner() as runner:
        simulated_jobconf = runner._simulate_jobconf_for_step(
            'mapper', 0, 0)
        self.assertIn('mapred.cache.localArchives', simulated_jobconf)
        self.assertIn('mapreduce.job.cache.local.archives',
                      simulated_jobconf)
def test_empty_jobconf_values(self): # value of None means to omit that jobconf job = MRWordCount() # no way to pass in None from the command line job.JOBCONF = {"foo": "", "bar": None} with job.make_runner() as runner: self.assertEqual(runner._hadoop_args_for_step(0), ["-D", "foo="])
def test_empty_jobconf_values(self):
    # value of None means to omit that jobconf
    job = MRWordCount()
    # no way to pass in None from the command line
    job.JOBCONF = {'foo': '', 'bar': None}

    with job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-D', 'foo='])
def test_default(self):
    job = MRWordCount(['-r', 'dataproc'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertFalse(self.mock_Popen.called)
def test_environment_variables_hadoop_2(self):
    job = MRWordCount(['-r', 'local', '--hadoop-version', '2.7.2'])
    with job.make_runner() as runner:
        simulated_jobconf = runner._simulate_jobconf_for_step(
            'mapper', 0, 0)
        self.assertIn('mapreduce.job.cache.local.archives',
                      simulated_jobconf)
        self.assertNotIn('mapred.cache.localArchives',
                         simulated_jobconf)
def test_persistent_cluster(self):
    mr_job = MRWordCount(['-r', 'dataproc', '--max-hours-idle', '0.01'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()
        self.assertRanIdleTimeoutScriptWith(runner, {
            'mrjob-max-secs-idle': '36',
        })
def test_configuration_translation(self): job = MRWordCount(["--jobconf", "mapred.jobtracker.maxtasks.per.job=1", "--hadoop-version", "0.21"]) with job.make_runner() as runner: with no_handlers_for_logger("mrjob.compat"): self.assertEqual( runner._hadoop_args_for_step(0), ["-D", "mapred.jobtracker.maxtasks.per.job=1", "-D", "mapreduce.jobtracker.maxtasks.perjob=1"], )
def test_empty(self):
    job = MRWordCount(['-r', 'hadoop'])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()
        args = runner._args_for_streaming_step(0)

        self.assertNotIn('-libjars', args)
def test_default(self):
    mr_job = MRWordCount(['-r', 'dataproc'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()
        self.assertRanIdleTimeoutScriptWith(runner, {
            'mrjob-max-secs-idle': '360',
        })
def test_environment_variables_version_agnostic(self):
    job = MRWordCount(['-r', 'local'])
    with job.make_runner() as runner:
        simulated_jobconf = runner._simulate_jobconf_for_step(
            0, 'mapper', 0, '/tmp/foo')
        self.assertIn('mapred.cache.localArchives', simulated_jobconf)
        self.assertIn('mapreduce.job.cache.local.archives',
                      simulated_jobconf)
def test_configuration_translation(self): job = MRWordCount(["--jobconf", "mapred.jobtracker.maxtasks.per.job=1"]) with job.make_runner() as runner: with no_handlers_for_logger("mrjob.runner"): with patch.object(runner, "get_hadoop_version", return_value="2.7.1"): self.assertEqual( runner._hadoop_args_for_step(0), ["-D", "mapred.jobtracker.maxtasks.per.job=1", "-D", "mapreduce.jobtracker.maxtasks.perjob=1"], )
def test_disable_check_input_paths(self):
    missing_data = os.path.join(self.tmp_dir, 'data')
    job = MRWordCount(['--no-check-input-paths', missing_data])

    self.start(patch('mrjob.inline.InlineMRJobRunner._run',
                     side_effect=StopIteration))

    with job.make_runner() as runner:
        self.assertRaises(StopIteration, runner.run)
def test_job_passes_in_steps(self):
    job = MRWordCount([])
    job.sandbox()

    with job.make_runner() as runner:
        self.assertTrue(runner._steps)

        runner.run()

        self.assertFalse(self.log.warning.called)
def test_can_disable_check_input_paths_in_config(self):
    job = MRWordCount()

    with mrjob_conf_patcher(
            {'runners': {'inline': {'check_input_paths': False}}}):
        with job.make_runner() as runner:
            self.assertFalse(runner._opts['check_input_paths'])
def test_cmdenv(self):
    job = MRWordCount(['--cmdenv', 'FOO=bar',
                       '--cmdenv', 'BAZ=qux',
                       '--cmdenv', 'BAX=Arnold'])
    with job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-cmdenv', 'BAX=Arnold',
                          '-cmdenv', 'BAZ=qux',
                          '-cmdenv', 'FOO=bar',
                          ])
def test_configuration_translation(self):
    job = MRWordCount(
        ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1',
         '--hadoop-version', '0.21'])
    with job.make_runner() as runner:
        with no_handlers_for_logger('mrjob.compat'):
            self.assertEqual(
                runner._hadoop_args_for_step(0),
                ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                 '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'])
def test_missing_gcloud_bin(self):
    self.mock_Popen.side_effect = OSError(2, 'No such file or directory')

    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        self.assertTrue(runner._give_up_on_ssh_tunnel)
def test_load_steps(self):
    job = MRWordCount()
    job.sandbox()

    with job.make_runner() as runner:
        runner._steps = None

        runner.run()

        self.assertTrue(runner._steps)
        self.assertTrue(self.log.warning.called)
def test_jobconf(self):
    jobconf_args = ['--jobconf', 'FOO=bar',
                    '--jobconf', 'BAZ=qux',
                    '--jobconf', 'BAX=Arnold']

    job = MRWordCount(jobconf_args)
    with job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=qux',
                          '-D', 'FOO=bar',
                          ])
def test_error_from_gcloud_bin(self):
    self.mock_Popen.return_value.returncode = 255

    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertGreater(self.mock_Popen.call_count, 1)
        self.assertFalse(runner._give_up_on_ssh_tunnel)
def test_configuration_translation(self):
    job = MRWordCount(
        ['--jobconf', 'mapred.jobtracker.maxtasks.per.job=1'])
    with job.make_runner() as runner:
        with no_handlers_for_logger('mrjob.runner'):
            with patch.object(runner, 'get_hadoop_version',
                              return_value='2.7.1'):
                self.assertEqual(
                    runner._hadoop_args_for_step(0),
                    ['-D', 'mapred.jobtracker.maxtasks.per.job=1',
                     '-D', 'mapreduce.jobtracker.maxtasks.perjob=1'])
def test_jobconf(self): jobconf_args = ["--jobconf", "FOO=bar", "--jobconf", "BAZ=qux", "--jobconf", "BAX=Arnold"] job = MRWordCount(jobconf_args) with job.make_runner() as runner: self.assertEqual(runner._hadoop_args_for_step(0), ["-D", "BAX=Arnold", "-D", "BAZ=qux", "-D", "FOO=bar"]) job_0_18 = MRWordCount(jobconf_args + ["--hadoop-version", "0.18"]) with job_0_18.make_runner() as runner_0_18: self.assertEqual( runner_0_18._hadoop_args_for_step(0), ["-jobconf", "BAX=Arnold", "-jobconf", "BAZ=qux", "-jobconf", "FOO=bar"], )
def test_hadoop_extra_args_comes_first(self):
    job = MRWordCount(
        ['--cmdenv', 'FOO=bar',
         '--hadoop-arg', '-libjar', '--hadoop-arg', 'qux.jar',
         '--jobconf', 'baz=qux',
         '--partitioner', 'java.lang.Object'])
    job.HADOOP_INPUT_FORMAT = 'FooInputFormat'
    job.HADOOP_OUTPUT_FORMAT = 'BarOutputFormat'

    with job.make_runner() as runner:
        hadoop_args = runner._hadoop_args_for_step(0)

        self.assertEqual(hadoop_args[:2], ['-libjar', 'qux.jar'])
        self.assertEqual(len(hadoop_args), 12)
def test_one_jar(self):
    job = MRWordCount([
        '-r', 'hadoop',
        '--libjar', '/path/to/a.jar',
    ])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()
        args = runner._args_for_streaming_step(0)

        self.assertIn('-libjars', args)
        self.assertIn('/path/to/a.jar', args)
def test_custom_gcloud_bin(self):
    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel',
                       '--gcloud-bin', '/path/to/gcloud -v'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args = self.mock_Popen.call_args[0][0]

        self.assertEqual(args[:4],
                         ['/path/to/gcloud', '-v', 'compute', 'ssh'])
def test_classic_streaming_step_without_mr_job_script(self):
    # classic MRJob mappers and reducers require a MRJob script
    steps = MRWordCount([])._steps_desc()

    self.assertRaises(ValueError,
                      LocalMRJobRunner,
                      steps=steps, stdin=BytesIO(b'one\ntwo\n'))
def test_hadoop_output_format(self): output_format = "org.apache.hadoop.mapred.SequenceFileOutputFormat" # one-step job job1 = MRWordCount() # no cmd-line argument for this because it's part of job semantics job1.HADOOP_OUTPUT_FORMAT = output_format with job1.make_runner() as runner1: self.assertEqual(runner1._hadoop_args_for_step(0), ["-outputformat", output_format]) # multi-step job: only use -outputformat on the last step job2 = MRTwoStepJob() job2.HADOOP_OUTPUT_FORMAT = output_format with job2.make_runner() as runner2: self.assertEqual(runner2._hadoop_args_for_step(0), []) self.assertEqual(runner2._hadoop_args_for_step(1), ["-outputformat", output_format])
def test_open_ssh_tunnel(self):
    job = MRWordCount(
        ['-r', 'dataproc', '--ssh-tunnel', '--ssh-tunnel-is-open'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args = self.mock_Popen.call_args[0][0]

        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)
        self.assertIn('-g', args)
        self.assertIn('-4', args)
def test_hadoop_output_format(self):
    output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    # one-step job
    job1 = MRWordCount()
    # no cmd-line argument for this because it's part of job semantics
    job1.HADOOP_OUTPUT_FORMAT = output_format
    with job1.make_runner() as runner1:
        self.assertEqual(runner1._hadoop_args_for_step(0),
                         ['-outputformat', output_format])

    # multi-step job: only use -outputformat on the last step
    job2 = MRTwoStepJob()
    job2.HADOOP_OUTPUT_FORMAT = output_format
    with job2.make_runner() as runner2:
        self.assertEqual(runner2._hadoop_args_for_step(0), [])
        self.assertEqual(runner2._hadoop_args_for_step(1),
                         ['-outputformat', output_format])
def test_jobconf(self):
    jobconf_args = ['--jobconf', 'FOO=bar',
                    '--jobconf', 'BAZ=qux',
                    '--jobconf', 'BAX=Arnold']

    job = MRWordCount(jobconf_args)
    with job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-D', 'BAX=Arnold',
                          '-D', 'BAZ=qux',
                          '-D', 'FOO=bar',
                          ])

    job_0_18 = MRWordCount(jobconf_args + ['--hadoop-version', '0.18'])
    with job_0_18.make_runner() as runner_0_18:
        self.assertEqual(runner_0_18._hadoop_args_for_step(0),
                         ['-jobconf', 'BAX=Arnold',
                          '-jobconf', 'BAZ=qux',
                          '-jobconf', 'FOO=bar',
                          ])
def test_log_messages(self):
    self.get_lines.return_value = [
        '18/04/17 22:06:15 INFO mapreduce.Job: map 100% reduce 0%\n',
        '18/04/17 22:07:34 INFO mapreduce.Job: Counters: 1\n',
        '\tFile System Counters\n',
        '\t\tFILE: Number of bytes read=819\n',
    ]

    mr_job = MRWordCount(['-r', 'dataproc'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()

        self.assertIn(call(' map 100% reduce 0%'),
                      self.log.info.call_args_list)
        self.assertIn(
            call('Counters: 1\n\tFile System Counters\n\t\tFILE:'
                 ' Number of bytes read=819'),
            self.log.info.call_args_list)
def test_input_files_and_setting_number_of_tasks(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\nfoo\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRWordCount(['-r', self.RUNNER,
                          '--jobconf=mapred.map.tasks=3',
                          '--jobconf=mapred.reduce.tasks=3',
                          input_path, input_gz_path])
    mr_job.sandbox()

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        self.assertEqual(runner.counters()[0]['count']['combiners'], 3)

    self.assertEqual(sorted(results),
                     [(input_path, 3), (input_gz_path, 1)])
def test_input_files(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\nfoo\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')

    mr_job = MRWordCount(['-r', self.RUNNER, input_path, input_gz_path])
    mr_job.sandbox()

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        results.extend(mr_job.parse_output(runner.cat_output()))

        self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

    self.assertEqual(sorted(results),
                     [('file://' + input_path, 3),
                      ('file://' + input_gz_path, 1)])
def test_hadoop_extra_args_comes_first(self): job = MRWordCount( [ "--cmdenv", "FOO=bar", "--hadoop-arg", "-libjar", "--hadoop-arg", "qux.jar", "--jobconf", "baz=qux", "--partitioner", "java.lang.Object", ] ) job.HADOOP_INPUT_FORMAT = "FooInputFormat" job.HADOOP_OUTPUT_FORMAT = "BarOutputFormat" with job.make_runner() as runner: hadoop_args = runner._hadoop_args_for_step(0) self.assertEqual(hadoop_args[:2], ["-libjar", "qux.jar"]) self.assertEqual(len(hadoop_args), 12)
def test_default_ssh_tunnel(self):
    job = MRWordCount(['-r', 'dataproc', '--ssh-tunnel'])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        self.assertEqual(self.mock_Popen.call_count, 1)
        args_tuple, kwargs = self.mock_Popen.call_args
        args = args_tuple[0]

        self.assertEqual(kwargs,
                         dict(stdin=PIPE, stdout=PIPE, stderr=PIPE))

        self.assertEqual(args[:3], ['gcloud', 'compute', 'ssh'])
        self.assertIn('-L', args)
        self.assertIn('-N', args)
        self.assertIn('-n', args)
        self.assertIn('-q', args)
        self.assertNotIn('-g', args)
        self.assertNotIn('-4', args)

        self.mock_Popen.stdin.called_once_with(b'\n\n')
def test_input_files(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\nfoo\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')

    mr_job = MRWordCount(['-r', self.RUNNER, input_path, input_gz_path])
    mr_job.sandbox()

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        results.extend(mr_job.parse_output(runner.cat_output()))

        self.assertGreater(runner.counters()[0]['count']['combiners'], 2)

    self.assertEqual(sorted(results),
                     [(input_path, 3), (input_gz_path, 1)])
def test_job_name_specified_run_twice(self):
    job_name = datetime.datetime.now().strftime('WordCount2-%Y%m%d%H%M%S')

    try:
        job = MRWordCount(
            ['--job-name', job_name, '--cleanup', 'NONE', __file__])
        with job.make_runner() as runner:
            runner.run()

        job2 = MRWordCount(['--job-name', job_name, __file__])
        with job2.make_runner() as runner2:
            runner2.run()
    except OSError:
        self.fail('Local scratch was not auto-deleted')
def test_job_closed_on_cleanup(self):
    job = MRWordCount()

    with job.make_runner() as runner:
        # do nothing
        self.assertFalse(runner._closed)

    self.assertTrue(runner._closed)
def test_check_input_paths_disabled(self):
    job = MRWordCount(['--no-check-input-paths'])
    with job.make_runner() as runner:
        self.assertFalse(runner._opts['check_input_paths'])