def test_failed_job(self):
    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path])
    mr_job.sandbox()

    self.add_mock_s3_data({'walrus': {}})
    self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        with logger_disabled('mrjob.emr'):
            assert_raises(Exception, runner.run)

        emr_conn = botoemr.EmrConnection()
        job_flow_id = runner.get_emr_job_flow_id()
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'FAILED')

    # job should get terminated on cleanup
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')

def test_end_to_end(self):
    n_file_path = os.path.join(self.tmp_dir, 'n_file')

    with open(n_file_path, 'w') as f:
        f.write('3')

    os.environ['LOCAL_N_FILE_PATH'] = n_file_path

    stdin = [b'0\n', b'1\n', b'2\n']

    # use local runner so that the file is actually sent somewhere
    mr_job = MRTowerOfPowers(
        ['-v', '--cleanup=NONE', '--n-file', n_file_path,
         '--runner=local'])
    self.assertEqual(len(mr_job.steps()), 3)

    mr_job.sandbox(stdin=stdin)

    with logger_disabled('mrjob.local'):
        with mr_job.make_runner() as runner:
            # make sure our file gets placed in the working dir
            self.assertIn(n_file_path, runner._working_dir_mgr.paths())

            runner.run()

            output = set()
            for _, value in mr_job.parse_output(runner.cat_output()):
                output.add(value)

    self.assertEqual(set(output), set([0, 1, ((2 ** 3) ** 3) ** 3]))

def test_end_to_end(self):
    n_file_path = os.path.join(self.tmp_dir, 'n_file')

    with open(n_file_path, 'w') as f:
        f.write('3')

    os.environ['LOCAL_N_FILE_PATH'] = n_file_path

    stdin = ['0\n', '1\n', '2\n']

    # use local runner so that the file is actually sent somewhere
    mr_job = MRTowerOfPowers(
        ['--no-conf', '-v', '--cleanup=NONE', '--n-file', n_file_path,
         '--runner=local'])
    self.assertEqual(len(mr_job.steps()), 3)

    mr_job.sandbox(stdin=stdin)

    with logger_disabled('mrjob.local'):
        with mr_job.make_runner() as runner:
            # make sure our file gets "uploaded"
            assert [fd for fd in runner._files
                    if fd['path'] == n_file_path]

            runner.run()

            output = set()
            for line in runner.stream_output():
                _, value = mr_job.parse_output_line(line)
                output.add(value)

    self.assertEqual(set(output), set([0, 1, ((2 ** 3) ** 3) ** 3]))

def test_end_to_end(self):
    n_file_path = os.path.join(self.tmp_dir, "n_file")

    with open(n_file_path, "w") as f:
        f.write("3")

    os.environ["LOCAL_N_FILE_PATH"] = n_file_path

    stdin = ["0\n", "1\n", "2\n"]

    # use local runner so that the file is actually sent somewhere
    mr_job = MRTowerOfPowers(
        ["-v", "--cleanup=NONE", "--n-file", n_file_path,
         "--runner=local"])
    self.assertEqual(len(mr_job.steps()), 3)

    mr_job.sandbox(stdin=stdin)

    with logger_disabled("mrjob.local"):
        with mr_job.make_runner() as runner:
            # make sure our file gets placed in the working dir
            self.assertIn(n_file_path, runner._working_dir_mgr.paths())

            runner.run()

            output = set()
            for line in runner.stream_output():
                _, value = mr_job.parse_output_line(line)
                output.add(value)

    self.assertEqual(set(output), set([0, 1, ((2 ** 3) ** 3) ** 3]))

def test_hadoop_version_option_does_nothing(self):
    with logger_disabled('mrjob.dataproc'):
        with self.make_runner('--hadoop-version', '1.2.3.4') as runner:
            runner.run()

            self.assertEqual(runner.get_image_version(),
                             _DEFAULT_IMAGE_VERSION)
            self.assertEqual(runner.get_hadoop_version(), '2.7.2')

def test_dont_take_down_cluster_on_failure(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox()

    self._dataproc_client.job_get_advances_states = (
        collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

    with mr_job.make_runner() as runner:
        self.assertIsInstance(runner, DataprocJobRunner)

        with logger_disabled('mrjob.dataproc'):
            self.assertRaises(StepFailedException, runner.run)

        cluster = self.get_cluster_from_runner(runner, cluster_id)
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'RUNNING')

    # job shouldn't get terminated by cleanup
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'RUNNING')

def test_dont_take_down_cluster_on_failure(self):
    runner1 = DataprocJobRunner(conf_paths=[])

    runner1._launch_cluster()
    cluster_id = runner1._cluster_id

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox()

    self.mock_jobs_succeed = False

    with mr_job.make_runner() as runner2:
        self.assertIsInstance(runner2, DataprocJobRunner)

        with logger_disabled('mrjob.dataproc'):
            self.assertRaises(StepFailedException, runner2.run)

        cluster2 = runner2._get_cluster(runner2._cluster_id)
        self.assertEqual(_cluster_state_name(cluster2.status.state),
                         'RUNNING')

    # job shouldn't get terminated by cleanup
    cluster1 = runner1._get_cluster(runner1._cluster_id)
    self.assertEqual(_cluster_state_name(cluster1.status.state),
                     'RUNNING')

def test_extra_kwargs_passed_in_directly_okay(self):
    with logger_disabled('mrjob.runner'):
        with InlineMRJobRunner(
                conf_paths=[], base_tmp_dir='/var/tmp', foo='bar') as runner:
            self.assertEqual(runner._opts['base_tmp_dir'], '/var/tmp')
            self.assertNotIn('bar', runner._opts)

def test_end_to_end(self):
    n_file_path = os.path.join(self.tmp_dir, 'n_file')

    with open(n_file_path, 'w') as f:
        f.write('3')

    os.environ['LOCAL_N_FILE_PATH'] = n_file_path

    stdin = ['0\n', '1\n', '2\n']

    mr_job = MRTowerOfPowers(
        ['--no-conf', '-v', '--cleanup=NONE', '--n-file', n_file_path])
    self.assertEqual(len(mr_job.steps()), 3)

    mr_job.sandbox(stdin=stdin)

    with logger_disabled('mrjob.local'):
        with mr_job.make_runner() as runner:
            assert isinstance(runner, LocalMRJobRunner)

            # make sure our file gets "uploaded"
            assert [fd for fd in runner._files
                    if fd['path'] == n_file_path]

            runner.run()

            output = set()
            for line in runner.stream_output():
                _, value = mr_job.parse_output_line(line)
                output.add(value)

    self.assertEqual(set(output), set([0, 1, ((2**3)**3)**3]))

def test_mr(self):
    kwargs = {
        'mapper': _IDENTITY_MAPPER,
        'reducer': _IDENTITY_REDUCER,
    }
    with logger_disabled('mrjob.job'):
        self.assertEqual(MRJob.mr(**kwargs), MRStep(**kwargs))

def test_extra_kwargs_passed_in_directly_okay(self):
    with logger_disabled('mrjob.runner'):
        opts = RunnerOptionStore(
            'inline', {'local_tmp_dir': '/var/tmp', 'foo': 'bar'}, [])

        self.assertEqual(opts['local_tmp_dir'], '/var/tmp')
        self.assertNotIn('bar', opts)

def test_jar(self):
    kwargs = {
        'jar': 'binks.jar.jar',
        'main_class': 'MyMainMan',
        'args': ['argh', 'argh'],
    }
    with logger_disabled('mrjob.job'):
        self.assertEqual(MRJob.jar(**kwargs), JarStep(**kwargs))

def test_parse_counters(self):
    mr_job = MRJob().sandbox()

    mr_job.increment_counter('Foo', 'Bar')
    mr_job.increment_counter('Foo', 'Bar')
    mr_job.increment_counter('Foo', 'Baz', 20)

    with logger_disabled('mrjob.job'):
        self.assertEqual(mr_job.parse_counters(),
                         {'Foo': {'Bar': 2, 'Baz': 20}})

def test_extra_kwargs_passed_in_directly_okay(self):
    with logger_disabled('mrjob.runner'):
        runner = InlineMRJobRunner(
            foo='bar', local_tmp_dir='/var/tmp', conf_paths=[])

        self.assertEqual(runner._opts['local_tmp_dir'], '/var/tmp')
        self.assertNotIn('bar', runner._opts)

def test_load_mrjob_conf_and_load_opts(self):
    conf_path = os.path.join(self.tmp_dir, "mrjob.conf.2")
    with open(conf_path, "w") as f:
        f.write('{"runners": {"foo": {"qux": "quux"}}}')

    self.assertEqual(load_mrjob_conf(conf_path=conf_path),
                     {"runners": {"foo": {"qux": "quux"}}})
    self.assertEqual(
        load_opts_from_mrjob_conf("foo", conf_path=conf_path)[0][1],
        {"qux": "quux"})

    # test missing options
    with logger_disabled("mrjob.conf"):
        self.assertEqual(
            load_opts_from_mrjob_conf("bar", conf_path=conf_path)[0][1],
            {})

def test_parse_output(self):
    # test parsing JSON
    mr_job = MRJob()
    output = b'0\t1\n"a"\t"b"\n'
    mr_job.stdout = BytesIO(output)

    with logger_disabled('mrjob.job'):
        self.assertEqual(mr_job.parse_output(), [(0, 1), ('a', 'b')])

    # verify that stdout is not cleared
    self.assertEqual(mr_job.stdout.getvalue(), output)

def test_parse_output(self):
    # test parsing JSON
    mr_job = MRJob()
    output = '0\t1\n"a"\t"b"\n'
    mr_job.stdout = StringIO(output)

    with logger_disabled('mrjob.job'):
        self.assertEqual(mr_job.parse_output(), [(0, 1), ('a', 'b')])

    # verify that stdout is not cleared
    self.assertEqual(mr_job.stdout.getvalue(), output)

def test_parse_output_with_protocol_instance(self):
    # see if we can use the repr protocol
    mr_job = MRJob()
    output = b"0\t1\n['a', 'b']\tset(['c', 'd'])\n"
    mr_job.stdout = BytesIO(output)

    with logger_disabled('mrjob.job'):
        self.assertEqual(mr_job.parse_output(ReprProtocol()),
                         [(0, 1), (['a', 'b'], set(['c', 'd']))])

    # verify that stdout is not cleared
    self.assertEqual(mr_job.stdout.getvalue(), output)

def test_parse_output_with_protocol_instance(self):
    # see if we can use the repr protocol
    mr_job = MRJob()
    output = "0\t1\n['a', 'b']\tset(['c', 'd'])\n"
    mr_job.stdout = StringIO(output)

    with logger_disabled('mrjob.job'):
        self.assertEqual(mr_job.parse_output(ReprProtocol()),
                         [(0, 1), (['a', 'b'], set(['c', 'd']))])

    # verify that stdout is not cleared
    self.assertEqual(mr_job.stdout.getvalue(), output)

def test_job_name_prefix_is_now_label(self):
    with logger_disabled('mrjob.runner'):
        old_way = LocalMRJobRunner(
            conf_path=False, job_name_prefix='ads_chain')
        old_opts = old_way.get_opts()

        new_way = LocalMRJobRunner(conf_path=False, label='ads_chain')
        new_opts = new_way.get_opts()

        assert_equal(old_opts, new_opts)
        assert_equal(old_opts['label'], 'ads_chain')
        assert_not_in('job_name_prefix', old_opts)

def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(["-r", "local", "--setup", "true"])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch("mrjob.runner.open",
               create=True, side_effect=open) as m_open:
        with logger_disabled("mrjob.local"):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, "w"),
                    m_open.mock_calls)

def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch('mrjob.sim.open',
               create=True, side_effect=open) as m_open:
        with logger_disabled('mrjob.local'):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, 'w'),
                    m_open.mock_calls)

def test_deprecated_command_line_options_override_attrs(self):
    mr_job = MRHadoopFormatJob([
        '--hadoop-input-format',
        'org.apache.hadoop.mapred.lib.NLineInputFormat',
        '--hadoop-output-format',
        'org.apache.hadoop.mapred.FileOutputFormat',
    ])

    with logger_disabled('mrjob.job'):
        assert_equal(mr_job.job_runner_kwargs()['hadoop_input_format'],
                     'org.apache.hadoop.mapred.lib.NLineInputFormat')
        assert_equal(mr_job.job_runner_kwargs()['hadoop_output_format'],
                     'org.apache.hadoop.mapred.FileOutputFormat')

def test_load_mrjob_conf_and_load_opts(self):
    conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
    with open(conf_path, 'w') as f:
        f.write('{"runners": {"foo": {"qux": "quux"}}}')

    assert_equal(load_mrjob_conf(conf_path=conf_path),
                 {'runners': {'foo': {'qux': 'quux'}}})
    assert_equal(load_opts_from_mrjob_conf('foo', conf_path=conf_path),
                 {'qux': 'quux'})

    # test missing options
    with logger_disabled('mrjob.conf'):
        assert_equal(
            load_opts_from_mrjob_conf('bar', conf_path=conf_path), {})

def test_deprecated_command_line_options(self):
    mr_job = MRJob([
        '--hadoop-input-format',
        'org.apache.hadoop.mapred.lib.NLineInputFormat',
        '--hadoop-output-format',
        'org.apache.hadoop.mapred.FileOutputFormat',
    ])

    with logger_disabled('mrjob.job'):
        job_runner_kwargs = mr_job.job_runner_kwargs()
        self.assertEqual(job_runner_kwargs['hadoop_input_format'],
                         'org.apache.hadoop.mapred.lib.NLineInputFormat')
        self.assertEqual(job_runner_kwargs['hadoop_output_format'],
                         'org.apache.hadoop.mapred.FileOutputFormat')

def test_load_mrjob_conf_and_load_opts(self):
    conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
    with open(conf_path, 'w') as f:
        f.write('{"runners": {"foo": {"qux": "quux"}}}')

    with no_handlers_for_logger('mrjob.conf'):
        self.assertEqual(
            load_mrjob_conf(conf_path=conf_path),
            {'runners': {'foo': {'qux': 'quux'}}})
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path=conf_path)[0][1],
            {'qux': 'quux'})

    # test missing options
    with logger_disabled('mrjob.conf'):
        self.assertEqual(
            load_opts_from_mrjob_conf('bar', conf_path=conf_path)[0][1],
            {})

def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'hadoop', '--setup', 'true'])
    job.sandbox(stdin=BytesIO(b''))

    add_mock_hadoop_output([b''])
    add_mock_hadoop_output([b''])

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch('mrjob.runner.open',
               create=True, side_effect=open) as m_open:
        with logger_disabled('mrjob.hadoop'):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, 'wb'),
                    m_open.mock_calls)

def test_hdfs_jar_uri(self):
    # this could change, but for now, we pass URIs straight through
    mock_hdfs_jar = os.path.join(os.environ["MOCK_HDFS_ROOT"], "fake.jar")
    open(mock_hdfs_jar, "w").close()

    jar_uri = "hdfs:///fake.jar"

    job = MRJustAJar(["-r", "hadoop", "--jar", jar_uri])
    job.sandbox()

    with job.make_runner() as runner:
        with logger_disabled("mrjob.hadoop"):
            # `hadoop jar` doesn't actually accept URIs
            self.assertRaises(CalledProcessError, runner.run)

        with open(os.environ["MOCK_HADOOP_LOG"]) as hadoop_log:
            hadoop_jar_lines = [
                line for line in hadoop_log if line.startswith("jar ")]
            self.assertEqual(len(hadoop_jar_lines), 1)
            self.assertEqual(hadoop_jar_lines[0].rstrip(),
                             "jar " + jar_uri)

def test_hdfs_jar_uri(self):
    # this could change, but for now, we pass URIs straight through
    mock_hdfs_jar = os.path.join(get_mock_hdfs_root(), 'fake.jar')
    open(mock_hdfs_jar, 'w').close()

    jar_uri = 'hdfs:///fake.jar'

    job = MRJustAJar(['-r', 'hadoop', '--jar', jar_uri])
    job.sandbox()

    with job.make_runner() as runner:
        with logger_disabled('mrjob.hadoop'):
            # `hadoop jar` doesn't actually accept URIs
            self.assertRaises(StepFailedException, runner.run)

    hadoop_cmd_args = get_mock_hadoop_cmd_args()

    hadoop_jar_cmd_args = [args for args in hadoop_cmd_args
                           if args and args[0] == 'jar']

    self.assertEqual(len(hadoop_jar_cmd_args), 1)
    self.assertEqual(hadoop_jar_cmd_args[0], ['jar', jar_uri])

def test_step_args_kwarg(self):
    with logger_disabled('mrjob.step'):
        self.assertEqual(JarStep(jar='bell.jar', step_args=['5', 'six']),
                         JarStep(jar='bell.jar', args=['5', 'six']))

def test_deprecated_alias(self):
    with logger_disabled('mrjob.launch'):
        self.assertEqual(MRJob().is_mapper_or_reducer(), False)
        self.assertEqual(MRJob(['--mapper']).is_mapper_or_reducer(), True)

def test_extra_kwargs_in_mrjob_conf_okay(self):
    with logger_disabled('mrjob.runner'):
        opts = RunnerOptionStore('inline', {}, [self.path])

        self.assertEqual(opts['setup_cmds'], ['echo foo'])
        self.assertNotIn('qux', opts)

def test_extra_kwargs_in_mrjob_conf_okay(self):
    with logger_disabled('mrjob.runner'):
        with LocalMRJobRunner(conf_path=self.mrjob_conf_path) as runner:
            assert_equal(runner._opts['setup_cmds'], ['echo foo'])
            assert_not_in('qux', runner._opts)

def test_mr_positional_arguments(self):
    with logger_disabled('mrjob.job'):
        self.assertEqual(
            MRJob.mr(_IDENTITY_MAPPER, _IDENTITY_REDUCER),
            MRStep(mapper=_IDENTITY_MAPPER, reducer=_IDENTITY_REDUCER))

def test_extra_kwargs_passed_in_directly_okay(self):
    with logger_disabled('mrjob.runner'):
        with LocalMRJobRunner(
                conf_path=False, base_tmp_dir='/var/tmp', foo='bar') as runner:
            assert_equal(runner._opts['base_tmp_dir'], '/var/tmp')
            assert_not_in('bar', runner._opts)

def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v', '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar'] +
                          list(args) +
                          ['-', local_input_path, remote_input_path] +
                          ['--hadoop-input-format', 'FooFormat'] +
                          ['--hadoop-output-format', 'BarFormat'] +
                          ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    jar_cmd_args = [args for args in hadoop_cmd_args
                    if args[:1] == ['jar']]
    assert_equal(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    assert_in('-inputformat', step_0_args)
    assert_not_in('-outputformat', step_0_args)
    assert_not_in('-inputformat', step_1_args)
    assert_in('-outputformat', step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        assert_in('-libjar', args)
        assert_in('-mapper', args)
        assert_lt(args.index('-libjar'), args.index('-mapper'))

    # make sure -jobconf made it through
    assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_extra_kwargs_in_mrjob_conf_okay(self):
    with logger_disabled('mrjob.runner'):
        runner = InlineMRJobRunner(conf_paths=[self.path])

        self.assertEqual(runner._opts['setup'], ['echo foo'])
        self.assertNotIn('qux', runner._opts)

def test_name_kwarg(self):
    with logger_disabled('mrjob.step'):
        self.assertEqual(JarStep(jar='pickle.jar', name='Bubbies'),
                         JarStep(jar='pickle.jar'))

def test_positional(self):
    with logger_disabled('mrjob.step'):
        self.assertEqual(
            JarStep('foo', 'bell.jar', 'First', ['one', '2']),
            JarStep(jar='bell.jar', main_class='First', args=['one', '2']))

def test_mixed(self):
    with logger_disabled('mrjob.step'):
        self.assertEqual(
            JarStep('foo', jar='bell.jar', args=['3', 'four']),
            JarStep(jar='bell.jar', args=['3', 'four']))

def test_extra_kwargs_in_mrjob_conf_okay(self):
    with logger_disabled('mrjob.runner'):
        with InlineMRJobRunner(conf_path=self.mrjob_conf_path) as runner:
            self.assertEqual(runner._opts['setup_cmds'], ['echo foo'])
            self.assertNotIn('qux', runner._opts)

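
# Every test above wraps noisy calls in a `logger_disabled(...)` context
# manager so expected warnings and errors don't clutter test output. A minimal
# sketch of such a helper is below, assuming it only needs to silence one
# named logger for the duration of the block; this is an illustrative
# assumption, not necessarily the exact helper shipped with mrjob's test
# suite.
from contextlib import contextmanager
import logging


@contextmanager
def logger_disabled(name=None):
    """Temporarily disable the named logger, restoring it on exit."""
    logger = logging.getLogger(name)
    old_disabled = logger.disabled
    logger.disabled = True
    try:
        yield
    finally:
        logger.disabled = old_disabled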