def test_bootstrap_python_comes_before_bootstrap(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])

    with mr_job.make_runner() as runner:
        self.assertEqual(
            runner._bootstrap,
            self.EXPECTED_BOOTSTRAP + [['true']])

def test_dont_take_down_cluster_on_failure(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox()

    self._dataproc_client.job_get_advances_states = collections.deque(
        ['SETUP_DONE', 'RUNNING', 'ERROR'])

    with mr_job.make_runner() as runner:
        self.assertIsInstance(runner, DataprocJobRunner)

        with logger_disabled('mrjob.dataproc'):
            self.assertRaises(StepFailedException, runner.run)

        cluster = self.get_cluster_from_runner(runner, cluster_id)
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'RUNNING')

    # cluster shouldn't get terminated by cleanup
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'RUNNING')

def test_owner_and_label_switches(self):
    runner_opts = ['--no-conf', '--owner=ads', '--label=ads_chain']
    runner = MRTwoStepJob(runner_opts).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())

    assert_equal(match.group(1), 'ads_chain')
    assert_equal(match.group(2), 'ads')

def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])

def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO("foo\nbar\n")

    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "w") as input_file:
        input_file.write("bar\nqux\n")

    input_gz_path = os.path.join(self.tmp_dir, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "w")
    input_gz.write("foo\n")
    input_gz.close()

    mr_job = MRTwoStepJob(["-c", self.mrjob_conf_path,
                           "-", input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    assert_equal(sorted(results),
                 [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf',
                           '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with mr_job.make_runner() as runner:
        runner.run()

        # expect python -v crud in stderr
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                'import mrjob' in line or  # Python 2
                "import 'mrjob'" in line
                for line in lines))

        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                '#' in line for line in lines))

        # should still get expected results
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            sorted([b'1\tnull\n', b'1\t"bar"\n']))

def test_owner_and_label_switches(self):
    runner_opts = ['--no-conf', '--owner=ads', '--label=ads_chain']
    runner = MRTwoStepJob(runner_opts).make_runner()
    match = _JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(match.group(1), 'ads_chain')
    self.assertEqual(match.group(2), 'ads')

def test_failed_job(self):
    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path])
    mr_job.sandbox()

    self.add_mock_s3_data({'walrus': {}})
    self.mock_emr_failures = {('j-MOCKJOBFLOW0', 0): None}

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        with logger_disabled('mrjob.emr'):
            assert_raises(Exception, runner.run)

        emr_conn = botoemr.EmrConnection()
        job_flow_id = runner.get_emr_job_flow_id()
        for i in range(10):
            emr_conn.simulate_progress(job_flow_id)

        job_flow = emr_conn.describe_jobflow(job_flow_id)
        assert_equal(job_flow.state, 'FAILED')

    # job should get terminated on cleanup
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')

def test_missing_input(self):
    mr_job = MRTwoStepJob(['-r', 'inline', '/some/bogus/file/path'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        self.assertRaises(IOError, runner.run)

def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\n')

    input_gz_path = join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(
        ['--runner', 'inline', '-', input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, InlineMRJobRunner)
        runner.run()

        results.extend(mr_job.parse_output(runner.cat_output()))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert exists(local_tmp_dir)

    # make sure cleanup happens
    assert not exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

def test_two_step_job(self):
    # good all-around test. MRTwoStepJob's first step logs counters, but
    # its second step does not
    job = MRTwoStepJob(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        runner.run()

        counters = runner.counters()

        # should have two steps' worth of counters, even though it runs
        # as a single Spark job
        self.assertEqual(len(counters), 2)

        # first step counters should be {'count': {'combiners': <int>}}
        self.assertEqual(sorted(counters[0]), ['count'])
        self.assertEqual(sorted(counters[0]['count']), ['combiners'])
        self.assertIsInstance(counters[0]['count']['combiners'], int)

        # second step counters should be empty
        self.assertEqual(counters[1], {})

    log_output = '\n'.join(c[0][0] for c in self.log.info.call_args_list)
    log_lines = log_output.split('\n')

    # should log first step counters but not second step
    self.assertIn('Counters for step 1: 1', log_lines)
    self.assertIn('\tcount', log_output)
    self.assertNotIn('Counters for step 2', log_output)

def test_bootstrap_python_switch(self):
    mr_job = MRTwoStepJob(["-r", "dataproc", "--bootstrap-python"])

    with mr_job.make_runner() as runner:
        self.assertEqual(runner._opts["bootstrap_python"], True)
        self.assertEqual(runner._bootstrap_python(),
                         self.EXPECTED_BOOTSTRAP)
        self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)

def test_failed_job(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        stderr = StringIO()
        log_to_stream('mrjob.dataproc', stderr)

        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            self.assertIn(' => ERROR\n', stderr.getvalue())

            cluster_id = runner.get_cluster_id()

    # cluster should get terminated
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'DELETING')

def test_attach_to_existing_cluster(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '--cluster-id', cluster_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        self.put_job_output_parts(runner, [
            b'1\t"bar"\n1\t"foo"\n2\tnull\n'
        ])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])

def test_owner_and_label_switches(self):
    runner_opts = ["--no-conf", "--owner=ads", "--label=ads_chain"]
    runner = MRTwoStepJob(runner_opts).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(match.group(1), "ads_chain")
    self.assertEqual(match.group(2), "ads")

def test_streaming_step_not_okay(self):
    job = MRTwoStepJob()
    job.sandbox()

    with job.make_runner() as runner:
        self.assertRaises(
            TypeError,
            runner._spark_script_args, 0)

def test_default(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc'])

    with mr_job.make_runner() as runner:
        self.assertEqual(runner._opts['bootstrap_python'], True)
        self.assertEqual(runner._bootstrap_python(),
                         self.EXPECTED_BOOTSTRAP)
        self.assertEqual(runner._bootstrap, self.EXPECTED_BOOTSTRAP)

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n',
                            '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()

        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_debugging_works(self):
    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--enable-emr-debugging'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        runner.run()
        flow = runner.make_emr_conn().describe_jobflow(
            runner._emr_job_flow_id)
        assert_equal(flow.steps[0].name, 'Setup Hadoop Debugging')

def test_default(self):
    mr_job = MRTwoStepJob(['-r', 'local'])
    mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with mr_job.make_runner() as runner:
        self.assertEqual(runner._num_mappers(0), cpu_count())
        self.assertEqual(runner._num_reducers(0), cpu_count())

        runner.run()

    self.pool.assert_called_with(processes=None)

def test_echo_as_steps_python_bin(self):
    mr_job = MRTwoStepJob(["--steps", "--steps-python-bin", "echo",
                           "--no-conf", "-r", "local"])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        # MRTwoStepJob populates _steps in the runner, so un-populate
        # it here so that the runner actually tries to get the steps
        # via subprocess
        runner._steps = None

        self.assertRaises(ValueError, runner._get_steps)

def test_three_cores(self):
    mr_job = MRTwoStepJob(['-r', 'local', '--num-cores', '3'])
    mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with mr_job.make_runner() as runner:
        self.assertEqual(runner._num_mappers(0), 3)
        self.assertEqual(runner._num_reducers(0), 3)

        runner.run()

    self.pool.assert_called_with(processes=3)

def test_blank_out_counters_if_not_output(self):
    self.start(patch('mrjob.bin.MRJobBinRunner._run_spark_submit',
                     return_value=2))

    job = MRTwoStepJob(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        self.assertRaises(StepFailedException, runner.run)

        # should blank out counters from failed step
        self.assertEqual(runner.counters(), [{}, {}])

def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(["-r", "local", "--setup", "true"])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch("mrjob.runner.open", create=True,
               side_effect=open) as m_open:
        with logger_disabled("mrjob.local"):
            with job.make_runner() as runner:
                runner.run()

                self.assertIn(
                    call(runner._setup_wrapper_script_path, "w"),
                    m_open.mock_calls)

def make_runner(self, *args):
    """create a dummy job, and call make_runner() on it.

    Use this in a with block:

    with self.make_runner() as runner:
        ...
    """
    stdin = BytesIO(b'foo\nbar\n')
    mr_job = MRTwoStepJob(['-r', 'emr'] + list(args))
    mr_job.sandbox(stdin=stdin)

    return mr_job.make_runner()

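# Hedged usage sketch (an addition, not part of the original suite):
# the sandbox()/make_runner() pattern documented in the helper above,
# driven end to end with the inline runner. It only uses APIs that
# appear elsewhere in this file (sandbox, make_runner, run, cat_output,
# parse_output); the expected result matches what the suite's other
# tests produce for this stdin.
def _example_sandbox_run(self):
    mr_job = MRTwoStepJob(['-r', 'inline', '--no-conf'])
    mr_job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with mr_job.make_runner() as runner:
        runner.run()
        # for this input, expect [(1, 'bar'), (1, 'foo'), (2, None)]
        return sorted(mr_job.parse_output(runner.cat_output()))
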
def test_default_hadoop_version(self):
    stdin = StringIO('foo\nbar\n')
    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path])
    mr_job.sandbox(stdin=stdin)

    with mr_job.make_runner() as runner:
        runner.run()
        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(
            runner.get_emr_job_flow_id())
        assert_equal(job_flow.hadoopversion, '0.18')

def test_unexpected_opt_from_command_line(self):
    # regression test for #1898. local runner doesn't support *zone*
    job = MRTwoStepJob(['-r', 'local', '--no-conf', '--zone', 'DANGER'])
    job.sandbox()

    with job.make_runner():
        self.assertTrue(self.log.warning.called)
        warnings = '\n'.join(
            arg[0][0] for arg in self.log.warning.call_args_list)

        self.assertIn('Unexpected option', warnings)
        self.assertIn('zone', warnings)
        self.assertIn('command line', warnings)

def test_setup_wrapper_script_uses_local_line_endings(self):
    job = MRTwoStepJob(['-r', 'local', '--setup', 'true'])
    job.sandbox(stdin=BytesIO())

    # tests #1071. Unfortunately, we mostly run these tests on machines
    # that use unix line endings anyway. So monitor open() instead
    with patch(
            'mrjob.sim.open', create=True, side_effect=open) as m_open:
        with job.make_runner() as runner:
            runner.run()

            self.assertIn(
                call(runner._setup_wrapper_script_path, 'w'),
                m_open.mock_calls)

def test_auto_everything(self):
    test_start = datetime.datetime.utcnow()

    os.environ["USER"] = "mcp"
    runner = MRTwoStepJob(["--no-conf"]).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(match.group(1), "mr_two_step_job")
    self.assertEqual(match.group(2), "mcp")

    job_start = datetime.datetime.strptime(
        match.group(3) + match.group(4), "%Y%m%d%H%M%S")
    job_start = job_start.replace(microsecond=int(match.group(5)))
    self.assertGreaterEqual(job_start, test_start)
    self.assertLessEqual(job_start - test_start,
                         datetime.timedelta(seconds=5))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin,
                "fs", "-put", input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n',
                            '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(["-r", "hadoop", "-v", "--no-conf",
                           "-", local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()

        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

    assert_equal(sorted(results),
                 [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def run_job(self, args=()):
    args = ([sys.executable, MRTwoStepJob.mr_job_script()] +
            list(args) + ['--no-conf'])
    # add . to PYTHONPATH (in case mrjob isn't actually installed)
    env = combine_envs(os.environ,
                       {'PYTHONPATH': os.path.abspath('.')})
    proc = Popen(args, stdin=PIPE, stdout=PIPE, stderr=PIPE, env=env)
    stdout, stderr = proc.communicate(input='foo\nbar\nbar\n')
    return stdout, stderr, proc.returncode

def test_hadoop_output_format(self):
    output_format = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    # one-step job
    job1 = MRWordCount()
    # no cmd-line argument for this because it's part of job semantics
    job1.HADOOP_OUTPUT_FORMAT = output_format
    with job1.make_runner() as runner1:
        self.assertEqual(runner1._hadoop_args_for_step(0),
                         ['-outputformat', output_format])

    # multi-step job: only use -outputformat on the last step
    job2 = MRTwoStepJob()
    job2.HADOOP_OUTPUT_FORMAT = output_format
    with job2.make_runner() as runner2:
        self.assertEqual(runner2._hadoop_args_for_step(0), [])
        self.assertEqual(runner2._hadoop_args_for_step(1),
                         ['-outputformat', output_format])

def test_echo_as_python_bin(self):
    # "echo" is a pretty poor substitute for Python, but it
    # should be available on most systems
    mr_job = MRTwoStepJob(
        ['--python-bin', 'echo', '--steps-python-bin', sys.executable,
         '--no-conf', '-r', 'local'])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()
        output = b''.join(runner.cat_output())

    # the output should basically be the command we used to
    # run the last step, which in this case is a mapper
    self.assertIn(b'mr_two_step_job.py', output)
    self.assertIn(b'--step-num=1', output)
    self.assertIn(b'--mapper', output)

def test_unexpected_opt_from_mrjob_conf(self):
    conf_path = self.makefile('mrjob.custom.conf')
    with open(conf_path, 'w') as f:
        dump_mrjob_conf(
            dict(runners=dict(local=dict(land='useless_swamp'))), f)

    job = MRTwoStepJob(['-r', 'local', '-c', conf_path])
    job.sandbox()

    with job.make_runner():
        self.assertTrue(self.log.warning.called)
        warnings = '\n'.join(
            arg[0][0] for arg in self.log.warning.call_args_list)

        self.assertIn('Unexpected option', warnings)
        self.assertIn('land', warnings)
        self.assertIn(conf_path, warnings)

def _test_cloud_tmp_cleanup(self, mode, tmp_len):
    stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v',
                           '-', '--cleanup', mode])
    mr_job.sandbox(stdin=stdin)

    with mr_job.make_runner() as runner:
        tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

        runner.run()

        # this is set and unset before we can get at it unless we do this
        list(runner.cat_output())

        fs = runner.fs

    # with statement finishes, cleanup runs
    self.assertEqual(
        len(list(fs.client.bucket(tmp_bucket).list_blobs())),
        tmp_len)

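# Hedged example (added for illustration, not from the original file):
# parameterized helpers like _test_cloud_tmp_cleanup() above are meant
# to be called from thin per-mode wrappers. 'ALL' is mrjob's
# clean-everything cleanup mode; the expected blob count of 0 is an
# illustrative assumption.
def test_cleanup_all(self):
    self._test_cloud_tmp_cleanup('ALL', 0)
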
def test_echo_as_steps_python_bin(self):
    mr_job = MRTwoStepJob([
        '--steps', '--steps-python-bin', 'echo', '--no-conf',
        '-r', 'local'
    ])
    mr_job.sandbox()

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        try:
            # make_runner() populates _steps in the runner, so un-populate
            # it here so that the runner actually tries to get the steps
            # via subprocess
            runner._steps = None
            runner._get_steps()
            assert False, 'Should throw exception'
        except ValueError as ex:
            output = str(ex)
            # the output should basically be the command used to
            # run the steps command
            self.assertIn('mr_two_step_job.py', output)
            self.assertIn('--steps', output)

def test_attach_to_existing_cluster(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(
        ['-r', 'dataproc', '-v', '--cluster-id', cluster_id])
    mr_job.sandbox(stdin=stdin)

    results = []
    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        self.put_job_output_parts(
            runner, [b'1\t"bar"\n1\t"foo"\n2\tnull\n'])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        results.extend(mr_job.parse_output(runner.cat_output()))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])

def test_failed_job(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        stderr = StringIO()
        log_to_stream('mrjob.dataproc', stderr)

        self.mock_jobs_succeed = False

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            self.assertIn(' => ERROR\n', stderr.getvalue())

            cluster_id = runner.get_cluster_id()

    # cluster should get terminated
    cluster = runner._get_cluster(cluster_id)
    self.assertEqual(
        _cluster_state_name(cluster.status.state), 'DELETING')

def test_two_step_job(self):
    input1_path = self.makefile('input1')
    input2_path = self.makefile('input2')

    job = MRTwoStepJob([
        '-r', 'hadoop',
        '--hadoop-bin', 'false',  # shouldn't run; just in case
        input1_path, input2_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()

        input_uris_0 = runner._step_input_uris(0)
        self.assertEqual([os.path.basename(uri) for uri in input_uris_0],
                         ['input1', 'input2'])

        output_uri_0 = runner._step_output_uri(0)
        input_uris_1 = runner._step_input_uris(1)
        self.assertEqual(input_uris_1, [output_uri_0])

        output_uri_1 = runner._step_output_uri(1)
        self.assertEqual(output_uri_1, runner._output_dir)

def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(['--python-bin', python_cmd, '--no-conf'])
    mr_job.sandbox(stdin=['bar\n'])

    with no_handlers_for_logger():
        mr_job.run_job()

    # expect debugging messages in stderr
    assert_in('import mrjob', mr_job.stderr.getvalue())
    assert_in('#', mr_job.stderr.getvalue())

    # should still get expected results
    assert_equal(sorted(mr_job.parse_output()),
                 [(1, None), (1, 'bar')])

def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=['bar\n'])

    with no_handlers_for_logger():
        mr_job.run_job()

    # expect debugging messages in stderr
    self.assertIn('import mrjob', mr_job.stderr.getvalue())
    self.assertIn('#', mr_job.stderr.getvalue())

    # should still get expected results
    self.assertItemsEqual(mr_job.stdout.getvalue().splitlines(),
                          ['1\tnull', '1\t"bar"'])

def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with no_handlers_for_logger():
        mr_job.run_job()

    # expect debugging messages in stderr.
    stderr = mr_job.stderr.getvalue()

    # stderr is huge, so don't use assertIn()
    self.assertTrue(b'import mrjob' in stderr or  # Python 2
                    b"import 'mrjob'" in stderr)  # Python 3
    self.assertTrue(b'#' in stderr)

    # should still get expected results
    self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()),
                     sorted([b'1\tnull', b'1\t"bar"']))

def test_show_steps(self):
    # step descriptions use one letter per substep:
    # M = mapper, C = combiner, R = reducer
    mr_boring_job = MRBoringJob(['--steps'])
    mr_boring_job.sandbox()
    mr_boring_job.show_steps()
    self.assertEqual(mr_boring_job.stdout.getvalue(), 'MR\n')

    # final mappers don't show up in the step description
    mr_final_boring_job = MRFinalBoringJob(['--steps'])
    mr_final_boring_job.sandbox()
    mr_final_boring_job.show_steps()
    self.assertEqual(mr_final_boring_job.stdout.getvalue(), 'MR\n')

    mr_two_step_job = MRTwoStepJob(['--steps'])
    mr_two_step_job.sandbox()
    mr_two_step_job.show_steps()
    self.assertEqual(mr_two_step_job.stdout.getvalue(), 'MCR M\n')

    mr_no_mapper = MRNoMapper(['--steps'])
    mr_no_mapper.sandbox()
    mr_no_mapper.show_steps()
    self.assertEqual(mr_no_mapper.stdout.getvalue(), 'MR R\n')

def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO('foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz_glob = os.path.join(self.tmp_dir, '*.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob([
        '-c', self.mrjob_conf_path,
        '-r', 'local',
        '-', input_path, input_gz_glob
    ])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

        self.assertEqual(runner.counters()[0]['count']['combiners'], 8)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

def test_end_to_end_multiple_tasks(self):
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob([
        '-r', 'local',
        '--jobconf=mapred.map.tasks=2',
        '--jobconf=mapred.reduce.tasks=2',
        '-', input_path, input_gz_path
    ])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

def test_nonexistent_steps(self):
    mr_job = MRTwoStepJob()
    mr_job.sandbox()

    self.assertRaises(ValueError, mr_job.run_reducer, 1)
    self.assertRaises(ValueError, mr_job.run_mapper, 2)
    self.assertRaises(ValueError, mr_job.run_reducer, -1)

def test_auto_label(self):
    runner = MRTwoStepJob(['--no-conf']).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())

    self.assertEqual(match.group(1), 'mr_two_step_job')
    self.assertEqual(match.group(2), getpass.getuser())

def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n',
                            '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob([
        '-r', 'hadoop', '-v', '--no-conf',
        '--hadoop-arg', '-libjar',
        '--hadoop-arg', 'containsJars.jar'] +
        list(args) +
        ['-', local_input_path, remote_input_path] +
        ['--hadoop-input-format', 'FooFormat'] +
        ['--hadoop-output-format', 'BarFormat'] +
        ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()

        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'],
                  pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

    jar_cmd_args = [args for args in hadoop_cmd_args
                    if args[:1] == ['jar']]
    assert_equal(len(jar_cmd_args), 2)
    step_0_args, step_1_args = jar_cmd_args

    # check input/output format
    assert_in('-inputformat', step_0_args)
    assert_not_in('-outputformat', step_0_args)
    assert_not_in('-inputformat', step_1_args)
    assert_in('-outputformat', step_1_args)

    # make sure -libjar extra arg comes before -mapper
    for args in (step_0_args, step_1_args):
        assert_in('-libjar', args)
        assert_in('-mapper', args)
        assert_lt(args.index('-libjar'), args.index('-mapper'))

    # make sure -jobconf made it through
    assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {
        ('j-MOCKJOBFLOW0', 1): ['1\t"qux"\n2\t"bar"\n',
                                '2\t"foo"\n5\tnull\n']
    }

    mr_job = MRTwoStepJob([
        '-r', 'emr', '-v', '-c', self.mrjob_conf_path,
        '-', local_input_path, remote_input_path,
        '--hadoop-input-format', 'FooFormat',
        '--hadoop-output-format', 'BarFormat'
    ])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(
            runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        assert_in('-inputformat', job_flow.steps[0].args)
        assert_not_in('-outputformat', job_flow.steps[0].args)
        assert_not_in('-inputformat', job_flow.steps[1].args)
        assert_in('-outputformat', job_flow.steps[1].args)

        # make sure mrjob.tar.gz is created and uploaded as
        # a bootstrap file
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

        # shouldn't be in PYTHONPATH (we dump it directly in
        # site-packages)
        pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
        assert_not_in(mrjob_tar_gz_file_dict['name'],
                      pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')

def test_base_classes_cant_have_steps(self):
    steps = MRTwoStepJob([])._steps_desc()

    self.assertRaises(NotImplementedError, MRJobRunner, steps=steps)

def test_no_warning_by_default(self):
    job = MRTwoStepJob(['-r', 'local', '--no-conf'])
    job.sandbox()

    with job.make_runner():
        self.assertFalse(self.log.warning.called)

def test_no_bootstrap_python_switch(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc', '--no-bootstrap-python'])

    with mr_job.make_runner() as runner:
        self.assertEqual(runner._opts['bootstrap_python'], False)
        self.assertEqual(runner._bootstrap_python(), [])
        self.assertEqual(runner._bootstrap, [])

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n',
                            '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()

        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'],
                  pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))