def test_precedence_deprecated(self):
    """Deprecated config locations are found in increasing-precedence order,
    each with a deprecation warning; $MRJOB_CONF beats them all."""
    os.environ["HOME"] = "/home/foo"
    os.environ["PYTHONPATH"] = "/py1:/py2"
    self._existing_paths = set()

    # nothing exists yet
    self.assertEqual(find_mrjob_conf(), None)

    # /etc location is not deprecated, so no warning expected
    self._existing_paths.add("/etc/mrjob.conf")
    self.assertEqual(find_mrjob_conf(), "/etc/mrjob.conf")

    # each deprecated path takes precedence over the previous winner
    # and triggers a deprecation warning
    for deprecated_path in ("/py2/mrjob.conf",
                            "/py1/mrjob.conf",
                            "/home/foo/.mrjob"):
        self._existing_paths.add(deprecated_path)
        with no_handlers_for_logger():
            log_buf = log_to_buffer("mrjob.conf")
            self.assertEqual(find_mrjob_conf(), deprecated_path)
            self.assertIn("This config path is deprecated",
                          log_buf.getvalue())

    # $MRJOB_CONF wins over everything else
    mrjob_conf_path = os.path.join(self.tmp_dir, "mrjob.conf")
    open(mrjob_conf_path, "w").close()
    os.environ["MRJOB_CONF"] = mrjob_conf_path
    self._existing_paths.add(mrjob_conf_path)
    self.assertEqual(find_mrjob_conf(), mrjob_conf_path)
def test_precedence_deprecated(self):
    """Deprecated config locations are found in increasing-precedence order,
    each with a deprecation warning; $MRJOB_CONF beats them all."""
    os.environ['HOME'] = '/home/foo'
    os.environ['PYTHONPATH'] = '/py1:/py2'
    self._existing_paths = set()

    # nothing exists yet
    assert_equal(find_mrjob_conf(), None)

    # /etc location is not deprecated, so no warning expected
    self._existing_paths.add('/etc/mrjob.conf')
    assert_equal(find_mrjob_conf(), '/etc/mrjob.conf')

    # each deprecated path takes precedence over the previous winner
    # and triggers a deprecation warning
    for deprecated_path in ('/py2/mrjob.conf',
                            '/py1/mrjob.conf',
                            '/home/foo/.mrjob'):
        self._existing_paths.add(deprecated_path)
        with no_handlers_for_logger():
            log_buf = self._log_to_buffer()
            assert_equal(find_mrjob_conf(), deprecated_path)
            assert_in('This config path is deprecated', log_buf.getvalue())

    # $MRJOB_CONF wins over everything else
    mrjob_conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')
    open(mrjob_conf_path, 'w').close()
    os.environ['MRJOB_CONF'] = mrjob_conf_path
    self._existing_paths.add(mrjob_conf_path)
    assert_equal(find_mrjob_conf(), mrjob_conf_path)
def test_round_trip(self):
    """A conf dict dumped to disk should load back unchanged.

    Fix: the original passed ``open(conf_path, "w")`` directly to
    dump_mrjob_conf(), leaking the file handle; use a ``with`` block so
    the file is closed (and flushed) deterministically before reloading.
    """
    conf = {"runners": {"foo": {"qux": "quux"}}}
    conf_path = os.path.join(self.tmp_dir, "mrjob.conf")

    with open(conf_path, "w") as f:
        dump_mrjob_conf(conf, f)

    with no_handlers_for_logger("mrjob.conf"):
        self.assertEqual(conf, load_mrjob_conf(conf_path=conf_path))
def test_non_log_lines(self):
    """Leading non-log lines each produce a warning; trailing unrecognized
    lines are folded into the preceding log message."""
    lines = StringIO(
        "foo\n"
        "bar\n"
        "15/12/11 13:26:08 ERROR streaming.StreamJob:"
        " Error Launching job :"
        " Output directory already exists\n"
        "Streaming Command Failed!"
    )

    with no_handlers_for_logger("mrjob.logs.parse"):
        log_buf = StringIO()
        log_to_stream("mrjob.logs.parse", log_buf)

        # ignore leading non-log lines
        expected_record = dict(
            timestamp="15/12/11 13:26:08",
            level="ERROR",
            logger="streaming.StreamJob",
            thread=None,
            # no way to know that Streaming Command Failed! wasn't part
            # of a multi-line message
            message=(
                "Error Launching job :"
                " Output directory already exists\n"
                "Streaming Command Failed!"
            ),
        )
        self.assertEqual(
            list(_parse_hadoop_log_lines(lines)), [expected_record])

        # should be one warning for each leading non-log line
        self.assertEqual(len(log_buf.getvalue().splitlines()), 2)
def test_kill_persistent_cluster(self):
    """_cleanup_cluster() should issue a cluster-delete API call even for a
    cluster the runner was told to reuse."""
    with no_handlers_for_logger("mrjob.dataproc"):
        runner = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          "_api_cluster_delete") as delete_mock:
            runner._opts["cluster_id"] = "j-MOCKCLUSTER0"
            runner._cleanup_cluster()
            self.assertTrue(delete_mock.called)
def test_failed_job(self):
    """A job driven into the ERROR state raises StepFailedException, logs
    the transition, and its cluster ends up DELETING after cleanup."""
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        log_buf = StringIO()
        log_to_stream('mrjob.dataproc', log_buf)

        # make the mock job march straight into an error state
        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)
            self.assertIn(' => ERROR\n', log_buf.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'DELETING')
def test_non_log_lines(self):
    """Leading non-log lines each warn; trailing junk joins the last
    parsed message."""
    lines = StringIO('foo\n'
                     'bar\n'
                     '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                     ' Error Launching job :'
                     ' Output directory already exists\n'
                     'Streaming Command Failed!')

    with no_handlers_for_logger('mrjob.logs.parse'):
        log_buf = StringIO()
        log_to_stream('mrjob.logs.parse', log_buf)

        parsed = list(_parse_hadoop_log_lines(lines))

        # ignore leading non-log lines
        self.assertEqual(
            parsed,
            [dict(timestamp='15/12/11 13:26:08',
                  level='ERROR',
                  logger='streaming.StreamJob',
                  thread=None,
                  # no way to know that Streaming Command Failed! wasn't
                  # part of a multi-line message
                  message=('Error Launching job :'
                           ' Output directory already exists\n'
                           'Streaming Command Failed!'))])

        # should be one warning for each leading non-log line
        self.assertEqual(len(log_buf.getvalue().splitlines()), 2)
def assert_hadoop_version(self, JobClass, version_string):
    """Assert *JobClass* reports *version_string* for hadoop_version in its
    jobconf, while warning that the value should be a string."""
    job = JobClass()
    log_buf = StringIO()
    with no_handlers_for_logger("mrjob.job"):
        log_to_stream("mrjob.job", log_buf)
        self.assertEqual(job.jobconf()["hadoop_version"], version_string)
        self.assertIn("should be a string", log_buf.getvalue())
def test_getattr_forward(self):
    """Runner attribute access forwards to its option store."""
    with no_handlers_for_logger():
        runner = InlineMRJobRunner(conf_path=False)
        opt_store = runner._opts
        self.assertIsInstance(opt_store, InlineRunnerOptionStore)
        defaults = runner.get_default_opts()
        self.assertEqual(defaults, opt_store.default_options())
def test_messy_error(self):
    """An unparseable COUNTERS field yields (None, None) and a warning."""
    counter_string = 'Job JOBID="_001" FAILED_REDUCES="0" COUNTERS="THIS IS NOT ACTUALLY A COUNTER"'
    with no_handlers_for_logger(''):
        log_buf = StringIO()
        log_to_stream('mrjob.parse', log_buf, level=logging.WARN)
        assert_equal((None, None),
                     parse_hadoop_counters_from_line(counter_string))
        assert_in('Cannot parse Hadoop counter line', log_buf.getvalue())
def test_cleanup_options(self):
    """Deprecated *_SCRATCH / JOB_FLOW cleanup names are mapped to their
    new names, and each rename is logged."""
    log_buf = StringIO()
    with no_handlers_for_logger('mrjob.runner'):
        log_to_stream('mrjob.runner', log_buf)
        opts = RunnerOptionStore(
            'inline',
            dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                 cleanup_on_failure=['JOB_FLOW', 'SCRATCH']),
            [])

        self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
        self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])

        # every rename should have produced a deprecation message
        for message in (
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP',
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP',
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER',
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP'):
            self.assertIn(message, log_buf.getvalue())
def test_hadoop_runner(self):
    """-r hadoop produces a HadoopJobRunner (faked-out environment)."""
    # you can't instantiate a HadoopJobRunner without Hadoop installed
    launcher = MRJobLauncher(
        args=["--no-conf", "-r", "hadoop", "",
              "--hadoop-streaming-jar", "HUNNY"])
    with no_handlers_for_logger("mrjob.runner"):
        with patch.dict(os.environ, {"HADOOP_HOME": "100-Acre Wood"}):
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, HadoopJobRunner)
def _test_round_trip(self, conf):
    """Dump *conf* to a temp file and check it loads back unchanged."""
    path = os.path.join(self.tmp_dir, 'mrjob.conf')
    with open(path, 'w') as conf_file:
        dump_mrjob_conf(conf, conf_file)

    with no_handlers_for_logger('mrjob.conf'):
        reloaded = load_mrjob_conf(conf_path=path)
    self.assertEqual(conf, reloaded)
def test_round_trip(self):
    """A conf dict dumped to disk should load back unchanged.

    Fix: the original passed ``open(conf_path, 'w')`` directly to
    dump_mrjob_conf(), leaking the file handle; use a ``with`` block so
    the file is closed (and flushed) deterministically before reloading.
    """
    conf = {'runners': {'foo': {'qux': 'quux'}}}
    conf_path = os.path.join(self.tmp_dir, 'mrjob.conf')

    with open(conf_path, 'w') as f:
        dump_mrjob_conf(conf, f)

    with no_handlers_for_logger('mrjob.conf'):
        self.assertEqual(conf, load_mrjob_conf(conf_path=conf_path))
def test_fallback(self):
    """When nothing else is configured, get_hadoop_bin() looks up 'hadoop'
    on the default path and falls back to the bare name."""
    self.assertFalse(self.which.called)

    with no_handlers_for_logger('mrjob.fs.hadoop'):
        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])

    self.which.assert_called_once_with('hadoop', path=None)
def test_kill_persistent_cluster(self):
    """_cleanup_cluster() deletes even a persistent (reused) cluster."""
    with no_handlers_for_logger('mrjob.dataproc'):
        runner = self._quick_runner()
        with patch.object(mrjob.dataproc.DataprocJobRunner,
                          '_api_cluster_delete') as delete_mock:
            runner._opts['cluster_id'] = 'j-MOCKCLUSTER0'
            runner._cleanup_cluster()
            self.assertTrue(delete_mock.called)
def _test_environment_variable(self, envvar, *dirnames):
    """Check if we can find the hadoop binary from *envvar*"""
    # okay to add after HadoopFilesystem() created; it hasn't looked yet
    expected_bin = self._add_hadoop_bin_for_envvar(envvar, *dirnames)
    with no_handlers_for_logger('mrjob.fs.hadoop'):
        self.assertEqual(self.fs.get_hadoop_bin(), [expected_bin])
def test_deprecated_mapper_final_positional_arg(self):
    """mapper_final may still be given positionally (with a warning), but
    giving it both positionally and by keyword is a TypeError."""
    def mapper(k, v):
        pass

    def reducer(k, v):
        pass

    def mapper_final():
        pass

    log_buf = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.job', log_buf)
        step = MRJob.mr(mapper, reducer, mapper_final)

    # should be allowed to specify mapper_final as a positional arg,
    # but we log a warning
    self.assertEqual(
        step,
        MRJob.mr(mapper=mapper, reducer=reducer,
                 mapper_final=mapper_final))
    self.assertIn('mapper_final should be specified', log_buf.getvalue())

    # can't specify mapper_final as a positional and keyword arg
    self.assertRaises(
        TypeError,
        MRJob.mr, mapper, reducer, mapper_final,
        mapper_final=mapper_final)
def test_can_turn_off_bootstrap_mrjob(self):
    """With bootstrap_mrjob off, the job must not import mrjob from the
    runner's temp dir (or it fails with ImportError when mrjob isn't
    installed system-wide)."""
    with mrjob_conf_patcher(
            {"runners": {"local": {"bootstrap_mrjob": False}}}):
        mr_job = MRJobWhereAreYou(["-r", "local"])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            # sanity check
            self.assertEqual(runner.get_opts()["bootstrap_mrjob"], False)
            local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

            try:
                with no_handlers_for_logger():
                    runner.run()
            except Exception as e:
                # if mrjob is not installed, script won't be able to run
                self.assertIn("ImportError", str(e))
                return

            output = list(runner.stream_output())
            self.assertEqual(len(output), 1)

            # script should not load mrjob from local_tmp_dir
            _, script_mrjob_dir = mr_job.parse_output_line(output[0])
            self.assertFalse(script_mrjob_dir.startswith(local_tmp_dir))
def test_bad_sort(self):
    """_invoke_sort() surfaces a CalledProcessError when sort fails."""
    self.use_bad_sort()

    runner = MRJobRunner(conf_paths=[])
    with no_handlers_for_logger():
        self.assertRaises(CalledProcessError,
                          runner._invoke_sort, [self.a, self.b], self.out)
def test_large_amounts_of_stderr(self):
    """A job that spews huge counter/status output should still fail
    cleanly via sys.exit(), with the interesting bits captured in stderr."""
    mr_job = MRVerboseJob(['--no-conf', '-r', 'local', '-v'])
    mr_job.sandbox()

    try:
        with no_handlers_for_logger():
            mr_job.run_job()
    except TimeoutException:
        raise
    except SystemExit:
        # we expect the job to throw a StepFailedException,
        # which causes run_job to call sys.exit()

        # look for expected output from MRVerboseJob
        stderr = mr_job.stderr.getvalue()
        for snippet in (b"Counters: 1\n\tFoo\n\t\tBar=10000",
                        b'Status: 0\n',
                        b'Status: 99\n',
                        b'STDERR: Qux\n',
                        # exception should appear in exception message
                        b'BOOM'):
            self.assertIn(snippet, stderr)
        self.assertNotIn(b'Status: 100\n', stderr)
    else:
        raise AssertionError()
def test_can_turn_off_bootstrap_mrjob(self):
    """With bootstrap_mrjob off, an installed mrjob must not be loaded
    from the runner's temp dir."""
    with mrjob_conf_patcher(
            {'runners': {'local': {'bootstrap_mrjob': False}}}):
        mr_job = MRJobWhereAreYou(['-r', 'local'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            # sanity check
            self.assertEqual(runner._opts['bootstrap_mrjob'], False)
            local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

            try:
                with no_handlers_for_logger():
                    runner.run()
            except StepFailedException:
                # this is what happens when mrjob isn't installed elsewhere
                return

            # however, if mrjob is installed, we need to verify that
            # we're using the installed version and not a bootstrapped copy
            output = list(mr_job.parse_output(runner.cat_output()))
            self.assertEqual(len(output), 1)

            # script should not load mrjob from local_tmp_dir
            _, script_mrjob_dir = output[0]
            self.assertFalse(script_mrjob_dir.startswith(local_tmp_dir))
def test_python_dash_v_as_python_bin(self):
    """'python -v' as python_bin floods task stderr with import chatter,
    but the job should still produce correct output."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with no_handlers_for_logger():
        with mr_job.make_runner() as runner:
            runner.run()

            stderr_path = runner._task_stderr_path('mapper', 0, 0)

            # expect python -v crud in stderr
            with open(stderr_path) as stderr_file:
                self.assertTrue(any(
                    'import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in stderr_file))

            with open(stderr_path) as stderr_file:
                self.assertTrue(any('#' in line for line in stderr_file))

            # should still get expected results
            self.assertEqual(
                sorted(to_lines(runner.cat_output())),
                sorted([b'1\tnull\n', b'1\t"bar"\n']))
def test_mixed_behavior_2(self):
    """MRInconsistentJob2 uses its custom input protocol and warns."""
    log_buf = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.job', log_buf)
        job = self.MRInconsistentJob2()
        self.assertEqual(job.options.input_protocol, None)
        self.assertEqual(job.input_protocol().__class__, ReprProtocol)
        self.assertIn('custom behavior', log_buf.getvalue())
def test_hadoop_runner(self):
    """-r hadoop produces a HadoopJobRunner (faked-out environment)."""
    # you can't instantiate a HadoopJobRunner without Hadoop installed
    launcher = MRJobLauncher(
        args=['--no-conf', '-r', 'hadoop', '',
              '--hadoop-streaming-jar', 'HUNNY'])
    with no_handlers_for_logger('mrjob.runner'):
        with patch.dict(os.environ, {'HADOOP_HOME': '100-Acre Wood'}):
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, HadoopJobRunner)
def test_verbose(self):
    """set_up_logging(verbose=True) lets DEBUG messages through to stderr."""
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            logger = logging.getLogger('__main__')
            logger.info('INFO')
            logger.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
def test_path_exists(self):
    """Deprecated path_exists() should delegate to exists()."""
    fs = Filesystem()
    with patch.object(fs, 'exists'):
        with no_handlers_for_logger('mrjob.fs.base'):
            fs.path_exists('foo')
        fs.exists.assert_called_once_with('foo')
def test_default_options(self):
    """Default logging setup shows INFO but filters out DEBUG."""
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', cStringIO.StringIO()) as stderr:
            MRJob.set_up_logging()
            logger = logging.getLogger('__main__')
            logger.info('INFO')
            logger.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\n')
def test_path_join(self):
    """Deprecated path_join() should delegate to join()."""
    fs = Filesystem()
    with patch.object(fs, 'join'):
        with no_handlers_for_logger('mrjob.fs.base'):
            fs.path_join('foo', 'bar')
        fs.join.assert_called_once_with('foo', 'bar')
def test_path_join(self):
    """Deprecated path_join() should forward its arguments to join()."""
    fs = Filesystem()
    with patch.object(fs, "join"):
        with no_handlers_for_logger("mrjob.fs.base"):
            fs.path_join("foo", "bar")
        fs.join.assert_called_once_with("foo", "bar")
def test_prefer_own_methods(self):
    """The runner forwards ls() to its filesystem (with a deprecation
    warning) but keeps its own get_hadoop_version()."""
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        log_buf = StringIO()
        log_to_stream('mrjob.runner', log_buf)

        self.assertEqual(runner.ls, runner.fs.ls)

        # Hadoop Runner has its own version
        self.assertNotEqual(runner.get_hadoop_version,
                            runner.fs.get_hadoop_version)

        self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                      log_buf.getvalue())
        self.assertNotIn('get_hadoop_version', log_buf.getvalue())
def test_wrapper_script_only_writes_to_stderr(self):
    """Stray output from a setup command must land in the log, never in
    the job's own output stream."""
    job = MROSWalkJob(['-r', 'local',
                       '--setup', 'echo stray output'])
    job.sandbox()

    with no_handlers_for_logger('mrjob.local'):
        log_buf = StringIO()
        log_to_stream('mrjob.local', log_buf)

        with job.make_runner() as r:
            r.run()

            output = ''.join(r.stream_output())

            # stray output should be in stderr, not the job's output
            self.assertIn('stray output', log_buf.getvalue())
            self.assertNotIn('stray output', output)
def test_with_header(self):
    """A 'Counters: N' header line is tolerated without any warnings."""
    # NOTE(review): the exact leading whitespace in these fixture lines
    # appears mangled in this copy of the file — confirm against the
    # parser's expected indentation
    counter_lines = [
        'Counters: 1',
        ' File System Counters',
        ' FILE: Number of bytes read=86',
    ]

    with no_handlers_for_logger('mrjob.logs.step'):
        log_buf = StringIO()
        log_to_stream('mrjob.logs.step', log_buf)

        self.assertEqual(
            _parse_indented_counters(counter_lines),
            {'File System Counters': {'FILE: Number of bytes read': 86}})

        # header shouldn't freak it out
        self.assertEqual(log_buf.getvalue(), '')
def test_emr_runner_option_store(self):
    """Every deprecated EMR option name is translated to its new name,
    removed from the store, and logged as deprecated."""
    log_buf = StringIO()
    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', log_buf)
        opts = EMRRunnerOptionStore(
            'emr',
            dict(base_tmp_dir='/scratch',
                 emr_job_flow_id='j-CLUSTERID',
                 emr_job_flow_pool_name='liver',
                 pool_emr_job_flows=True,
                 s3_scratch_uri='s3://bucket/walrus'),
            [])

        # (old option name, new option name, expected value)
        renames = [
            ('emr_job_flow_id', 'cluster_id', 'j-CLUSTERID'),
            ('base_tmp_dir', 'local_tmp_dir', '/scratch'),
            ('pool_emr_job_flows', 'pool_clusters', True),
            ('emr_job_flow_pool_name', 'pool_name', 'liver'),
            ('s3_scratch_uri', 'cloud_tmp_dir', 's3://bucket/walrus'),
        ]
        for old_name, new_name, expected in renames:
            self.assertEqual(opts[new_name], expected)
            self.assertNotIn(old_name, opts)
            self.assertIn(
                'Deprecated option %s has been renamed to %s' % (
                    old_name, new_name),
                log_buf.getvalue())
def test_load_mrjob_conf_and_load_opts(self):
    """load_mrjob_conf() parses the conf file; load_opts_from_mrjob_conf()
    extracts one runner's options (empty dict for unknown runners)."""
    conf_path = os.path.join(self.tmp_dir, 'mrjob.conf.2')
    with open(conf_path, 'w') as f:
        f.write('{"runners": {"foo": {"qux": "quux"}}}')

    with no_handlers_for_logger('mrjob.conf'):
        self.assertEqual(
            load_mrjob_conf(conf_path=conf_path),
            {'runners': {'foo': {'qux': 'quux'}}})
        self.assertEqual(
            load_opts_from_mrjob_conf('foo', conf_path=conf_path)[0][1],
            {'qux': 'quux'})

    # test missing options
    with logger_disabled('mrjob.conf'):
        self.assertEqual(
            load_opts_from_mrjob_conf('bar', conf_path=conf_path)[0][1],
            {})
def test_python_dash_v_as_python_bin(self):
    """'python -v' as python_bin floods stderr with import chatter, but
    the job should still produce correct output."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with no_handlers_for_logger():
        mr_job.run_job()

    # expect debugging messages in stderr.
    job_stderr = mr_job.stderr.getvalue()

    # stderr is huge, so don't use assertIn()
    self.assertTrue(b'import mrjob' in job_stderr or      # Python 2
                    b"import 'mrjob'" in job_stderr)      # Python 3
    self.assertTrue(b'#' in job_stderr)

    # should still get expected results
    self.assertEqual(sorted(mr_job.stdout.getvalue().splitlines()),
                     sorted([b'1\tnull', b'1\t"bar"']))
def test_hadoop_runner_option_store(self):
    """Deprecated Hadoop option names are translated to their new names,
    removed from the store, and logged as deprecated."""
    log_buf = StringIO()
    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', log_buf)
        opts = HadoopRunnerOptionStore(
            'hadoop',
            dict(base_tmp_dir='/scratch',
                 hdfs_scratch_dir='hdfs:///scratch'),
            [])

        # (old option name, new option name, expected value)
        for old_name, new_name, expected in [
                ('base_tmp_dir', 'local_tmp_dir', '/scratch'),
                ('hdfs_scratch_dir', 'hadoop_tmp_dir', 'hdfs:///scratch')]:
            self.assertEqual(opts[new_name], expected)
            self.assertNotIn(old_name, opts)
            self.assertIn(
                'Deprecated option %s has been renamed to %s' % (
                    old_name, new_name),
                log_buf.getvalue())
def test_conf_contain_only_include_file(self):
    """If a config file only include other configuration files
    no warnings are thrown as long as the included files are not
    empty.
    """
    # dummy configuration for include file 1
    include_file_1 = self.save_conf(
        'include_file_1',
        {'runners': {'inline': {
            'local_tmp_dir': "include_file1_local_tmp_dir"}}})

    # dummy configuration for include file 2
    include_file_2 = self.save_conf(
        'include_file_2',
        {'runners': {'inline': {
            'local_tmp_dir': "include_file2_local_tmp_dir"}}})

    # test configuration
    path = self.save_conf(
        'twoincludefiles',
        {'include': [include_file_1, include_file_2]})

    log_buf = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.conf', log_buf)
        InlineMRJobRunner(conf_paths=[path])
        self.assertEqual("", log_buf.getvalue())
def test_large_amounts_of_stderr(self):
    """A job that spews huge counter/status output should still fail with
    an exception whose message survives, and the interesting bits should
    land in stderr.

    Fix: ``except Exception, e:`` is Python-2-only syntax and a
    SyntaxError on Python 3; ``except Exception as e:`` works on
    Python 2.6+ and 3.x alike.
    """
    mr_job = MRVerboseJob(['--no-conf'])
    mr_job.sandbox()

    try:
        with no_handlers_for_logger():
            mr_job.run_job()
    except TimeoutException:
        raise
    except Exception as e:
        # we expect the job to throw an exception

        # look for expected output from MRVerboseJob
        stderr = mr_job.stderr.getvalue()
        assert_in("Counters from step 1:\n Foo:\n Bar: 10000", stderr)
        assert_in('status: 0\n', stderr)
        assert_in('status: 99\n', stderr)
        assert_not_in('status: 100\n', stderr)
        assert_in('STDERR: Qux\n', stderr)
        # exception should appear in exception message
        assert_in('BOOM', repr(e))
def test_archive_upload(self):
    """An archive is unpacked under its own name and under a '#' alias."""
    job = MROSWalkJob([
        '-r', 'local',
        '--archive', self.foo_tar_gz,
        '--archive', self.foo_tar_gz + '#foo',
    ])
    job.sandbox()

    with job.make_runner() as r:
        with no_handlers_for_logger('mrjob.local'):
            r.run()

        path_to_size = dict(job.parse_output_line(line)
                            for line in r.stream_output())

        self.assertEqual(path_to_size.get('./foo.tar.gz/foo.py'),
                         self.foo_py_size)
        self.assertEqual(path_to_size.get('./foo/foo.py'),
                         self.foo_py_size)
def test_emr_runner_option_store(self):
    """Deprecated EMR option names are translated to their new names,
    removed from the store, and logged as deprecated."""
    log_buf = StringIO()
    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', log_buf)
        opts = EMRRunnerOptionStore(
            'emr',
            dict(base_tmp_dir='/scratch',
                 s3_scratch_uri='s3://bucket/walrus'),
            [])

        # (old option name, new option name, expected value)
        for old_name, new_name, expected in [
                ('base_tmp_dir', 'local_tmp_dir', '/scratch'),
                ('s3_scratch_uri', 's3_tmp_dir', 's3://bucket/walrus')]:
            self.assertEqual(opts[new_name], expected)
            self.assertNotIn(old_name, opts)
            self.assertIn(
                'Deprecated option %s has been renamed to %s' % (
                    old_name, new_name),
                log_buf.getvalue())
def test_cleanup_options(self):
    """Deprecated *_SCRATCH cleanup names are renamed (with warnings)."""
    log_buf = StringIO()
    with no_handlers_for_logger('mrjob.runner'):
        log_to_stream('mrjob.runner', log_buf)
        opts = RunnerOptionStore(
            'inline',
            dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                 cleanup_on_failure=['SCRATCH']),
            [])

        self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'REMOTE_TMP'])
        for message in (
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP',
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to REMOTE_TMP'):
            self.assertIn(message, log_buf.getvalue())

        # should quietly convert string to list
        self.assertEqual(opts['cleanup_on_failure'], ['TMP'])
        self.assertIn(
            'Deprecated cleanup_on_failure option SCRATCH has been renamed'
            ' to TMP', log_buf.getvalue())
def test_failed_job(self):
    """A failing job raises StepFailedException, logs the ERROR
    transition, and its cluster ends up DELETING after cleanup."""
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        log_buf = StringIO()
        log_to_stream('mrjob.dataproc', log_buf)

        self.mock_jobs_succeed = False

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)
            self.assertIn(' => ERROR\n', log_buf.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated
    cluster = runner._get_cluster(cluster_id)
    self.assertEqual(
        _cluster_state_name(cluster.status.state), 'DELETING')
def test_python_dash_v_as_python_bin(self):
    """'python -v' as python_bin floods task stderr but the job still
    yields the expected output."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with no_handlers_for_logger():
        with mr_job.make_runner() as runner:
            runner.run()

            task_stderr_path = runner._task_stderr_path('mapper', 0, 0)

            # expect python -v crud in stderr
            with open(task_stderr_path) as f:
                self.assertTrue(any(
                    'import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in f))

            with open(task_stderr_path) as f:
                self.assertTrue(any('#' in line for line in f))

            # should still get expected results
            self.assertEqual(sorted(to_lines(runner.cat_output())),
                             sorted([b'1\tnull\n', b'1\t"bar"\n']))
def test_large_amounts_of_stderr(self):
    """Huge counter/status output shouldn't prevent a clean failure with
    the exception message intact."""
    mr_job = MRVerboseJob(['--no-conf', '-r', 'local'])
    mr_job.sandbox()

    try:
        with no_handlers_for_logger():
            mr_job.run_job()
    except TimeoutException:
        raise
    except Exception as e:
        # we expect the job to throw an exception

        # look for expected output from MRVerboseJob
        stderr = mr_job.stderr.getvalue()
        for snippet in (b"Counters from step 1:\n\tFoo\n\t\tBar=10000",
                        b'status: 0\n',
                        b'status: 99\n',
                        b'STDERR: Qux\n'):
            self.assertIn(snippet, stderr)
        self.assertNotIn(b'status: 100\n', stderr)
        # exception should appear in exception message
        self.assertIn('BOOM', repr(e))
    else:
        raise AssertionError()
def test_large_amounts_of_stderr(self):
    """The job must exit via sys.exit() despite very large counter/status
    output, and the interesting bits must be present in stderr."""
    mr_job = MRVerboseJob(['--no-conf', '-r', 'local', '-v'])
    mr_job.sandbox()

    try:
        with no_handlers_for_logger():
            mr_job.run_job()
    except TimeoutException:
        raise
    except SystemExit:
        # we expect the job to throw a StepFailedException,
        # which causes run_job to call sys.exit()

        # look for expected output from MRVerboseJob
        job_stderr = mr_job.stderr.getvalue()
        for expected in (b"Counters: 1\n\tFoo\n\t\tBar=10000",
                         b'Status: 0\n',
                         b'Status: 99\n',
                         b'STDERR: Qux\n',
                         # exception should appear in exception message
                         b'BOOM'):
            self.assertIn(expected, job_stderr)
        self.assertNotIn(b'Status: 100\n', job_stderr)
    else:
        raise AssertionError()
def test_other_environment_variable(self):
    """An unrecognized environment variable shouldn't change the lookup;
    get_hadoop_bin() still falls back to plain 'hadoop'."""
    self._add_hadoop_bin_for_envvar('HADOOP_YARN_MRJOB_DIR', 'bin')

    with no_handlers_for_logger('mrjob.fs.hadoop'):
        self.assertEqual(self.fs.get_hadoop_bin(), ['hadoop'])
def test_local_runner(self):
    """-r local produces a LocalMRJobRunner."""
    launcher = MRJobLauncher(args=['--no-conf', '-r', 'local', ''])
    with no_handlers_for_logger('mrjob.runner'):
        with launcher.make_runner() as runner:
            self.assertIsInstance(runner, LocalMRJobRunner)
def test_emr_runner(self):
    """-r emr produces an EMRJobRunner (S3 access patched out)."""
    launcher = MRJobLauncher(args=['--no-conf', '-r', 'emr', ''])
    with no_handlers_for_logger('mrjob'):
        with patch_fs_s3():
            with launcher.make_runner() as runner:
                self.assertIsInstance(runner, EMRJobRunner)