def test_failed_job(self):
    mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    mr_job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        stderr = StringIO()
        log_to_stream('mrjob.dataproc', stderr)

        self._dataproc_client.job_get_advances_states = (
            collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

        with mr_job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            self.assertIn(' => ERROR\n', stderr.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated
    cluster = (
        self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
    cluster_state = self._dataproc_client.get_state(cluster)
    self.assertEqual(cluster_state, 'DELETING')

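# Nearly every test in this file uses the same capture pattern: detach any
# existing handlers from a logger, attach a handler that writes to a
# StringIO, exercise the code under test, then assert on the captured text.
# Below is a minimal, self-contained sketch of that pattern. The real
# no_handlers_for_logger / log_to_stream helpers these tests import live in
# mrjob's codebase; the signatures and bodies here are illustrative
# assumptions, not mrjob's actual implementation.
import logging
from contextlib import contextmanager
from io import StringIO


@contextmanager
def _no_handlers_for_logger(name=''):
    """Temporarily strip all handlers from the named logger (sketch)."""
    log = logging.getLogger(name)
    old_handlers = log.handlers[:]
    log.handlers = []
    try:
        yield
    finally:
        log.handlers = old_handlers


def _log_to_stream(name, stream, level=logging.INFO, debug=False):
    """Attach a StreamHandler writing to *stream* (sketch)."""
    if debug:
        level = logging.DEBUG
    log = logging.getLogger(name)
    handler = logging.StreamHandler(stream)
    handler.setLevel(level)
    log.addHandler(handler)
    log.setLevel(level)


# usage, mirroring the tests in this file:
with _no_handlers_for_logger('mrjob.example'):
    _stderr = StringIO()
    _log_to_stream('mrjob.example', _stderr)
    logging.getLogger('mrjob.example').info('hello')
    assert 'hello' in _stderr.getvalue()
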
def test_hadoop_runner_option_store(self):
    stderr = StringIO()

    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', stderr)

        # HadoopRunnerOptionStore really wants to find the streaming jar
        with patch.object(mrjob.hadoop, 'find_hadoop_streaming_jar',
                          return_value='found'):
            opts = HadoopRunnerOptionStore(
                'hadoop',
                dict(base_tmp_dir='/scratch',
                     hadoop_home='required',
                     hdfs_scratch_dir='hdfs:///scratch'),
                [])

    self.assertEqual(opts['local_tmp_dir'], '/scratch')
    self.assertNotIn('base_tmp_dir', opts)
    self.assertIn(
        'Deprecated option base_tmp_dir has been renamed'
        ' to local_tmp_dir', stderr.getvalue())

    self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
    self.assertNotIn('hdfs_scratch_dir', opts)
    self.assertIn(
        'Deprecated option hdfs_scratch_dir has been renamed'
        ' to hadoop_tmp_dir', stderr.getvalue())

def test_cleanup_options(self):
    stderr = StringIO()

    with no_handlers_for_logger('mrjob.runner'):
        log_to_stream('mrjob.runner', stderr)

        opts = RunnerOptionStore(
            'inline',
            dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                 cleanup_on_failure=['JOB_FLOW', 'SCRATCH']),
            [])

    self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
    self.assertIn(
        'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
        ' to LOCAL_TMP', stderr.getvalue())
    self.assertIn(
        'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
        ' to CLOUD_TMP', stderr.getvalue())

    self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])
    self.assertIn(
        'Deprecated cleanup_on_failure option JOB_FLOW has been'
        ' renamed to CLUSTER', stderr.getvalue())
    self.assertIn(
        'Deprecated cleanup_on_failure option SCRATCH has been renamed'
        ' to TMP', stderr.getvalue())

def test_non_log_lines(self):
    lines = StringIO('foo\n'
                     'bar\n'
                     '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                     ' Error Launching job :'
                     ' Output directory already exists\n'
                     'Streaming Command Failed!')

    with no_handlers_for_logger('mrjob.logs.parse'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.parse', stderr)

        self.assertEqual(
            list(_parse_hadoop_log_lines(lines)), [
                # ignore leading non-log lines
                dict(
                    timestamp='15/12/11 13:26:08',
                    level='ERROR',
                    logger='streaming.StreamJob',
                    thread=None,
                    # no way to know that Streaming Command Failed!
                    # wasn't part of a multi-line message
                    message=('Error Launching job :'
                             ' Output directory already exists\n'
                             'Streaming Command Failed!'))
            ])

        # should be one warning for each leading non-log line
        log_lines = stderr.getvalue().splitlines()
        self.assertEqual(len(log_lines), 2)

def assert_hadoop_version(self, JobClass, version_string):
    mr_job = JobClass()
    mock_log = StringIO()

    with no_handlers_for_logger('mrjob.job'):
        log_to_stream('mrjob.job', mock_log)

        self.assertEqual(mr_job.jobconf()['hadoop_version'],
                         version_string)
        self.assertIn('should be a string', mock_log.getvalue())

def test_messy_error(self):
    counter_string = (b'Job JOBID="_001" FAILED_REDUCES="0"'
                      b' COUNTERS="THIS IS NOT ACTUALLY A COUNTER"')

    with no_handlers_for_logger(''):
        stderr = StringIO()
        log_to_stream('mrjob.parse', stderr, level=logging.WARN)

        self.assertEqual(
            ({}, 1),
            parse_hadoop_counters_from_line(counter_string))
        self.assertIn('Cannot parse Hadoop counter string',
                      stderr.getvalue())

def updated_and_warnings(self, jobconf, hadoop_version):
    jobconf = jobconf.copy()

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)
        self.runner._update_jobconf_for_hadoop_version(
            jobconf, hadoop_version)

    return jobconf, stderr.getvalue()

def test_option_debug_printout(self):
    stderr = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', stderr, debug=True)

        InlineMRJobRunner(owner='dave')

    self.assertIn("'owner'", stderr.getvalue())
    self.assertIn("'dave'", stderr.getvalue())

def get_debug_printout(self, opt_store_class, alias, opts):
    stderr = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', stderr, debug=True)

        # debug printout happens in constructor
        opt_store_class(alias, opts, [])

    return stderr.getvalue()

def test_empty_runner_error(self):
    conf = dict(runner=dict(local=dict(local_tmp_dir='/tmp')))
    path = self.save_conf('basic', conf)

    stderr = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.runner', stderr)
        RunnerOptionStore('inline', {}, [path])

        self.assertEqual(
            "No configs specified for inline runner\n",
            stderr.getvalue())

def test_runner_option_store(self):
    stderr = StringIO()

    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', stderr)

        opts = RunnerOptionStore(
            'inline', dict(base_tmp_dir='/scratch'), [])

    self.assertEqual(opts['local_tmp_dir'], '/scratch')
    self.assertNotIn('base_tmp_dir', opts)
    self.assertIn(
        'Deprecated option base_tmp_dir has been renamed'
        ' to local_tmp_dir', stderr.getvalue())

def test_indentation_is_required(self):
    lines = [
        'File System Counters',
        ' FILE: Number of bytes read=8',
    ]

    with no_handlers_for_logger('mrjob.logs.parse'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.parse', stderr)

        # counter line is interpreted as group
        self.assertEqual(_parse_indented_counters(lines), {})

        # should complain
        self.assertNotEqual(stderr.getvalue(), '')

def test_attrs_should_be_classes(self):
    with no_handlers_for_logger('mrjob.job'):
        stderr = StringIO()
        log_to_stream('mrjob.job', stderr)

        job = self.StrangeJob()

        self.assertIsInstance(job.input_protocol(), JSONProtocol)
        self.assertIsInstance(job.internal_protocol(), JSONProtocol)
        self.assertIsInstance(job.output_protocol(), JSONProtocol)

        logs = stderr.getvalue()
        self.assertIn('INPUT_PROTOCOL should be a class', logs)
        self.assertIn('INTERNAL_PROTOCOL should be a class', logs)
        self.assertIn('OUTPUT_PROTOCOL should be a class', logs)

def test_recurse(self):
    path = os.path.join(self.tmp_dir, 'LOL.conf')
    recurse_conf = dict(include=path)
    with open(path, 'w') as f:
        dump_mrjob_conf(recurse_conf, f)

    stderr = StringIO()

    with no_handlers_for_logger():
        log_to_stream('mrjob.conf', stderr)
        RunnerOptionStore('inline', {}, [path])

        self.assertIn('%s tries to recursively include %s!' % (path, path),
                      stderr.getvalue())

def _test_recoverable_error(self, ex):
    self.mock_paths = ['/path/to/logs/oak', ex]

    with no_handlers_for_logger('mrjob.logs.wrap'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.wrap', stderr)

        self.assertEqual(self._ls_logs([['/path/to/logs']]),
                         [dict(path='/path/to/logs/oak')])

        self.mock_fs.ls.assert_called_once_with('/path/to/logs')

        self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())

def test_with_header(self):
    lines = [
        'Counters: 1',
        '        File System Counters',
        '                FILE: Number of bytes read=86',
    ]

    with no_handlers_for_logger('mrjob.logs.parse'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.parse', stderr)

        self.assertEqual(_parse_indented_counters(lines), {
            'File System Counters': {
                'FILE: Number of bytes read': 86,
            },
        })

        # header shouldn't freak it out
        self.assertEqual(stderr.getvalue(), '')

def test_deprecated_alias(self):
    with no_handlers_for_logger('mrjob.util'):
        stderr = StringIO()
        log_to_stream('mrjob.util', stderr)

        self.assertEqual(
            list(buffer_iterator_to_line_iterator(chunk for chunk in
                 [b'The quick\nbrown fox\njumped over\nthe lazy\ndogs.\n'])),
            [b'The quick\n', b'brown fox\n', b'jumped over\n',
             b'the lazy\n', b'dogs.\n'])

        self.assertIn('has been renamed', stderr.getvalue())

def test_io_error(self):
    self.mock_paths = [
        IOError(),
    ]

    with no_handlers_for_logger('mrjob.logs.ls'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.ls', stderr)

        self.assertEqual(list(_ls_logs(self.mock_fs, '/path/to/logs')), [])

        self.mock_fs.ls.assert_called_once_with('/path/to/logs')

        self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())

def test_deprecated_alias(self):
    with no_handlers_for_logger('mrjob.util'):
        stderr = StringIO()
        log_to_stream('mrjob.util', stderr)

        self.assertEqual(
            list(buffer_iterator_to_line_iterator(chunk for chunk in
                 [b'The quick\nbrown fox\nju',
                  b'mped over\nthe lazy\ndog',
                  b's.\n'])),
            [b'The quick\n', b'brown fox\n', b'jumped over\n',
             b'the lazy\n', b'dogs.\n'])

        self.assertIn('has been renamed', stderr.getvalue())

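# buffer_iterator_to_line_iterator, exercised by the two tests above, is a
# deprecated alias (the tests only assert that a 'has been renamed' warning
# is emitted); its job is to regroup arbitrary byte chunks into complete
# newline-terminated lines. A minimal sketch of that regrouping, written
# from the behavior the assertions above describe rather than from mrjob's
# source:
def _chunks_to_lines(chunks):
    """Yield b'\\n'-terminated lines from an iterable of byte chunks."""
    buf = b''
    for chunk in chunks:
        buf += chunk
        while b'\n' in buf:
            line, _, buf = buf.partition(b'\n')
            yield line + b'\n'
    if buf:
        yield buf  # trailing partial line, if any


# e.g. list(_chunks_to_lines([b'The quick\nbrown fox\nju', b'mped over\n']))
# -> [b'The quick\n', b'brown fox\n', b'jumped over\n']
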
def test_dry_run(self):
    stdout = StringIO()

    self.maybe_terminate_quietly(
        stdout=stdout, max_mins_idle=0.6, dry_run=True)

    # dry_run doesn't actually try to lock
    expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [
        'Terminated cluster j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
        ' was idle for 2:00:00']

    self.assertEqual(set(stdout.getvalue().splitlines()),
                     set(expected_stdout_lines))

    # shouldn't *actually* terminate clusters
    self.assertEqual(self.ids_of_terminated_clusters(), [])

def test_prefer_own_methods(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(hadoop_bin='hadoop',
                             hadoop_home='kansas',
                             hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner.ls, runner.fs.ls)

        # Hadoop runner has its own version
        self.assertNotEqual(runner.get_hadoop_version,
                            runner.fs.get_hadoop_version)

        self.assertIn('deprecated: call HadoopJobRunner.fs.ls() directly',
                      stderr.getvalue())
        self.assertNotIn('get_hadoop_version', stderr.getvalue())

def test_indentation_is_required(self):
    lines = [
        'File System Counters',
        ' FILE: Number of bytes read=8',
    ]

    with no_handlers_for_logger('mrjob.logs.step'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.step', stderr)

        # counter line is interpreted as group
        self.assertEqual(_parse_indented_counters(lines), {})

        # should complain
        self.assertNotEqual(stderr.getvalue(), '')

def test_dry_run(self):
    stdout = StringIO()

    self.maybe_terminate_quietly(
        stdout=stdout, max_hours_idle=0.01, dry_run=True)

    # dry_run doesn't actually try to lock
    expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [
        'Terminated job flow j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
        ' was idle for 2:00:00, 1:00:00 to end of hour']

    self.assertEqual(set(stdout.getvalue().splitlines()),
                     set(expected_stdout_lines))

    # shouldn't *actually* terminate clusters
    self.assertEqual(self.ids_of_terminated_clusters(), [])

def test_warn_on_io_error(self):
    self.mock_paths = [
        '/path/to/logs/oak',
        IOError(),
    ]

    with no_handlers_for_logger('mrjob.logs.wrap'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.wrap', stderr)

        self.assertEqual(self._ls_logs([['/path/to/logs']]),
                         [dict(path='/path/to/logs/oak')])

        self.mock_fs.ls.assert_called_once_with('/path/to/logs')

        self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())

class ReportLongJobsTestCase(MockBotoTestCase):

    def setUp(self):
        super(ReportLongJobsTestCase, self).setUp()
        # redirect print statements to self.stdout
        self._real_stdout = sys.stdout
        self.stdout = StringIO()
        sys.stdout = self.stdout

    def tearDown(self):
        sys.stdout = self._real_stdout
        super(ReportLongJobsTestCase, self).tearDown()

    def test_with_no_clusters(self):
        main(['-q', '--no-conf'])  # just make sure it doesn't crash

    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_conn = self.connect_emr()
        emr_conn.run_jobflow('no name',
                             job_flow_role='fake-instance-profile',
                             service_role='fake-service-role')

        main(['-q', '--no-conf'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)

def test_with_all_job_flows(self):
    self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)

    emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
    emr_conn.run_jobflow('no name', log_uri=None)

    main(['-q', '--no-conf'])

    lines = [line for line in StringIO(self.stdout.getvalue())]
    self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)

def test_its_not_very_quiet(self):
    stdout = StringIO()
    self.inspect_and_maybe_terminate_quietly(
        stdout=stdout, max_hours_idle=0.01)

    output = """Terminated job flow j-POOLED (Pooled Job Flow); was idle for 0:50:00, 0:05:00 to end of hour
Terminated job flow j-PENDING_BUT_IDLE (Pending But Idle Job Flow); was pending for 2:50:00, 0:05:00 to end of hour
Terminated job flow j-DEBUG_ONLY (Debug Only Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-DONE_AND_IDLE (Done And Idle Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-IDLE_AND_EXPIRED (Idle And Expired Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-IDLE_AND_FAILED (Idle And Failed Job Flow); was idle for 3:00:00, 1:00:00 to end of hour
Terminated job flow j-HADOOP_DEBUGGING (Hadoop Debugging Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-EMPTY (Empty Job Flow); was idle for 10:00:00, 1:00:00 to end of hour
"""

    self.assertEqual(
        sorted(stdout.getvalue().splitlines()),
        sorted(output.splitlines()))

def test_verbose(self):
    with patch.object(sys, 'stderr', StringIO()) as stderr:
        MRJob.set_up_logging(verbose=True)

        log = logging.getLogger('__main__')
        log.info('INFO')
        log.debug('DEBUG')
        self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')

def test_log_lines(self):
    lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                     ' Connecting to ResourceManager at /0.0.0.0:8032\n'
                     '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                     ' Error Launching job :'
                     ' Output directory already exists\n')

    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                level='INFO',
                logger='client.RMProxy',
                message='Connecting to ResourceManager at /0.0.0.0:8032',
                num_lines=1,
                start_line=0,
                thread='',
                timestamp='15/12/11 13:26:07',
            ),
            dict(
                level='ERROR',
                logger='streaming.StreamJob',
                message=('Error Launching job :'
                         ' Output directory already exists'),
                num_lines=1,
                start_line=1,
                thread='',
                timestamp='15/12/11 13:26:08',
            ),
        ])

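# The log4j records asserted above all share one line shape: timestamp,
# level, logger, optional [thread], colon, message. A rough single-line
# regex sketch of that shape follows; it is not mrjob's
# _parse_hadoop_log4j_records, which additionally groups multi-line
# messages and tracks start_line/num_lines:
import re

_LOG4J_LINE_RE = re.compile(
    r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})'
    r'\s+(?P<level>[A-Z]+)'
    r'\s+(?P<logger>\S+)'
    r'(?: \[(?P<thread>[^\]]+)\])?'
    r': (?P<message>.*)$')


def _match_log4j_line(line):
    """Return a dict of the fields above, or None if it's not a log line."""
    m = _LOG4J_LINE_RE.match(line.rstrip('\n'))
    return m.groupdict() if m else None


# e.g. _match_log4j_line('15/12/11 13:26:07 INFO client.RMProxy: Connecting')
# -> {'timestamp': '15/12/11 13:26:07', 'level': 'INFO',
#     'logger': 'client.RMProxy', 'thread': None, 'message': 'Connecting'}
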
def test_yarn_output(self):
    # abbreviated version of real output from Hadoop 2.7.0.
    # Including things that might be interesting to parse later on
    lines = StringIO(
        '15/12/11 13:32:44 INFO client.RMProxy:'
        ' Connecting to ResourceManager at /0.0.0.0:8032\n'
        '15/12/11 13:32:45 INFO mapreduce.JobSubmitter:'
        ' Submitting tokens for job: job_1449857544442_0002\n'
        '15/12/11 13:32:45 INFO impl.YarnClientImpl:'
        ' Submitted application application_1449857544442_0002\n'
        '15/12/11 13:32:45 INFO mapreduce.Job:'
        ' The url to track the job:'
        ' http://0a7802e19139:8088/proxy/application_1449857544442_0002/\n'
        '15/12/11 13:33:11 INFO mapreduce.Job: map 100% reduce 100%\n'
        '15/12/11 13:33:11 INFO mapreduce.Job:'
        ' Job job_1449857544442_0002 completed successfully\n'
        '15/12/11 13:33:11 INFO mapreduce.Job: Counters: 49\n'
        '        File System Counters\n'
        '                FILE: Number of bytes read=86\n'
        '15/12/11 13:33:11 INFO streaming.StreamJob:'
        ' Output directory:'
        ' hdfs:///user/root/tmp/mrjob/mr_wc.root.20151211.181326.984074'
        '/output\n')

    self.assertEqual(
        _parse_hadoop_streaming_log(lines),
        dict(application_id='application_1449857544442_0002',
             counters={
                 'File System Counters': {
                     'FILE: Number of bytes read': 86,
                 }
             },
             job_id='job_1449857544442_0002',
             output_dir=('hdfs:///user/root/tmp/mrjob'
                         '/mr_wc.root.20151211.181326.984074/output')))

def test_pre_yarn_output(self):
    # actual output from Hadoop 1.0.3 on EMR AMI 2.4.9
    # Including things that might be interesting to parse later on
    lines = StringIO(
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' getLocalDirs(): [/mnt/var/lib/hadoop/mapred]\n'
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' Running job: job_201512112247_0003\n'
        '15/12/11 23:08:37 INFO streaming.StreamJob:'
        ' Tracking URL:'
        ' http://ip-172-31-27-129.us-west-2.compute.internal:9100'
        '/jobdetails.jsp?jobid=job_201512112247_0003\n'
        '15/12/11 23:09:16 INFO streaming.StreamJob:'
        ' map 100% reduce 100%\n'
        '15/12/11 23:09:22 INFO streaming.StreamJob:'
        ' Output: hdfs:///user/hadoop/tmp/mrjob'
        '/mr_wc.hadoop.20151211.230352.433691/output\n')

    self.assertEqual(
        _parse_hadoop_streaming_log(lines),
        dict(application_id=None,
             counters=None,
             job_id='job_201512112247_0003',
             output_dir=('hdfs:///user/hadoop/tmp/mrjob'
                         '/mr_wc.hadoop.20151211.230352.433691/output')))

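# test_yarn_output and test_pre_yarn_output pin down the three fields that
# _parse_hadoop_streaming_log pulls out of streaming output: the job ID,
# the YARN application ID (absent pre-YARN), and the output directory. A
# bare-bones scan for those fields, reconstructed from the test data alone
# (counter parsing omitted); this is a sketch, not mrjob's implementation:
import re

_JOB_ID_RE = re.compile(r'\b(job_\d+_\d+)\b')
_APPLICATION_ID_RE = re.compile(r'\b(application_\d+_\d+)\b')
_OUTPUT_DIR_RE = re.compile(r'Output(?: directory)?: (\S+)')


def _scan_streaming_log(lines):
    """Return dict(application_id=..., job_id=..., output_dir=...)."""
    result = dict(application_id=None, job_id=None, output_dir=None)
    for line in lines:
        for key, pattern in [('job_id', _JOB_ID_RE),
                             ('application_id', _APPLICATION_ID_RE),
                             ('output_dir', _OUTPUT_DIR_RE)]:
            m = pattern.search(line)
            if m and result[key] is None:
                result[key] = m.group(1)  # keep the first occurrence
    return result
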
class ReportLongJobsTestCase(MockBoto3TestCase):

    def setUp(self):
        super(ReportLongJobsTestCase, self).setUp()
        # redirect print statements to self.stdout
        self._real_stdout = sys.stdout
        self.stdout = StringIO()
        sys.stdout = self.stdout

    def tearDown(self):
        sys.stdout = self._real_stdout
        super(ReportLongJobsTestCase, self).tearDown()

    def test_with_no_clusters(self):
        main(['-q', '--no-conf'])  # just make sure it doesn't crash

    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_client = self.client('emr')
        emr_client.run_job_flow(
            Name='no name',
            Instances=dict(
                MasterInstanceType='m1.medium',
                InstanceCount=1,
            ),
            JobFlowRole='fake-instance-profile',
            ReleaseLabel='emr-4.0.0',
            ServiceRole='fake-service-role',
        )

        main(['-q', '--no-conf'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)
        self.assertNotIn('j-COMPLETED', self.stdout.getvalue())

    def test_exclude(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        main(['-q', '--no-conf', '-x', 'my_key,my_value'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 2)
        self.assertNotIn('j-COMPLETED', self.stdout.getvalue())
        self.assertNotIn('j-RUNNING1STEP', self.stdout.getvalue())

def test_passthrough(self):
    runner = InlineMRJobRunner()

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner.ls, runner.fs.ls)

        # no special rules for underscore methods
        self.assertEqual(runner._cat_file, runner.fs._cat_file)

        self.assertIn(
            'deprecated: call InlineMRJobRunner.fs.ls() directly',
            stderr.getvalue())
        self.assertIn(
            'deprecated: call InlineMRJobRunner.fs._cat_file() directly',
            stderr.getvalue())

def test_pass_through_fields(self):
    # TODO: currently can't initialize HadoopRunner without setting these
    runner = HadoopJobRunner(
        hadoop_bin='hadoooooooooop',
        hadoop_home='kansas',
        hadoop_streaming_jar='streaming.jar')

    with no_handlers_for_logger('mrjob.runner'):
        stderr = StringIO()
        log_to_stream('mrjob.runner', stderr)

        self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

        # deprecation warning is different for non-functions
        self.assertIn(
            'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
            stderr.getvalue())

def test_default_options(self):
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging()

            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\n')

def test_its_not_very_quiet(self):
    stdout = StringIO()

    self.maybe_terminate_quietly(
        stdout=stdout, max_hours_idle=0.01)

    self.assertEqual(set(stdout.getvalue().splitlines()),
                     set(self.EXPECTED_STDOUT_LINES))

    # should have actually terminated clusters
    self.assertEqual(self.ids_of_terminated_clusters(), [
        'j-DEBUG_ONLY',
        'j-DONE_AND_IDLE',
        'j-HADOOP_DEBUGGING',
        'j-IDLE_AND_EXPIRED',
        'j-IDLE_AND_FAILED',
        'j-PENDING_BUT_IDLE',
        'j-POOLED',
    ])

def test_its_not_very_quiet(self):
    stdout = StringIO()

    self.maybe_terminate_quietly(stdout=stdout, max_hours_idle=0.01)

    self.assertEqual(set(stdout.getvalue().splitlines()),
                     set(self.EXPECTED_STDOUT_LINES))

    # should have actually terminated clusters
    self.assertEqual(self.ids_of_terminated_clusters(), [
        'j-DEBUG_ONLY',
        'j-DONE_AND_IDLE',
        'j-DONE_AND_IDLE_4_X',
        'j-HADOOP_DEBUGGING',
        'j-IDLE_AND_EXPIRED',
        'j-IDLE_AND_FAILED',
        'j-PENDING_BUT_IDLE',
        'j-POOLED',
    ])

def test_with_header(self):
    lines = [
        'Counters: 1',
        '        File System Counters',
        '                FILE: Number of bytes read=86',
    ]

    with no_handlers_for_logger('mrjob.logs.step'):
        stderr = StringIO()
        log_to_stream('mrjob.logs.step', stderr)

        self.assertEqual(_parse_indented_counters(lines), {
            'File System Counters': {
                'FILE: Number of bytes read': 86,
            },
        })

        # header shouldn't freak it out
        self.assertEqual(stderr.getvalue(), '')

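# Taken together, test_with_header and test_indentation_is_required pin
# down the indentation rules for counter output: group lines must be
# indented, counter lines must be indented deeper than their group, and an
# unindented header like 'Counters: 1' is skipped. An illustrative
# reimplementation of those rules (mrjob's real _parse_indented_counters
# also logs a warning for lines it can't place):
def _parse_indented_counters_sketch(lines):
    counters = {}
    group = None
    group_indent = None

    for line in lines:
        indent = len(line) - len(line.lstrip(' '))
        stripped = line.strip()

        if indent == 0:
            continue  # unindented lines (e.g. 'Counters: 1') aren't groups

        if '=' in stripped and group is not None and indent > group_indent:
            # counter line, e.g. 'FILE: Number of bytes read=86'
            name, _, amount = stripped.rpartition('=')
            counters.setdefault(group, {})[name] = int(amount)
        else:
            # anything else at this depth starts a new group
            group, group_indent = stripped, indent

    return counters
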
def test_wrapper_script_only_writes_to_stderr(self):
    job = MROSWalkJob([
        '-r', 'local',
        '--setup', 'echo stray output',
    ])
    job.sandbox()

    with no_handlers_for_logger('mrjob.local'):
        stderr = StringIO()
        log_to_stream('mrjob.local', stderr, debug=True)

        with job.make_runner() as r:
            r.run()

            output = b''.join(r.stream_output())

            # stray output should be in stderr, not the job's output
            self.assertIn('stray output', stderr.getvalue())
            self.assertNotIn(b'stray output', output)

def test_emr_runner_option_store(self):
    stderr = StringIO()

    with no_handlers_for_logger('mrjob.conf'):
        log_to_stream('mrjob.conf', stderr)

        opts = EMRRunnerOptionStore(
            'emr',
            dict(base_tmp_dir='/scratch',
                 s3_scratch_uri='s3://bucket/walrus'),
            [])

    self.assertEqual(opts['local_tmp_dir'], '/scratch')
    self.assertNotIn('base_tmp_dir', opts)
    self.assertIn(
        'Deprecated option base_tmp_dir has been renamed'
        ' to local_tmp_dir', stderr.getvalue())

    self.assertEqual(opts['s3_tmp_dir'], 's3://bucket/walrus')
    self.assertNotIn('s3_scratch_uri', opts)
    self.assertIn(
        'Deprecated option s3_scratch_uri has been renamed'
        ' to s3_tmp_dir', stderr.getvalue())
