Example #1
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self._dataproc_client.job_get_advances_states = (
                collections.deque(['SETUP_DONE', 'RUNNING', 'ERROR']))

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
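
All of the examples in this section share one pattern: temporarily strip a
module logger's handlers, attach a handler that writes to a StringIO, run the
code under test, and assert on the captured output. As a rough sketch of the
behavior these tests assume (the real helpers ship with mrjob and its test
suite; exact signatures and module locations may differ):

    import logging
    from contextlib import contextmanager
    from io import StringIO

    def log_to_stream(name, stream, level=None, debug=False):
        # attach a handler that copies the named logger's output to `stream`
        log = logging.getLogger(name)
        log.addHandler(logging.StreamHandler(stream))
        log.setLevel(level if level is not None
                     else logging.DEBUG if debug else logging.INFO)

    @contextmanager
    def no_handlers_for_logger(*names):
        # temporarily remove existing handlers (root logger if no name is
        # given) so only the handler installed above sees the output
        saved = {}
        for name in (names or ('',)):
            logger = logging.getLogger(name)
            saved[name] = logger.handlers
            logger.handlers = []
        try:
            yield
        finally:
            for name, handlers in saved.items():
                logging.getLogger(name).handlers = handlers
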
Example #2
    def test_failed_job(self):
        mr_job = MRTwoStepJob(['-r', 'dataproc', '-v'])
        mr_job.sandbox()

        with no_handlers_for_logger('mrjob.dataproc'):
            stderr = StringIO()
            log_to_stream('mrjob.dataproc', stderr)

            self._dataproc_client.job_get_advances_states = (collections.deque(
                ['SETUP_DONE', 'RUNNING', 'ERROR']))

            with mr_job.make_runner() as runner:
                self.assertIsInstance(runner, DataprocJobRunner)

                self.assertRaises(StepFailedException, runner.run)

                self.assertIn(' => ERROR\n', stderr.getvalue())

                cluster_id = runner.get_cluster_id()

        # job should get terminated
        cluster = (
            self._dataproc_client._cache_clusters[_TEST_PROJECT][cluster_id])
        cluster_state = self._dataproc_client.get_state(cluster)
        self.assertEqual(cluster_state, 'DELETING')
Example #3
    def assert_hadoop_version(self, JobClass, version_string):
        mr_job = JobClass()
        mock_log = StringIO()
        with no_handlers_for_logger("mrjob.job"):
            log_to_stream("mrjob.job", mock_log)
            self.assertEqual(mr_job.jobconf()["hadoop_version"], version_string)
            self.assertIn("should be a string", mock_log.getvalue())
Example #4
    def test_hadoop_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            # HadoopRunnerOptionStore really wants to find the streaming jar
            with patch.object(mrjob.hadoop,
                              'find_hadoop_streaming_jar',
                              return_value='found'):
                opts = HadoopRunnerOptionStore(
                    'hadoop',
                    dict(base_tmp_dir='/scratch',
                         hadoop_home='required',
                         hdfs_scratch_dir='hdfs:///scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn(
                'Deprecated option base_tmp_dir has been renamed'
                ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['hadoop_tmp_dir'], 'hdfs:///scratch')
            self.assertNotIn('hdfs_scratch_dir', opts)
            self.assertIn(
                'Deprecated option hdfs_scratch_dir has been renamed'
                ' to hadoop_tmp_dir', stderr.getvalue())
Example #5
    def test_cleanup_options(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.runner'):
            log_to_stream('mrjob.runner', stderr)
            opts = RunnerOptionStore(
                'inline',
                dict(cleanup=['LOCAL_SCRATCH', 'REMOTE_SCRATCH'],
                     cleanup_on_failure=['JOB_FLOW', 'SCRATCH']),
                [])

            self.assertEqual(opts['cleanup'], ['LOCAL_TMP', 'CLOUD_TMP'])
            self.assertIn(
                'Deprecated cleanup option LOCAL_SCRATCH has been renamed'
                ' to LOCAL_TMP', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup option REMOTE_SCRATCH has been renamed'
                ' to CLOUD_TMP', stderr.getvalue())

            self.assertEqual(opts['cleanup_on_failure'], ['CLUSTER', 'TMP'])
            self.assertIn(
                'Deprecated cleanup_on_failure option JOB_FLOW has been'
                ' renamed to CLUSTER', stderr.getvalue())
            self.assertIn(
                'Deprecated cleanup_on_failure option SCRATCH has been renamed'
                ' to TMP', stderr.getvalue())
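
The renaming asserted in Examples #4 and #5 comes from a single mechanism: the
option store consults a table of deprecated-to-current option names and logs
one warning per hit. A hypothetical sketch of that mechanism (mrjob's real
RunnerOptionStore does this internally; the names below are illustrative):

    # illustrative mapping, reconstructed from the warnings the tests expect
    DEPRECATED_OPT_NAMES = {
        'base_tmp_dir': 'local_tmp_dir',
        'hdfs_scratch_dir': 'hadoop_tmp_dir',
        's3_scratch_uri': 's3_tmp_dir',
    }

    def fix_deprecated_opts(opts, log):
        # return a copy of `opts` with deprecated names replaced, logging
        # one warning per renamed option
        fixed = {}
        for name, value in opts.items():
            new_name = DEPRECATED_OPT_NAMES.get(name)
            if new_name:
                log.warning('Deprecated option %s has been renamed to %s' %
                            (name, new_name))
                fixed[new_name] = value
            else:
                fixed[name] = value
        return fixed
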
Example #6
    def test_non_log_lines(self):
        lines = StringIO('foo\n'
                         'bar\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!')

        with no_handlers_for_logger('mrjob.logs.parse'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.parse', stderr)

            self.assertEqual(
                list(_parse_hadoop_log_lines(lines)), [
                    # ignore leading non-log lines
                    dict(
                        timestamp='15/12/11 13:26:08',
                        level='ERROR',
                        logger='streaming.StreamJob',
                        thread=None,
                        # no way to know that Streaming Command Failed!
                        # wasn't part of a multi-line message
                        message=('Error Launching job :'
                                 ' Output directory already exists\n'
                                 'Streaming Command Failed!'))
                ])

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
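
For context on what _parse_hadoop_log_lines() is matching: each record comes
from a log4j line of the form "<timestamp> <LEVEL> <logger>: <message>", and
non-matching lines are either warned about (when leading) or folded into the
previous record's message. A hypothetical regex that matches the fields
asserted above (mrjob's real pattern is more permissive, e.g. about thread
names):

    import re

    # illustrative sketch; group names mirror the dicts the tests expect
    _LOG_LINE_RE = re.compile(
        r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})'
        r'\s+(?P<level>[A-Z]+)'
        r'\s+(?P<logger>\S+)'
        r'(?: \((?P<thread>.*?)\))?'
        r': (?P<message>.*)$')

    m = _LOG_LINE_RE.match(
        '15/12/11 13:26:08 ERROR streaming.StreamJob:'
        ' Error Launching job : Output directory already exists')
    assert m and m.group('level') == 'ERROR'
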
Example #7
    def test_non_log_lines(self):
        lines = StringIO('foo\n'
                         'bar\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!')

        with no_handlers_for_logger('mrjob.logs.parse'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.parse', stderr)

            self.assertEqual(
                list(_parse_hadoop_log_lines(lines)),
                [
                    # ignore leading non-log lines
                    dict(
                        timestamp='15/12/11 13:26:08',
                        level='ERROR',
                        logger='streaming.StreamJob',
                        thread=None,
                        # no way to know that Streaming Command Failed! wasn't part
                        # of a multi-line message
                        message=('Error Launching job :'
                                 ' Output directory already exists\n'
                                 'Streaming Command Failed!'))
                ])

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
Example #8
    def test_non_log_lines(self):
        lines = StringIO(
            "foo\n"
            "bar\n"
            "15/12/11 13:26:08 ERROR streaming.StreamJob:"
            " Error Launching job :"
            " Output directory already exists\n"
            "Streaming Command Failed!"
        )

        with no_handlers_for_logger("mrjob.logs.parse"):
            stderr = StringIO()
            log_to_stream("mrjob.logs.parse", stderr)

            self.assertEqual(
                list(_parse_hadoop_log_lines(lines)),
                [
                    # ignore leading non-log lines
                    dict(
                        timestamp="15/12/11 13:26:08",
                        level="ERROR",
                        logger="streaming.StreamJob",
                        thread=None,
                        # no way to know that Streaming Command Failed! wasn't part
                        # of a multi-line message
                        message=(
                            "Error Launching job :"
                            " Output directory already exists\n"
                            "Streaming Command Failed!"
                        ),
                    )
                ],
            )

            # should be one warning for each leading non-log line
            log_lines = stderr.getvalue().splitlines()
            self.assertEqual(len(log_lines), 2)
Example #9
    def assert_hadoop_version(self, JobClass, version_string):
        mr_job = JobClass()
        mock_log = StringIO()
        with no_handlers_for_logger('mrjob.job'):
            log_to_stream('mrjob.job', mock_log)
            self.assertEqual(mr_job.jobconf()['hadoop_version'],
                             version_string)
            self.assertIn('should be a string', mock_log.getvalue())
Example #10
    def updated_and_warnings(self, jobconf, hadoop_version):
        jobconf = jobconf.copy()
        with no_handlers_for_logger("mrjob.runner"):
            stderr = StringIO()
            log_to_stream("mrjob.runner", stderr)
            self.runner._update_jobconf_for_hadoop_version(jobconf, hadoop_version)

        return jobconf, stderr.getvalue()
Example #11
    def test_messy_error(self):
        counter_string = (b'Job JOBID="_001" FAILED_REDUCES="0"'
                          b' COUNTERS="THIS IS NOT ACTUALLY A COUNTER"')
        with no_handlers_for_logger(''):
            stderr = StringIO()
            log_to_stream('mrjob.parse', stderr, level=logging.WARN)
            self.assertEqual(({}, 1),
                             parse_hadoop_counters_from_line(counter_string))
            self.assertIn('Cannot parse Hadoop counter string',
                          stderr.getvalue())
Example #12
    def updated_and_warnings(self, jobconf, hadoop_version):
        jobconf = jobconf.copy()
        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)
            self.runner._update_jobconf_for_hadoop_version(
                jobconf, hadoop_version)

        return jobconf, stderr.getvalue()
Example #13
    def test_option_debug_printout(self):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            InlineMRJobRunner(owner='dave')

        self.assertIn("'owner'", stderr.getvalue())
        self.assertIn("'dave'", stderr.getvalue())
Example #14
    def get_debug_printout(self, opt_store_class, alias, opts):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            # debug printout happens in constructor
            opt_store_class(alias, opts, [])

        return stderr.getvalue()
Example #15
    def get_debug_printout(self, opt_store_class, alias, opts):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            # debug printout happens in constructor
            opt_store_class(alias, opts, [])

        return stderr.getvalue()
Example #16
    def test_empty_runner_error(self):
        conf = dict(runner=dict(local=dict(local_tmp_dir='/tmp')))
        path = self.save_conf('basic', conf)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertEqual("No configs specified for inline runner\n",
                             stderr.getvalue())
Example #17
    def test_option_debug_printout(self):
        stderr = StringIO()

        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr, debug=True)

            InlineMRJobRunner(owner='dave')

        self.assertIn("'owner'", stderr.getvalue())
        self.assertIn("'dave'", stderr.getvalue())
Example #18
    def test_empty_runner_error(self):
        conf = dict(runner=dict(local=dict(local_tmp_dir='/tmp')))
        path = self.save_conf('basic', conf)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.runner', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertEqual(
                "No configs specified for inline runner\n",
                stderr.getvalue())
Example #19
    def test_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)
            opts = RunnerOptionStore(
                'inline', dict(base_tmp_dir='/scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn('Deprecated option base_tmp_dir has been renamed'
                          ' to local_tmp_dir', stderr.getvalue())
Example #20
    def test_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)
            opts = RunnerOptionStore(
                'inline', dict(base_tmp_dir='/scratch'), [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn('Deprecated option base_tmp_dir has been renamed'
                          ' to local_tmp_dir', stderr.getvalue())
Example #21
    def test_indentation_is_required(self):
        lines = ["File System Counters", "   FILE: Number of bytes read=8"]

        with no_handlers_for_logger("mrjob.logs.parse"):
            stderr = StringIO()
            log_to_stream("mrjob.logs.parse", stderr)

            # counter line is interpreted as group
            self.assertEqual(_parse_indented_counters(lines), {})

            # should complain
            self.assertNotEqual(stderr.getvalue(), "")
Example #22
    def test_attrs_should_be_classes(self):
        with no_handlers_for_logger('mrjob.job'):
            stderr = StringIO()
            log_to_stream('mrjob.job', stderr)
            job = self.StrangeJob()
            self.assertIsInstance(job.input_protocol(), JSONProtocol)
            self.assertIsInstance(job.internal_protocol(), JSONProtocol)
            self.assertIsInstance(job.output_protocol(), JSONProtocol)
            logs = stderr.getvalue()
            self.assertIn('INPUT_PROTOCOL should be a class', logs)
            self.assertIn('INTERNAL_PROTOCOL should be a class', logs)
            self.assertIn('OUTPUT_PROTOCOL should be a class', logs)
Example #23
    def test_recurse(self):
        path = os.path.join(self.tmp_dir, 'LOL.conf')
        recurse_conf = dict(include=path)
        with open(path, 'w') as f:
            dump_mrjob_conf(recurse_conf, f)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertIn('%s tries to recursively include %s!' % (path, path),
                          stderr.getvalue())
Example #24
    def test_attrs_should_be_classes(self):
        with no_handlers_for_logger('mrjob.job'):
            stderr = StringIO()
            log_to_stream('mrjob.job', stderr)
            job = self.StrangeJob()
            self.assertIsInstance(job.input_protocol(), JSONProtocol)
            self.assertIsInstance(job.internal_protocol(), JSONProtocol)
            self.assertIsInstance(job.output_protocol(), JSONProtocol)
            logs = stderr.getvalue()
            self.assertIn('INPUT_PROTOCOL should be a class', logs)
            self.assertIn('INTERNAL_PROTOCOL should be a class', logs)
            self.assertIn('OUTPUT_PROTOCOL should be a class', logs)
Example #25
    def test_recurse(self):
        path = os.path.join(self.tmp_dir, 'LOL.conf')
        recurse_conf = dict(include=path)
        with open(path, 'w') as f:
            dump_mrjob_conf(recurse_conf, f)

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.conf', stderr)
            RunnerOptionStore('inline', {}, [path])
            self.assertIn('%s tries to recursively include %s!' % (path, path),
                          stderr.getvalue())
Example #26
    def _test_recoverable_error(self, ex):
        self.mock_paths = ['/path/to/logs/oak', ex]

        with no_handlers_for_logger('mrjob.logs.wrap'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.wrap', stderr)

            self.assertEqual(self._ls_logs([['/path/to/logs']]),
                             [dict(path='/path/to/logs/oak')])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #27
    def test_passthrough(self):
        runner = InlineMRJobRunner()

        with no_handlers_for_logger("mrjob.runner"):
            stderr = StringIO()
            log_to_stream("mrjob.runner", stderr)

            self.assertEqual(runner.ls, runner.fs.ls)
            # no special rules for underscore methods
            self.assertEqual(runner._cat_file, runner.fs._cat_file)

            self.assertIn("deprecated: call InlineMRJobRunner.fs.ls() directly", stderr.getvalue())
            self.assertIn("deprecated: call InlineMRJobRunner.fs._cat_file() directly", stderr.getvalue())
Example #28
    def test_with_header(self):
        lines = ["Counters: 1", "  File System Counters", "    FILE: Number of bytes read=86"]

        with no_handlers_for_logger("mrjob.logs.parse"):
            stderr = StringIO()
            log_to_stream("mrjob.logs.parse", stderr)

            self.assertEqual(
                _parse_indented_counters(lines), {"File System Counters": {"FILE: Number of bytes read": 86}}
            )

            # header shouldn't freak it out
            self.assertEqual(stderr.getvalue(), "")
Example #29
    def test_deprecated_alias(self):
        with no_handlers_for_logger('mrjob.util'):
            stderr = StringIO()
            log_to_stream('mrjob.util', stderr)

            self.assertEqual(
                list(buffer_iterator_to_line_iterator(
                    chunk for chunk in
                    [b'The quick\nbrown fox\njumped over\nthe lazy\ndogs.\n'])
                ),
                [b'The quick\n', b'brown fox\n', b'jumped over\n',
                 b'the lazy\n', b'dogs.\n'])

            self.assertIn('has been renamed', stderr.getvalue())
Example #30
    def test_io_error(self):
        self.mock_paths = [
            IOError(),
        ]

        with no_handlers_for_logger('mrjob.logs.ls'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.ls', stderr)

            self.assertEqual(list(_ls_logs(self.mock_fs, '/path/to/logs')), [])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #31
    def test_deprecated_alias(self):
        with no_handlers_for_logger('mrjob.util'):
            stderr = StringIO()
            log_to_stream('mrjob.util', stderr)

            self.assertEqual(
                list(buffer_iterator_to_line_iterator(
                    chunk for chunk in
                    [b'The quick\nbrown fox\nju',
                     b'mped over\nthe lazy\ndog',
                     b's.\n'])),
                [b'The quick\n', b'brown fox\n', b'jumped over\n',
                 b'the lazy\n', b'dogs.\n'])

            self.assertIn('has been renamed', stderr.getvalue())
Example #32
    def test_io_error(self):
        self.mock_paths = [
            IOError(),
        ]

        with no_handlers_for_logger('mrjob.logs.ls'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.ls', stderr)

            self.assertEqual(list(_ls_logs(self.mock_fs, '/path/to/logs')), [])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #33
    def test_dry_run(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_mins_idle=0.6, dry_run=True)

        # dry_run doesn't actually try to lock
        expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [
            'Terminated cluster j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
            ' was idle for 2:00:00']

        self.assertEqual(set(stdout.getvalue().splitlines()),
                         set(expected_stdout_lines))

        # shouldn't *actually* terminate clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [])
Example #34
    def test_prefer_own_methods(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin="hadoop", hadoop_home="kansas", hadoop_streaming_jar="streaming.jar")

        with no_handlers_for_logger("mrjob.runner"):
            stderr = StringIO()
            log_to_stream("mrjob.runner", stderr)

            self.assertEqual(runner.ls, runner.fs.ls)

            # Hadoop Runner has its own version
            self.assertNotEqual(runner.get_hadoop_version, runner.fs.get_hadoop_version)

            self.assertIn("deprecated: call HadoopJobRunner.fs.ls() directly", stderr.getvalue())
            self.assertNotIn("get_hadoop_version", stderr.getvalue())
Example #35
    def test_indentation_is_required(self):
        lines = [
            'File System Counters',
            '   FILE: Number of bytes read=8',
        ]

        with no_handlers_for_logger('mrjob.logs.step'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.step', stderr)

            # counter line is interpreted as group
            self.assertEqual(_parse_indented_counters(lines), {})

            # should complain
            self.assertNotEqual(stderr.getvalue(), '')
Example #36
    def test_dry_run(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_hours_idle=0.01, dry_run=True)

        # dry_run doesn't actually try to lock
        expected_stdout_lines = self.EXPECTED_STDOUT_LINES + [
            'Terminated job flow j-IDLE_AND_LOCKED (IDLE_AND_LOCKED);'
            ' was idle for 2:00:00, 1:00:00 to end of hour']

        self.assertEqual(set(stdout.getvalue().splitlines()),
                         set(expected_stdout_lines))

        # shouldn't *actually* terminate clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [])
Example #37
    def test_warn_on_io_error(self):
        self.mock_paths = [
            '/path/to/logs/oak',
            IOError(),
        ]

        with no_handlers_for_logger('mrjob.logs.wrap'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.wrap', stderr)

            self.assertEqual(self._ls_logs([['/path/to/logs']]),
                             [dict(path='/path/to/logs/oak')])

            self.mock_fs.ls.assert_called_once_with('/path/to/logs')

            self.assertIn("couldn't ls() /path/to/logs", stderr.getvalue())
Example #38
class ReportLongJobsTestCase(MockBotoTestCase):

    def setUp(self):
        super(ReportLongJobsTestCase, self).setUp()
        # redirect print statements to self.stdout
        self._real_stdout = sys.stdout
        self.stdout = StringIO()
        sys.stdout = self.stdout

    def tearDown(self):
        sys.stdout = self._real_stdout
        super(ReportLongJobsTestCase, self).tearDown()

    def test_with_no_clusters(self):
        main(['-q', '--no-conf'])  # just make sure it doesn't crash

    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_conn = self.connect_emr()
        emr_conn.run_jobflow('no name',
                             job_flow_role='fake-instance-profile',
                             service_role='fake-service-role')
        main(['-q', '--no-conf'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)
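
A side note on the setUp/tearDown capture above: on Python 3 the same effect
can be had with contextlib.redirect_stdout. A hypothetical equivalent,
assuming the same main() these tests import:

    import contextlib
    from io import StringIO

    def run_main_capturing_stdout(argv):
        out = StringIO()
        with contextlib.redirect_stdout(out):
            main(argv)  # e.g. main(['-q', '--no-conf'])
        return out.getvalue()

The setUp/tearDown form used here has the advantage that every test in the
class gets the capture without repeating a with-block.
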
Example #39
    def test_with_all_job_flows(self):
        self.mock_emr_job_flows.update(JOB_FLOWS_BY_ID)
        emr_conn = EMRJobRunner(conf_paths=[]).make_emr_conn()
        emr_conn.run_jobflow('no name', log_uri=None)
        main(['-q', '--no-conf'])
        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(JOB_FLOWS_BY_ID) - 1)
Example #40
    def test_its_not_very_quiet(self):
        stdout = StringIO()
        self.inspect_and_maybe_terminate_quietly(
            stdout=stdout, max_hours_idle=0.01)
        output = """Terminated job flow j-POOLED (Pooled Job Flow); was idle for 0:50:00, 0:05:00 to end of hour
Terminated job flow j-PENDING_BUT_IDLE (Pending But Idle Job Flow); was pending for 2:50:00, 0:05:00 to end of hour
Terminated job flow j-DEBUG_ONLY (Debug Only Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-DONE_AND_IDLE (Done And Idle Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-IDLE_AND_EXPIRED (Idle And Expired Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-IDLE_AND_FAILED (Idle And Failed Job Flow); was idle for 3:00:00, 1:00:00 to end of hour
Terminated job flow j-HADOOP_DEBUGGING (Hadoop Debugging Job Flow); was idle for 2:00:00, 1:00:00 to end of hour
Terminated job flow j-EMPTY (Empty Job Flow); was idle for 10:00:00, 1:00:00 to end of hour
"""
        self.assertEqual(
            sorted(stdout.getvalue().splitlines()),
            sorted(output.splitlines()))
Example #41
    def test_pass_through_fields(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(hadoop_bin='hadoooooooooop',
                                 hadoop_home='kansas',
                                 hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

            # deprecation warning is different for non-functions
            self.assertIn(
                'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
                stderr.getvalue())
Example #42
    def test_verbose(self):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
Example #43
    def test_log_lines(self):
        lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                         ' Connecting to ResourceManager at /0.0.0.0:8032\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n')
        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    level='INFO',
                    logger='client.RMProxy',
                    message='Connecting to ResourceManager at /0.0.0.0:8032',
                    num_lines=1,
                    start_line=0,
                    thread='',
                    timestamp='15/12/11 13:26:07',
                ),
                dict(
                    level='ERROR',
                    logger='streaming.StreamJob',
                    message=('Error Launching job :'
                             ' Output directory already exists'),
                    num_lines=1,
                    start_line=1,
                    thread='',
                    timestamp='15/12/11 13:26:08',
                ),
            ])
Example #44
class ReportLongJobsTestCase(MockBoto3TestCase):
    def setUp(self):
        super(ReportLongJobsTestCase, self).setUp()
        # redirect print statements to self.stdout
        self._real_stdout = sys.stdout
        self.stdout = StringIO()
        sys.stdout = self.stdout

    def tearDown(self):
        sys.stdout = self._real_stdout
        super(ReportLongJobsTestCase, self).tearDown()

    def test_with_no_clusters(self):
        main(['-q', '--no-conf'])  # just make sure it doesn't crash

    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_client = self.client('emr')
        emr_client.run_job_flow(
            Name='no name',
            Instances=dict(
                MasterInstanceType='m1.medium',
                InstanceCount=1,
            ),
            JobFlowRole='fake-instance-profile',
            ReleaseLabel='emr-4.0.0',
            ServiceRole='fake-service-role',
        )
        main(['-q', '--no-conf'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)
Example #45
    def test_yarn_output(self):
        # abbreviated version of real output from Hadoop 2.7.0.
        # Including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 13:32:44 INFO client.RMProxy:'
            ' Connecting to ResourceManager at /0.0.0.0:8032\n'
            '15/12/11 13:32:45 INFO mapreduce.JobSubmitter:'
            ' Submitting tokens for job: job_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO impl.YarnClientImpl:'
            ' Submitted application application_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO mapreduce.Job:'
            ' The url to track the job:'
            ' http://0a7802e19139:8088/proxy/application_1449857544442_0002/\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:  map 100% reduce 100%\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:'
            ' Job job_1449857544442_0002 completed successfully\n'
            '15/12/11 13:33:11 INFO mapreduce.Job: Counters: 49\n'
            '        File System Counters\n'
            '                FILE: Number of bytes read=86\n'
            '15/12/11 13:33:11 INFO streaming.StreamJob:'
            ' Output directory:'
            ' hdfs:///user/root/tmp/mrjob/mr_wc.root.20151211.181326.984074'
            '/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id='application_1449857544442_0002',
                 counters={
                     'File System Counters': {
                         'FILE: Number of bytes read': 86,
                     }
                 },
                 job_id='job_1449857544442_0002',
                 output_dir=('hdfs:///user/root/tmp/mrjob'
                             '/mr_wc.root.20151211.181326.984074/output')))
Example #46
    def test_wrapper_script_only_writes_to_stderr(self):
        job = MROSWalkJob(["-r", "local", "--setup", "echo stray output"])
        job.sandbox()

        with no_handlers_for_logger("mrjob.local"):
            stderr = StringIO()
            log_to_stream("mrjob.local", stderr, debug=True)

            with job.make_runner() as r:
                r.run()

                output = b"".join(r.stream_output())

                # stray output should be in stderr, not the job's output
                self.assertIn("stray output", stderr.getvalue())
                self.assertNotIn(b"stray output", output)
Example #47
    def test_pre_yarn_output(self):
        # actual output from Hadoop 1.0.3 on EMR AMI 2.4.9
        # Including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' getLocalDirs(): [/mnt/var/lib/hadoop/mapred]\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Running job: job_201512112247_0003\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Tracking URL:'
            ' http://ip-172-31-27-129.us-west-2.compute.internal:9100'
            '/jobdetails.jsp?jobid=job_201512112247_0003\n'
            '15/12/11 23:09:16 INFO streaming.StreamJob:'
            '  map 100%  reduce 100%\n'
            '15/12/11 23:09:22 INFO streaming.StreamJob:'
            ' Output: hdfs:///user/hadoop/tmp/mrjob'
            '/mr_wc.hadoop.20151211.230352.433691/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id=None,
                 counters=None,
                 job_id='job_201512112247_0003',
                 output_dir=('hdfs:///user/hadoop/tmp/mrjob'
                             '/mr_wc.hadoop.20151211.230352.433691/output')))
Example #48
    def test_dry_run(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_mins_idle=0.6, dry_run=True)

        # shouldn't *actually* terminate clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [])
Example #49
class ReportLongJobsTestCase(MockBoto3TestCase):

    def setUp(self):
        super(ReportLongJobsTestCase, self).setUp()
        # redirect print statements to self.stdout
        self._real_stdout = sys.stdout
        self.stdout = StringIO()
        sys.stdout = self.stdout

    def tearDown(self):
        sys.stdout = self._real_stdout
        super(ReportLongJobsTestCase, self).tearDown()

    def test_with_no_clusters(self):
        main(['-q', '--no-conf'])  # just make sure it doesn't crash

    def test_with_all_clusters(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        emr_client = self.client('emr')
        emr_client.run_job_flow(
            Name='no name',
            Instances=dict(
                MasterInstanceType='m1.medium',
                InstanceCount=1,
            ),
            JobFlowRole='fake-instance-profile',
            ReleaseLabel='emr-4.0.0',
            ServiceRole='fake-service-role',
        )
        main(['-q', '--no-conf'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 1)
        self.assertNotIn('j-COMPLETED', self.stdout.getvalue())

    def test_exclude(self):
        for cluster in CLUSTERS:
            self.add_mock_emr_cluster(cluster)

        main(['-q', '--no-conf', '-x', 'my_key,my_value'])

        lines = [line for line in StringIO(self.stdout.getvalue())]
        self.assertEqual(len(lines), len(CLUSTERS_BY_ID) - 2)
        self.assertNotIn('j-COMPLETED', self.stdout.getvalue())
        self.assertNotIn('j-RUNNING1STEP', self.stdout.getvalue())
Example #50
    def test_passthrough(self):
        runner = InlineMRJobRunner()

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner.ls, runner.fs.ls)
            # no special rules for underscore methods
            self.assertEqual(runner._cat_file, runner.fs._cat_file)

            self.assertIn(
                'deprecated: call InlineMRJobRunner.fs.ls() directly',
                stderr.getvalue())
            self.assertIn(
                'deprecated: call InlineMRJobRunner.fs._cat_file() directly',
                stderr.getvalue())
Example #51
    def test_pass_through_fields(self):
        # TODO: currently can't initialize HadoopRunner without setting these
        runner = HadoopJobRunner(
            hadoop_bin='hadoooooooooop',
            hadoop_home='kansas',
            hadoop_streaming_jar='streaming.jar')

        with no_handlers_for_logger('mrjob.runner'):
            stderr = StringIO()
            log_to_stream('mrjob.runner', stderr)

            self.assertEqual(runner._hadoop_bin, runner.fs._hadoop_bin)

            # deprecation warning is different for non-functions
            self.assertIn(
                'deprecated: access HadoopJobRunner.fs._hadoop_bin directly',
                stderr.getvalue())
Example #52
    def test_default_options(self):
        with no_handlers_for_logger('__main__'):
            with patch.object(sys, 'stderr', StringIO()) as stderr:
                MRJob.set_up_logging()
                log = logging.getLogger('__main__')
                log.info('INFO')
                log.debug('DEBUG')
                self.assertEqual(stderr.getvalue(), 'INFO\n')
Example #53
    def test_its_not_very_quiet(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(
            stdout=stdout, max_hours_idle=0.01)

        self.assertEqual(set(stdout.getvalue().splitlines()),
                         set(self.EXPECTED_STDOUT_LINES))

        # should have actually terminated clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [
            'j-DEBUG_ONLY',
            'j-DONE_AND_IDLE',
            'j-HADOOP_DEBUGGING',
            'j-IDLE_AND_EXPIRED',
            'j-IDLE_AND_FAILED',
            'j-PENDING_BUT_IDLE',
            'j-POOLED',
        ])
Example #54
    def test_its_not_very_quiet(self):
        stdout = StringIO()
        self.maybe_terminate_quietly(stdout=stdout, max_hours_idle=0.01)

        self.assertEqual(set(stdout.getvalue().splitlines()),
                         set(self.EXPECTED_STDOUT_LINES))

        # should have actually terminated clusters
        self.assertEqual(self.ids_of_terminated_clusters(), [
            'j-DEBUG_ONLY',
            'j-DONE_AND_IDLE',
            'j-DONE_AND_IDLE_4_X',
            'j-HADOOP_DEBUGGING',
            'j-IDLE_AND_EXPIRED',
            'j-IDLE_AND_FAILED',
            'j-PENDING_BUT_IDLE',
            'j-POOLED',
        ])
Example #55
    def test_with_header(self):
        lines = [
            'Counters: 1',
            '  File System Counters',
            '    FILE: Number of bytes read=86',
        ]

        with no_handlers_for_logger('mrjob.logs.step'):
            stderr = StringIO()
            log_to_stream('mrjob.logs.step', stderr)

            self.assertEqual(_parse_indented_counters(lines), {
                'File System Counters': {
                    'FILE: Number of bytes read': 86,
                },
            })

            # header shouldn't freak it out
            self.assertEqual(stderr.getvalue(), '')
Example #56
    def test_wrapper_script_only_writes_to_stderr(self):
        job = MROSWalkJob([
            '-r', 'local',
            '--setup', 'echo stray output',
        ])
        job.sandbox()

        with no_handlers_for_logger('mrjob.local'):
            stderr = StringIO()
            log_to_stream('mrjob.local', stderr, debug=True)

            with job.make_runner() as r:
                r.run()

                output = b''.join(r.stream_output())

                # stray output should be in stderr, not the job's output
                self.assertIn('stray output', stderr.getvalue())
                self.assertNotIn(b'stray output', output)
Example #57
    def test_emr_runner_option_store(self):
        stderr = StringIO()
        with no_handlers_for_logger('mrjob.conf'):
            log_to_stream('mrjob.conf', stderr)

            opts = EMRRunnerOptionStore(
                'emr',
                dict(base_tmp_dir='/scratch',
                     s3_scratch_uri='s3://bucket/walrus'),
                [])

            self.assertEqual(opts['local_tmp_dir'], '/scratch')
            self.assertNotIn('base_tmp_dir', opts)
            self.assertIn('Deprecated option base_tmp_dir has been renamed'
                          ' to local_tmp_dir', stderr.getvalue())

            self.assertEqual(opts['s3_tmp_dir'], 's3://bucket/walrus')
            self.assertNotIn('s3_scratch_uri', opts)
            self.assertIn('Deprecated option s3_scratch_uri has been renamed'
                          ' to s3_tmp_dir', stderr.getvalue())