Example #1
_parse_hadoop_streaming_log() on pre-YARN (Hadoop 1.x) streaming output: the parser should recover the job ID and the output directory.
    def test_pre_yarn_output(self):
        # actual output from Hadoop 1.0.3 on EMR AMI 2.4.9,
        # including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' getLocalDirs(): [/mnt/var/lib/hadoop/mapred]\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Running job: job_201512112247_0003\n'
            '15/12/11 23:08:37 INFO streaming.StreamJob:'
            ' Tracking URL:'
            ' http://ip-172-31-27-129.us-west-2.compute.internal:9100'
            '/jobdetails.jsp?jobid=job_201512112247_0003\n'
            '15/12/11 23:09:16 INFO streaming.StreamJob:'
            '  map 100%  reduce 100%\n'
            '15/12/11 23:09:22 INFO streaming.StreamJob:'
            ' Output: hdfs:///user/hadoop/tmp/mrjob'
            '/mr_wc.hadoop.20151211.230352.433691/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id=None,
                 counters=None,
                 job_id='job_201512112247_0003',
                 output_dir=('hdfs:///user/hadoop/tmp/mrjob'
                             '/mr_wc.hadoop.20151211.230352.433691/output')))
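
The pre-YARN format above carries the job ID on a 'Running job:' line and the output path on an 'Output:' line. What follows is a minimal, hypothetical sketch of the extraction this test exercises; the regexes and the function name are illustrative assumptions, not mrjob's actual implementation (which also handles the YARN variants in the next example):

import re

# Illustrative patterns only; mrjob's real regexes may differ.
_JOB_ID_RE = re.compile(r'Running job: (job_\d+_\d+)')
_OUTPUT_DIR_RE = re.compile(r'Output(?: directory)?: (\S+)')

def parse_streaming_log_sketch(lines):
    result = dict(application_id=None, counters=None,
                  job_id=None, output_dir=None)
    for line in lines:
        m = _JOB_ID_RE.search(line)
        if m:
            result['job_id'] = m.group(1)
        m = _OUTPUT_DIR_RE.search(line)
        if m:
            result['output_dir'] = m.group(1)
    return result

# Fed the StringIO from the test above, this returns the same dict
# the assertion expects.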
Example #2
_parse_hadoop_streaming_log() on YARN-era (Hadoop 2.7.0) output, which additionally yields an application ID and a nested counters dict.
    def test_yarn_output(self):
        # abbreviated version of real output from Hadoop 2.7.0,
        # including things that might be interesting to parse later on
        lines = StringIO(
            '15/12/11 13:32:44 INFO client.RMProxy:'
            ' Connecting to ResourceManager at /0.0.0.0:8032\n'
            '15/12/11 13:32:45 INFO mapreduce.JobSubmitter:'
            ' Submitting tokens for job: job_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO impl.YarnClientImpl:'
            ' Submitted application application_1449857544442_0002\n'
            '15/12/11 13:32:45 INFO mapreduce.Job:'
            ' The url to track the job:'
            ' http://0a7802e19139:8088/proxy/application_1449857544442_0002/\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:  map 100% reduce 100%\n'
            '15/12/11 13:33:11 INFO mapreduce.Job:'
            ' Job job_1449857544442_0002 completed successfully\n'
            '15/12/11 13:33:11 INFO mapreduce.Job: Counters: 49\n'
            '        File System Counters\n'
            '                FILE: Number of bytes read=86\n'
            '15/12/11 13:33:11 INFO streaming.StreamJob:'
            ' Output directory:'
            ' hdfs:///user/root/tmp/mrjob/mr_wc.root.20151211.181326.984074'
            '/output\n')

        self.assertEqual(
            _parse_hadoop_streaming_log(lines),
            dict(application_id='application_1449857544442_0002',
                 counters={
                     'File System Counters': {
                         'FILE: Number of bytes read': 86,
                     }
                 },
                 job_id='job_1449857544442_0002',
                 output_dir=('hdfs:///user/root/tmp/mrjob'
                             '/mr_wc.root.20151211.181326.984074/output')))
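
Beyond the job ID, YARN-era output adds an application ID (the 'Submitted application' line) and an indented counters block: group names indented eight spaces, name=value counters indented sixteen. Below is a hypothetical sketch of folding that block into the nested dict the test expects; the regexes and parse_counters_sketch are illustrative assumptions, not mrjob's actual code:

import re

# Illustrative only: group lines use 8-space indents, counter lines
# 16-space indents, matching the sample output above.
_COUNTER_GROUP_RE = re.compile(r'^ {8}(\S.*?)\s*$')
_COUNTER_VALUE_RE = re.compile(r'^ {16}(.*?)=(\d+)\s*$')

def parse_counters_sketch(lines):
    counters, group = {}, None
    for line in lines:
        m = _COUNTER_VALUE_RE.match(line)
        if m and group is not None:
            counters.setdefault(group, {})[m.group(1)] = int(m.group(2))
            continue
        m = _COUNTER_GROUP_RE.match(line)
        if m:
            group = m.group(1)
    return counters

parse_counters_sketch([
    '        File System Counters',
    '                FILE: Number of bytes read=86',
])
# -> {'File System Counters': {'FILE: Number of bytes read': 86}}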
Example #3
The empty-input case: every field of the result defaults to None.
    def test_empty(self):
        self.assertEqual(
            _parse_hadoop_streaming_log([]),
            dict(application_id=None,
                 counters=None,
                 job_id=None,
                 output_dir=None))
Example #4
A wrapper that feeds a streaming job's stderr through the parser, logging each line along the way.
def _process_stderr_from_streaming(lines):
    """Wrapper for mrjob.logs._parse_hadoop_streaming_log().

    This converts lines from bytes to str in Python 3 and logs every
    line (abbreviating the counters message, since we log counters next).

    This also screens out and logs 'Streaming Command Failed!', which
    isn't in log format.
    """
    def stderr_to_log(lines):
        for line in lines:
            line = to_string(line)
            if _HADOOP_NON_LOG_LINE_RE.match(line):
                # use error because this is usually "Streaming Command Failed!"
                _log_line_from_hadoop(line, level=logging.ERROR)
            else:
                yield line

    def callback(record):
        message = record['message']

        level = getattr(logging, record['level'], None)

        if _INDENTED_COUNTERS_START_RE.match(message):
            # don't show the counters themselves
            _log_line_from_hadoop(message.split('\n')[0], level=level)
            log.info('(parsing counters)')
        else:
            _log_line_from_hadoop(message, level=level)

    return _parse_hadoop_streaming_log(stderr_to_log(lines),
                                       record_callback=callback)
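
A hypothetical driver for this wrapper: launch a Hadoop streaming step and pipe its stderr, a bytes-line iterator, straight into _process_stderr_from_streaming(). The args command line is a placeholder; only the wrapper itself comes from the code above.

from subprocess import Popen, PIPE

# Placeholder command; a real call needs the full streaming job args.
args = ['hadoop', 'jar', 'hadoop-streaming.jar']

step_proc = Popen(args, stdout=PIPE, stderr=PIPE)

# step_proc.stderr yields bytes lines; the wrapper converts them to
# str, filters out non-log lines, and parses the rest.
result = _process_stderr_from_streaming(step_proc.stderr)

step_proc.stdout.close()
step_proc.wait()

print(result['job_id'], result['output_dir'])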