Example #1
    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as as list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: the subprocess's return code
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        returncode = 0  # should always be set, but just in case

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(spark_submit_args,
                              stdout=PIPE,
                              stderr=PIPE,
                              env=env)

            # driver output goes to stderr; parse it into log4j records
            for record in _parse_hadoop_log4j_records(
                    _yield_lines_from_pty_or_pipe(step_proc.stderr)):
                record_callback(record)

            # there shouldn't be much output on STDOUT
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    for record in _parse_hadoop_log4j_records(
                            _yield_lines_from_pty_or_pipe(master)):
                        record_callback(record)
                    _, returncode = os.waitpid(pid, 0)

        return returncode
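A minimal record_callback just needs to accept one parsed record dict per call. The sketch below is hypothetical (the function name and the level mapping are not part of mrjob); it only relies on the 'level', 'logger' and 'message' keys that the test examples further down assert on:

import logging

log = logging.getLogger(__name__)


def _echo_log4j_record(record):
    # hypothetical callback: forward each parsed log4j record to Python's
    # logging module; records built from non-log lines have an empty 'level',
    # so fall back to INFO
    py_level = getattr(logging, record.get('level') or 'INFO', logging.INFO)
    log.log(py_level, '%s: %s',
            record.get('logger', ''), record.get('message', ''))

# usage (assuming spark_submit_args and env have already been built):
# returncode = self._run_spark_submit(spark_submit_args, env, _echo_log4j_record)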
Example #2
    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives']:
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        step_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        def _log_line(line):
            log.info('  %s' % to_unicode(line).strip('\r\n'))

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            # driver output goes to stderr; parse it into log4j records
            for record in _parse_hadoop_log4j_records(
                    _yield_lines_from_pty_or_pipe(step_proc.stderr)):
                _log_log4j_record(record)

            # there shouldn't be much output on STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(line)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    for record in _parse_hadoop_log4j_records(
                            _yield_lines_from_pty_or_pipe(master)):
                        _log_log4j_record(record)
                    _, returncode = os.waitpid(pid, 0)

        if returncode:
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=self._num_steps())
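For reference, the reason attached to StepFailedException here is just the stringified CalledProcessError, e.g. (with a made-up argument list):

from subprocess import CalledProcessError

reason = str(CalledProcessError(1, ['spark-submit', '--master', 'yarn', 'mr_job.py']))
# -> "Command '['spark-submit', '--master', 'yarn', 'mr_job.py']' returned
#     non-zero exit status 1." (exact wording varies by Python version)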
Example #3
File: bin.py Project: Affirm/mrjob
    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as as list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: the subprocess's return code
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        returncode = 0  # should always be set, but just in case

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            # driver output goes to stderr; parse it into log4j records
            for record in _parse_hadoop_log4j_records(
                    _yield_lines_from_pty_or_pipe(step_proc.stderr)):
                record_callback(record)

            # there shouldn't be much output on STDOUT
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    for record in _parse_hadoop_log4j_records(
                            _yield_lines_from_pty_or_pipe(master)):
                        record_callback(record)
                    _, returncode = os.waitpid(pid, 0)

        return returncode
Example #4
    def test_log_lines(self):
        lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                         ' Connecting to ResourceManager at /0.0.0.0:8032\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n')
        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    level='INFO',
                    logger='client.RMProxy',
                    message='Connecting to ResourceManager at /0.0.0.0:8032',
                    num_lines=1,
                    start_line=0,
                    thread='',
                    timestamp='15/12/11 13:26:07',
                ),
                dict(
                    level='ERROR',
                    logger='streaming.StreamJob',
                    message=('Error Launching job :'
                             ' Output directory already exists'),
                    num_lines=1,
                    start_line=1,
                    thread='',
                    timestamp='15/12/11 13:26:08',
                ),
            ])
Example #5
    def test_trailing_carriage_return(self):
        lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                         ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')
        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    level='INFO',
                    logger='client.RMProxy',
                    message='Connecting to ResourceManager at /0.0.0.0:8032',
                    num_lines=1,
                    start_line=0,
                    thread='',
                    timestamp='15/12/11 13:26:07',
                )
            ])
Example #6
    def test_thread(self):
        lines = StringIO(
            '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
            ' (main): Thread 1 created MetricsLockFreeSaver 1\n')

        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    timestamp='2015-08-22 00:46:18,411',
                    level='INFO',
                    logger='amazon.emr.metrics.MetricsSaver',
                    num_lines=1,
                    start_line=0,
                    thread='main',
                    message='Thread 1 created MetricsLockFreeSaver 1',
                )
            ])
Example #7
    def test_non_log_lines(self):
        lines = StringIO('foo\n'
                         'bar\n'
                         '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                         ' Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!')

        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)),
            [
                dict(
                    caller_location='',
                    level='',
                    logger='',
                    message='foo',
                    num_lines=1,
                    start_line=0,
                    thread='',
                    timestamp='',
                ),
                dict(
                    caller_location='',
                    level='',
                    logger='',
                    message='bar',
                    num_lines=1,
                    start_line=1,
                    thread='',
                    timestamp='',
                ),
                dict(
                    caller_location='',
                    level='ERROR',
                    logger='streaming.StreamJob',
                    # no way to know that Streaming Command Failed! wasn't part
                    # of a multi-line message
                    message=('Error Launching job :'
                             ' Output directory already exists\n'
                             'Streaming Command Failed!'),
                    num_lines=2,
                    start_line=2,
                    thread='',
                    timestamp='15/12/11 13:26:08',
                ),
            ])
Example #9
    def test_caller_location(self):
        # tests fix for #1406
        lines = StringIO(
            '  2016-08-19 13:30:03,816 INFO  [main] impl.YarnClientImpl'
            ' (YarnClientImpl.java:submitApplication(251)) - Submitted'
            ' application application_1468316211405_1354')

        self.assertEqual(list(_parse_hadoop_log4j_records(lines)), [
            dict(
                caller_location=('YarnClientImpl.java:submitApplication(251)'),
                level='INFO',
                logger='impl.YarnClientImpl',
                message=('Submitted application'
                         ' application_1468316211405_1354'),
                num_lines=1,
                start_line=0,
                thread='main',
                timestamp='2016-08-19 13:30:03,816',
            )
        ])
Example #10
    def test_multiline_message(self):
        lines = StringIO(
            '2015-08-22 00:47:35,323 INFO org.apache.hadoop.mapreduce.Job'
            ' (main): Counters: 54\r\n'
            '        File System Counters\r\n'
            '                FILE: Number of bytes read=83\r\n')

        self.assertEqual(
            list(_parse_hadoop_log4j_records(lines)), [
                dict(
                    level='INFO',
                    logger='org.apache.hadoop.mapreduce.Job',
                    # strip \r's, no trailing \n
                    message=('Counters: 54\n'
                             '        File System Counters\n'
                             '                FILE: Number of bytes read=83'),
                    num_lines=3,
                    start_line=0,
                    thread='main',
                    timestamp='2015-08-22 00:47:35,323',
                )
            ])
Example #12
    def _run_spark_submit(self, spark_submit_args, env, record_callback):
        """Run the spark submit binary in a subprocess, using a PTY if possible

        :param spark_submit_args: spark-submit binary and arguments, as as list
        :param env: environment variables, as a dict
        :param record_callback: a function that takes a single log4j record
                                as its argument (see
                                :py:func:`~mrjob.logs.log4j\
                                ._parse_hadoop_log4j_records`)

        :return: tuple of the subprocess's return code and a
                 step interpretation dictionary
        """
        log.debug('> %s' % cmd_line(spark_submit_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        # these should always be set, but just in case
        returncode = 0
        step_interpretation = {}

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # spark-submit is running
            log.debug('No PTY available, using Popen() to invoke spark-submit')

            step_proc = Popen(
                spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

            # parse driver output
            step_interpretation = _parse_spark_log(
                step_proc.stderr, record_callback=record_callback)

            # there shouldn't be much output on STDOUT, just echo it
            for record in _parse_hadoop_log4j_records(step_proc.stdout):
                record_callback(record)

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                try:
                    os.execvpe(spark_submit_args[0], spark_submit_args, env)
                    # now this process is no longer Python
                except OSError as ex:
                    # use _exit() so we don't do cleanup, etc. that's
                    # the parent process's job
                    os._exit(ex.errno)
                finally:
                    # if we get some other exception, still exit hard
                    os._exit(-1)
            else:
                log.debug('Invoking spark-submit via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    step_interpretation = (
                        _parse_spark_log(
                            _eio_to_eof(master),
                            record_callback=record_callback))

                    _, returncode = os.waitpid(pid, 0)

        return (returncode, step_interpretation)
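Example #12 reads the PTY master through _eio_to_eof. On Linux, reading from a PTY master after the child has exited raises OSError with errno set to EIO instead of returning a clean EOF, so a wrapper shaped like the following is enough; this is a hypothetical sketch of that idea, not mrjob's implementation:

import errno


def _eio_to_eof_sketch(fileobj):
    """Yield lines from *fileobj*, treating EIO (dead PTY child) as EOF."""
    try:
        for line in fileobj:
            yield line
    except (IOError, OSError) as ex:
        # a dead child on the far end of the PTY surfaces as EIO; anything
        # else is a real error and should propagate
        if ex.errno != errno.EIO:
            raise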
Example #13
    def test_empty(self):
        self.assertEqual(list(_parse_hadoop_log4j_records([])), [])
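Taken together, the test examples above pin down the record format that _parse_hadoop_log4j_records yields: a leader line (timestamp, level, logger, optional thread and caller location) starts a record, later non-matching lines fold into that record's message, and non-log lines seen before any leader line come out as bare records with empty fields. The sketch below is a simplified, hypothetical re-implementation of that behavior, not mrjob's actual code or regexes (for one thing, it always emits a caller_location key):

import re

# leader-line shapes exercised by the tests above (simplified):
#   <timestamp> <LEVEL> [<thread>] <logger> (<caller>) - <message>
#   <timestamp> <LEVEL> <logger> (<thread>): <message>
_LEADING_TIMESTAMP = r'^\s*(?P<timestamp>\d{2}[\d/-]+ \d{2}:\d{2}:\d{2}(,\d{3})?)'

_LINE_RE = re.compile(
    _LEADING_TIMESTAMP +
    r'\s+(?P<level>[A-Z]+)'
    r'\s+\[(?P<thread>.*?)\]'
    r'\s+(?P<logger>\S+)'
    r'(\s+\((?P<caller_location>\S+)\))?'
    r'( -|:) ?(?P<message>.*)$')

_ALTERNATE_LINE_RE = re.compile(
    _LEADING_TIMESTAMP +
    r'\s+(?P<level>[A-Z]+)'
    r'\s+(?P<logger>\S+)'
    r'(\s+\((?P<thread>.*?)\))?'
    r'( -|:) ?(?P<message>.*)$')


def _parse_log4j_records_sketch(lines):
    """Yield record dicts like the ones asserted in the tests above."""
    record = None

    for line_num, line in enumerate(lines):
        line = line.rstrip('\r\n')
        m = _LINE_RE.match(line) or _ALTERNATE_LINE_RE.match(line)

        if m:
            if record:
                yield record
            fields = m.groupdict()
            record = dict(
                caller_location=fields.get('caller_location') or '',
                level=fields['level'],
                logger=fields['logger'],
                message=fields['message'],
                num_lines=1,
                start_line=line_num,
                thread=fields.get('thread') or '',
                timestamp=fields['timestamp'],
            )
        elif record:
            # continuation of a multi-line message
            record['message'] += '\n' + line
            record['num_lines'] += 1
        else:
            # non-log line before any leader line: emit it as a bare record
            yield dict(caller_location='', level='', logger='', message=line,
                       num_lines=1, start_line=line_num, thread='',
                       timestamp='')

    if record:
        yield record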