def _run_spark_submit(self, spark_submit_args, env, record_callback):
    """Run the spark-submit binary in a subprocess, using a PTY
    if possible.

    :param spark_submit_args: spark-submit binary and arguments, as a list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument (see
                            :py:func:`~mrjob.logs.log4j\
                            ._parse_hadoop_log4j_records`)

    :return: the subprocess's return code
    """
    log.debug('> %s' % cmd_line(spark_submit_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    returncode = 0  # should always be set, but just in case

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(
            spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

        # driver logging goes to stderr; parse it as log4j records
        for record in _parse_hadoop_log4j_records(
                _yield_lines_from_pty_or_pipe(step_proc.stderr)):
            record_callback(record)

        # there shouldn't be much output on STDOUT
        for record in _parse_hadoop_log4j_records(step_proc.stdout):
            record_callback(record)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            os.execvpe(spark_submit_args[0], spark_submit_args, env)
            # now this process is no longer Python
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                # a PTY merges the child's stdout and stderr into one stream
                for record in _parse_hadoop_log4j_records(
                        _yield_lines_from_pty_or_pipe(master)):
                    record_callback(record)
                _, returncode = os.waitpid(pid, 0)

    return returncode
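Both branches above pull lines through `_yield_lines_from_pty_or_pipe()`. Below is a minimal sketch of what such a helper has to handle; the real mrjob implementation may differ in detail, and the `_eio_to_eof()` wrapper used in the later revision further down plays the same role. The key fact is PTY semantics: on Linux, reading from the PTY master after the child closes its end raises an error with errno EIO, which simply means end-of-file.

# Minimal sketch of a line-yielding helper like
# _yield_lines_from_pty_or_pipe(); an assumption for illustration,
# not mrjob's exact code.
import errno


def _yield_lines_from_pty_or_pipe(fileobj):
    try:
        for line in fileobj:
            yield line
    except (IOError, OSError) as ex:
        # EIO just means the child hung up the PTY; anything else
        # is a real error
        if ex.errno != errno.EIO:
            raise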
def _run_step_on_spark(self, step, step_num):
    if self._opts['upload_archives']:
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    step_args = self._args_for_spark_step(step_num)

    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    log.debug('> %s' % cmd_line(step_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    def _log_line(line):
        log.info('  %s' % to_unicode(line).strip('\r\n'))

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

        # driver logging goes to stderr; parse it as log4j records
        for record in _parse_hadoop_log4j_records(
                _yield_lines_from_pty_or_pipe(step_proc.stderr)):
            _log_log4j_record(record)

        # there shouldn't be much output on STDOUT
        for line in step_proc.stdout:
            _log_line(line)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            os.execvpe(step_args[0], step_args, env)
            # now this process is no longer Python
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                for record in _parse_hadoop_log4j_records(
                        _yield_lines_from_pty_or_pipe(master)):
                    _log_log4j_record(record)
                _, returncode = os.waitpid(pid, 0)

    if returncode:
        reason = str(CalledProcessError(returncode, step_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())
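`_log_log4j_record()` is not shown in this section. Here is a hedged sketch of what such a callback might look like; the level mapping and formatting are assumptions for illustration, not mrjob's actual code.

# Hypothetical sketch of a callback like _log_log4j_record(): map the
# parsed record's log4j level onto the logging module and re-emit the
# message. Assumes a module-level logger like the `log` used above.
import logging

log = logging.getLogger(__name__)


def _log_log4j_record(record):
    # non-log4j lines are parsed with level='', which falls back to INFO
    level = getattr(logging, record['level'], logging.INFO)
    log.log(level, '%s', record['message'])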
def test_log_lines(self):
    lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                     ' Connecting to ResourceManager at /0.0.0.0:8032\n'
                     '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                     ' Error Launching job :'
                     ' Output directory already exists\n')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                level='INFO',
                logger='client.RMProxy',
                message='Connecting to ResourceManager at /0.0.0.0:8032',
                num_lines=1,
                start_line=0,
                thread='',
                timestamp='15/12/11 13:26:07',
            ),
            dict(
                level='ERROR',
                logger='streaming.StreamJob',
                message=('Error Launching job :'
                         ' Output directory already exists'),
                num_lines=1,
                start_line=1,
                thread='',
                timestamp='15/12/11 13:26:08',
            ),
        ])
def test_trailing_carriage_return(self):
    lines = StringIO('15/12/11 13:26:07 INFO client.RMProxy:'
                     ' Connecting to ResourceManager at /0.0.0.0:8032\r\n')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                level='INFO',
                logger='client.RMProxy',
                message='Connecting to ResourceManager at /0.0.0.0:8032',
                num_lines=1,
                start_line=0,
                thread='',
                timestamp='15/12/11 13:26:07',
            )
        ])
def test_thread(self):
    lines = StringIO(
        '2015-08-22 00:46:18,411 INFO amazon.emr.metrics.MetricsSaver'
        ' (main): Thread 1 created MetricsLockFreeSaver 1\n')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                timestamp='2015-08-22 00:46:18,411',
                level='INFO',
                logger='amazon.emr.metrics.MetricsSaver',
                num_lines=1,
                start_line=0,
                thread='main',
                message='Thread 1 created MetricsLockFreeSaver 1',
            )
        ])
def test_non_log_lines(self):
    lines = StringIO('foo\n'
                     'bar\n'
                     '15/12/11 13:26:08 ERROR streaming.StreamJob:'
                     ' Error Launching job :'
                     ' Output directory already exists\n'
                     'Streaming Command Failed!')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                caller_location='',
                level='',
                logger='',
                message='foo',
                num_lines=1,
                start_line=0,
                thread='',
                timestamp='',
            ),
            dict(
                caller_location='',
                level='',
                logger='',
                message='bar',
                num_lines=1,
                start_line=1,
                thread='',
                timestamp='',
            ),
            dict(
                caller_location='',
                level='ERROR',
                logger='streaming.StreamJob',
                # no way to know that Streaming Command Failed! wasn't
                # part of a multi-line message
                message=('Error Launching job :'
                         ' Output directory already exists\n'
                         'Streaming Command Failed!'),
                num_lines=2,
                start_line=2,
                thread='',
                timestamp='15/12/11 13:26:08',
            ),
        ])
def test_caller_location(self):
    # tests fix for #1406
    lines = StringIO(
        ' 2016-08-19 13:30:03,816 INFO [main] impl.YarnClientImpl'
        ' (YarnClientImpl.java:submitApplication(251)) - Submitted'
        ' application application_1468316211405_1354')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                caller_location=(
                    'YarnClientImpl.java:submitApplication(251)'),
                level='INFO',
                logger='impl.YarnClientImpl',
                message=('Submitted application'
                         ' application_1468316211405_1354'),
                num_lines=1,
                start_line=0,
                thread='main',
                timestamp='2016-08-19 13:30:03,816',
            )
        ])
def test_multiline_message(self):
    lines = StringIO(
        '2015-08-22 00:47:35,323 INFO org.apache.hadoop.mapreduce.Job'
        ' (main): Counters: 54\r\n'
        ' File System Counters\r\n'
        ' FILE: Number of bytes read=83\r\n')
    self.assertEqual(
        list(_parse_hadoop_log4j_records(lines)), [
            dict(
                level='INFO',
                logger='org.apache.hadoop.mapreduce.Job',
                # strip \r's, no trailing \n
                message=('Counters: 54\n'
                         ' File System Counters\n'
                         ' FILE: Number of bytes read=83'),
                num_lines=3,
                start_line=0,
                thread='main',
                timestamp='2015-08-22 00:47:35,323',
            )
        ])
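Taken together, the tests above pin down two line formats the parser must recognize. The following is a hedged sketch of the patterns they imply; the actual regexes live in mrjob.logs.log4j and may differ in detail. In particular, the real parser also folds unrecognized lines into the previous record's multi-line message and tracks start_line and num_lines, which a bare regex cannot do.

import re

# Sketch of format 1 (an assumption based on the tests, not mrjob's
# exact pattern), covering lines like:
#   15/12/11 13:26:07 INFO client.RMProxy: <message>
#   2015-08-22 00:46:18,411 INFO some.Logger (main): <message>
_LOG4J_LINE_RE = re.compile(
    r'^\s*(?P<timestamp>\d{2,4}[-/]\d{2}[-/]\d{2,4}'
    r'\s+\d{2}:\d{2}:\d{2}(,\d+)?)'
    r'\s+(?P<level>[A-Z]+)'
    r'\s+(?P<logger>\S+)'
    r'(\s+\((?P<thread>.*?)\))?'
    r'( - |: ?)'
    r'(?P<message>.*)$')

# Sketch of format 2, the YARN variant from #1406, with [thread]
# before the logger and a (file:method(line)) caller location after:
#   2016-08-19 13:30:03,816 INFO [main] impl.YarnClientImpl
#       (YarnClientImpl.java:submitApplication(251)) - <message>
_LOG4J_ALTERNATE_LINE_RE = re.compile(
    r'^\s*(?P<timestamp>\d{2,4}[-/]\d{2}[-/]\d{2,4}'
    r'\s+\d{2}:\d{2}:\d{2}(,\d+)?)'
    r'\s+(?P<level>[A-Z]+)'
    r'\s+\[(?P<thread>.*?)\]'
    r'\s+(?P<logger>\S+)'
    r'\s+\((?P<caller_location>.*?)\)'
    r'( - |: ?)'
    r'(?P<message>.*)$')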
def _run_spark_submit(self, spark_submit_args, env, record_callback):
    """Run the spark-submit binary in a subprocess, using a PTY
    if possible.

    :param spark_submit_args: spark-submit binary and arguments, as a list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument (see
                            :py:func:`~mrjob.logs.log4j\
                            ._parse_hadoop_log4j_records`)

    :return: tuple of the subprocess's return code and a
             step interpretation dictionary
    """
    log.debug('> %s' % cmd_line(spark_submit_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    # these should always be set, but just in case
    returncode = 0
    step_interpretation = {}

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(
            spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

        # parse driver output
        step_interpretation = _parse_spark_log(
            step_proc.stderr, record_callback=record_callback)

        # there shouldn't be much output on STDOUT, just echo it
        for record in _parse_hadoop_log4j_records(step_proc.stdout):
            record_callback(record)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:  # we are the child process
            try:
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            except OSError as ex:
                # use _exit() so we don't do cleanup, etc. that's
                # the parent process's job
                os._exit(ex.errno)
            finally:
                # if we get some other exception, still exit hard
                os._exit(-1)
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                step_interpretation = _parse_spark_log(
                    _eio_to_eof(master),
                    record_callback=record_callback)

                _, returncode = os.waitpid(pid, 0)

    return (returncode, step_interpretation)
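For context, here is a hypothetical caller of this revised `_run_spark_submit()`, mirroring the failure handling in `_run_step_on_spark()` above; `spark_submit_args`, `env`, and `step_num` are assumed to be built as before, and the body is a sketch, not mrjob's actual method.

def _run_step_on_spark(self, step, step_num):
    # ... build spark_submit_args and env as in the earlier version ...
    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    # step_interpretation can later be mined for counters, errors, etc.
    if returncode:
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())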
def test_empty(self):
    self.assertEqual(list(_parse_hadoop_log4j_records([])), [])
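Finally, a hypothetical usage example for `_parse_hadoop_log4j_records()` itself; the filename and output format are illustrative only.

# Stream parsed records out of a saved log file (filename is made up).
with open('step-log.txt') as lines:
    for record in _parse_hadoop_log4j_records(lines):
        print('%s [%s] %s' % (
            record['timestamp'], record['level'], record['message']))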