def _run_hadoop(self, hadoop_args, env, record_callback):
    """Run the hadoop binary in a subprocess, using a PTY if possible.

    :param hadoop_args: hadoop binary and arguments, as a list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument

    :return: tuple of the subprocess's return code and a step
             interpretation dictionary
    """
    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # Hadoop is running
        log.debug('No PTY available, using Popen() to invoke Hadoop')

        step_proc = Popen(hadoop_args, stdout=PIPE, stderr=PIPE, env=env)

        step_interpretation = _interpret_hadoop_jar_command_stderr(
            step_proc.stderr,
            record_callback=record_callback)

        # there shouldn't be much output to STDOUT
        for line in step_proc.stdout:
            _log_line_from_driver(to_unicode(line).strip('\r\n'))

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:
            # we are the child process
            try:
                os.execvpe(hadoop_args[0], hadoop_args, env)
                # now we are no longer Python
            except OSError as ex:
                # use _exit() so we don't do cleanup, etc. that's
                # the parent process's job
                os._exit(ex.errno)
            finally:
                # if we got some other exception, still exit hard
                os._exit(-1)
        else:
            log.debug('Invoking Hadoop via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                # reading from master gives us the subprocess's
                # stderr and stdout (it's a fake terminal)
                step_interpretation = (
                    _interpret_hadoop_jar_command_stderr(
                        _eio_to_eof(master),
                        record_callback=record_callback))
                # waitpid() gives us the raw wait status; any nonzero
                # value means failure
                _, returncode = os.waitpid(pid, 0)

    return returncode, step_interpretation
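

# Illustrative sketch (not part of mrjob): the PTY pattern both methods in
# this section rely on, distilled into a standalone helper. run_via_pty()
# is a hypothetical name, and the EIO handling below shows the job that
# _eio_to_eof() is assumed to do: on Linux, reading a PTY master whose
# child has exited raises OSError(EIO) instead of returning b''.

import errno  # os and pty are assumed to be imported at the top of the file


def run_via_pty(args, env):
    """Run *args* attached to a pseudo-terminal; return the raw wait
    status and the child's combined stdout/stderr as bytes."""
    pid, master_fd = pty.fork()

    if pid == 0:
        # child: replace this process with the target program. Use
        # _exit() so we skip Python cleanup; that's the parent's job
        try:
            os.execvpe(args[0], args, env)
        except OSError as ex:
            os._exit(ex.errno)
        finally:
            os._exit(-1)

    # parent: master_fd is our end of the fake terminal; the child's
    # stdout and stderr arrive here interleaved, as on a real console
    chunks = []
    try:
        while True:
            try:
                chunk = os.read(master_fd, 1024)
            except OSError as ex:
                if ex.errno == errno.EIO:  # child hung up; treat as EOF
                    break
                raise
            if not chunk:  # some platforms return b'' instead of EIO
                break
            chunks.append(chunk)
    finally:
        os.close(master_fd)

    _, status = os.waitpid(pid, 0)
    return status, b''.join(chunks)

# e.g. status, output = run_via_pty(['echo', 'hi'], dict(os.environ))
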
def _run_spark_submit(self, spark_submit_args, env, record_callback):
    """Run the spark-submit binary in a subprocess, using a PTY
    if possible.

    :param spark_submit_args: spark-submit binary and arguments,
                              as a list
    :param env: environment variables, as a dict
    :param record_callback: a function that takes a single log4j record
                            as its argument (see
                            :py:func:`~mrjob.logs.log4j._parse_hadoop_log4j_records`)

    :return: tuple of the subprocess's return code and a step
             interpretation dictionary
    """
    log.debug('> %s' % cmd_line(spark_submit_args))
    log.debug('  with environment: %r' % sorted(env.items()))

    # these should always be set, but just in case
    returncode = 0
    step_interpretation = {}

    # try to use a PTY if it's available
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTYs, just use Popen

        # user won't get much feedback for a while, so tell them
        # spark-submit is running
        log.debug('No PTY available, using Popen() to invoke spark-submit')

        step_proc = Popen(
            spark_submit_args, stdout=PIPE, stderr=PIPE, env=env)

        # parse driver output
        step_interpretation = _parse_spark_log(
            step_proc.stderr, record_callback=record_callback)

        # there shouldn't be much output on STDOUT, just echo it
        for record in _parse_hadoop_log4j_records(step_proc.stdout):
            record_callback(record)

        step_proc.stdout.close()
        step_proc.stderr.close()

        returncode = step_proc.wait()
    else:
        # we have PTYs
        if pid == 0:
            # we are the child process
            try:
                os.execvpe(spark_submit_args[0], spark_submit_args, env)
                # now this process is no longer Python
            except OSError as ex:
                # use _exit() so we don't do cleanup, etc. that's
                # the parent process's job
                os._exit(ex.errno)
            finally:
                # if we get some other exception, still exit hard
                os._exit(-1)
        else:
            log.debug('Invoking spark-submit via PTY')

            with os.fdopen(master_fd, 'rb') as master:
                # reading from master gives us the subprocess's
                # stderr and stdout (it's a fake terminal)
                step_interpretation = _parse_spark_log(
                    _eio_to_eof(master), record_callback=record_callback)
                # waitpid() gives us the raw wait status; any nonzero
                # value means failure
                _, returncode = os.waitpid(pid, 0)

    return (returncode, step_interpretation)
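

# Illustrative sketch (not part of mrjob): the shape of the log4j records
# that record_callback receives. parse_log4j_records() and
# _SIMPLE_LOG4J_RE are hypothetical names; this assumes log4j's default
# console layout ("yy/MM/dd HH:mm:ss LEVEL logger: message"), while the
# real parser in mrjob.logs.log4j is more forgiving about format.

import re

_SIMPLE_LOG4J_RE = re.compile(
    r'^(?P<timestamp>\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})'
    r' +(?P<level>[A-Z]+)'
    r' +(?P<logger>\S+): '
    r'(?P<message>.*)$')


def parse_log4j_records(lines):
    """Yield one dict per log4j record, folding continuation lines
    (e.g. Java stack traces) into the previous record's message."""
    record = None

    for line in lines:
        line = line.rstrip('\r\n')
        m = _SIMPLE_LOG4J_RE.match(line)
        if m:
            if record:
                yield record
            record = m.groupdict()
        elif record:
            record['message'] += '\n' + line  # continuation line
        else:
            # output that precedes the first timestamped record
            record = dict(timestamp='', level='', logger='', message=line)

    if record:
        yield record

# e.g. '18/04/17 22:06:15 INFO SparkContext: Running Spark version 2.3.0'
# parses to {'timestamp': '18/04/17 22:06:15', 'level': 'INFO',
#            'logger': 'SparkContext',
#            'message': 'Running Spark version 2.3.0'}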