def _run_step_on_spark(self, step, step_num):
    """Set up a fake working directory and environment, and call the
    Spark method."""
    # this is kind of a Spark-specific mash-up of _run_streaming_step()
    # (in sim.py) and _invoke_task_func(), above

    # don't create the output dir for the step; that's Spark's job

    # breaking the Spark step down into tasks is pyspark's job, so
    # we just have a single dummy task
    self.fs.mkdir(self._task_dir('spark', step_num, task_num=0))

    # could potentially parse this for cause of error
    stderr_path = self._task_stderr_path('spark', step_num, task_num=0)
    stdout_path = self._task_output_path('spark', step_num, task_num=0)

    self._create_dist_cache_dir(step_num)

    wd = self._setup_working_dir('spark', step_num, task_num=0)

    # use abspath() on input URIs before changing working dir
    task_args = self._spark_script_args(step_num)

    with open(stdout_path, 'wb') as stdout, \
            open(stderr_path, 'wb') as stderr:
        with save_current_environment(), save_cwd(), save_sys_path(), \
                save_sys_std():
            os.environ.update(_fix_env(self._opts['cmdenv']))
            os.chdir(wd)
            sys.path = [os.getcwd()] + sys.path

            # pretend we redirected stdout and stderr
            sys.stdout, sys.stderr = stdout, stderr

            task = self._mrjob_cls(task_args)
            task.execute()
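
# Illustrative sketch (not mrjob's actual helpers): the save/restore pattern
# implied by save_current_environment(), save_cwd(), save_sys_path(), and
# save_sys_std() above. Each context manager snapshots global state on entry
# and restores it on exit, so the in-process Spark run can't leak cmdenv,
# working-directory, sys.path, or stdout/stderr changes into the runner.
# The combined context manager below is a hypothetical stand-in.
import os
import sys
from contextlib import contextmanager

@contextmanager
def _save_process_state():
    old_environ = os.environ.copy()
    old_cwd = os.getcwd()
    old_sys_path = list(sys.path)
    old_std = (sys.stdout, sys.stderr)
    try:
        yield
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
        os.chdir(old_cwd)
        sys.path = old_sys_path
        sys.stdout, sys.stderr = old_std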
def _run_task_func(self, task_type, step_num, task_num, map_split=None):
    """Returns a no-args function that runs one mapper, reducer, or
    combiner.

    This sets up everything the task needs to run, then passes it off to
    :py:meth:`_invoke_task_func`.
    """
    input_path = self._task_input_path(task_type, step_num, task_num)
    stderr_path = self._task_stderr_path(task_type, step_num, task_num)
    output_path = self._task_output_path(task_type, step_num, task_num)

    wd = self._setup_working_dir(task_type, step_num, task_num)

    env = _fix_env(
        self._env_for_task(task_type, step_num, task_num, map_split))

    return partial(
        _run_task,
        self._invoke_task_func(task_type, step_num, task_num),
        task_type, step_num, task_num,
        input_path, output_path, stderr_path, wd, env)
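
# Minimal sketch of the partial() pattern used above, with a hypothetical
# task function rather than mrjob's _run_task: binding every argument up
# front yields a no-args callable that can be handed to a worker pool or
# called directly, deferring the actual work until invocation.
from functools import partial

def _demo_run_task(task_type, step_num, task_num):
    print('running %s, step %d, task %d' % (task_type, step_num, task_num))

run_mapper_0 = partial(_demo_run_task, 'mapper', 0, 0)
run_mapper_0()  # prints: running mapper, step 0, task 0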
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_type = step['type']
        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        if self._step_type_uses_spark(step_type):
            returncode, step_interpretation = self._run_spark_submit(
                step_args, env, record_callback=_log_log4j_record)
        else:
            returncode, step_interpretation = self._run_hadoop(
                step_args, env, record_callback=_log_record_from_hadoop)

        # make sure output_dir is filled (used for history log)
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        self._log_counters(log_interpretation, step_num)

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                _log_probable_cause_of_failure(log, error)

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
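
# Quick illustration of the "well-known message format" reused above as the
# StepFailedException reason: str() of a CalledProcessError produces the
# familiar subprocess error text (command list shown; example args only).
from subprocess import CalledProcessError

reason = str(CalledProcessError(1, ['hadoop', 'jar', 'streaming.jar']))
# -> "Command '['hadoop', 'jar', 'streaming.jar']' returned non-zero exit status 1."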
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug('  with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_driver(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:
                # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        self._log_counters(log_interpretation, step_num)

        step_type = step['type']

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
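
# Illustrative sketch (not mrjob code) of the same "PTY if available, else
# Popen" pattern used by _run_job_in_hadoop() above, applied to an arbitrary
# command. Assumes a Unix host for the pty branch; the Popen fallback works
# anywhere. Returns (returncode, combined output bytes); the function name
# and return convention are assumptions for the sake of the example.
import os
import pty
from subprocess import Popen, PIPE

def run_with_pty_fallback(args, env=None):
    env = dict(os.environ, **(env or {}))
    try:
        pid, master_fd = pty.fork()
    except (AttributeError, OSError):
        # no PTY support; fall back to plain pipes
        proc = Popen(args, stdout=PIPE, stderr=PIPE, env=env)
        out, err = proc.communicate()
        return proc.returncode, out + err

    if pid == 0:
        # child: become the command; its stdout/stderr go to the PTY
        os.execvpe(args[0], args, env)

    # parent: the master side of the PTY interleaves stdout and stderr
    chunks = []
    with os.fdopen(master_fd, 'rb') as master:
        try:
            for chunk in master:
                chunks.append(chunk)
        except OSError:
            pass  # reading the master raises EIO once the child exits
    _, status = os.waitpid(pid, 0)
    return os.WEXITSTATUS(status), b''.join(chunks)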