def _log_cause_of_error(self, ex):
    """Try to surface the probable cause of a failed task by parsing the
    task's stderr log, falling back to a generic error message when no
    cause can be extracted.

    :param ex: the exception raised by the failed step; only
               :py:class:`_TaskFailedException` is handled here.
    """
    # errors raised inside mrjob itself bubble up to the top level as
    # ordinary stack traces; we only dig into logs for task failures
    if not isinstance(ex, _TaskFailedException):
        return

    # not using LogInterpretationMixin because it would be overkill
    if not self._opts['read_logs']:
        return

    task_key = (ex.task_type, ex.step_num, ex.task_num)
    input_path = self._task_input_path(*task_key)
    stderr_path = self._task_stderr_path(*task_key)

    # stderr_path should exist, but check just to be safe
    if self.fs.exists(stderr_path):
        # log-parsing code expects "str", not bytes; open in text mode
        with open(stderr_path) as stderr:
            parsed_error = _parse_task_stderr(stderr)

        if parsed_error:
            parsed_error['path'] = stderr_path
            _log_probable_cause_of_failure(
                log,
                dict(split=dict(path=input_path), task_error=parsed_error))
            return

    # fallback if we can't find the error (e.g. the job does something
    # weird to stderr or stack traces)
    log.error('Error while reading from %s:\n' % input_path)
def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
    """Poll the Dataproc job with the given ID until it completes,
    logging progress and fetching counters along the way.

    Appends a fresh log interpretation dict to
    ``self._log_interpretations``, and keeps it updated on every poll.

    :param job_id: Dataproc job ID to wait on
    :param step_num: 0-indexed step number (used for counters/errors)
    :param num_steps: total number of steps (for error reporting)

    Raises :py:class:`StepFailedException` if the job ends in any
    terminal state other than ``DONE``.
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)
    log_interpretation['step'] = {}
    step_type = self._get_step(step_num)['type']

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus # noqa
        job = self._get_job(job_id)

        # convert the enum value to its symbolic name (e.g. 'RUNNING')
        job_state = job.status.State.Name(job.status.state)

        log.info('%s => %s' % (job_id, job_state))

        # refreshed on every poll in case the driver output URI appears late
        log_interpretation['step']['driver_output_uri'] = (
            job.driver_output_resource_uri)

        self._interpret_step_logs(log_interpretation, step_type)

        progress = log_interpretation['step'].get('progress')
        if progress:
            log.info(' ' + progress['message'])

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # job has reached a terminal state from here on

        # print counters if job wasn't CANCELLED
        if job_state != 'CANCELLED':
            self._log_counters(log_interpretation, step_num)

        if job_state == 'ERROR':
            error = self._pick_error(log_interpretation, step_type)
            if error:
                _log_probable_cause_of_failure(log, error)

        # we're done, will return at the end of this
        if job_state == 'DONE':
            break
        else:
            raise StepFailedException(
                step_num=step_num, num_steps=num_steps)
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    """Run one step (or a run of contiguous steps) via spark-submit,
    collecting counters for streaming steps.

    :param step: step dict (``step['type']`` decides counter handling)
    :param step_num: 0-indexed number of the (first) step
    :param last_step_num: 0-indexed number of the last step in the run,
                          or ``None`` for a single step

    Raises :py:class:`StepFailedException` if spark-submit exits
    nonzero, after logging the probable cause of failure if one can be
    found.
    """
    # archives are only expected to work on YARN; warn otherwise
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    # spark-submit inherits our environment plus step-specific cmdenv
    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        # streaming steps write counters as JSON to the counter
        # output dir, one part file per task
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(
                    _format_counters(
                        counter_dict,
                        desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        # use CalledProcessError's well-known message format
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num, num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    """Run every step of the job on the Hadoop cluster, in order.

    Each step appends a log interpretation dict to
    ``self._log_interpretations`` for later error/counter analysis.

    Raises :py:class:`StepFailedException` as soon as any step's
    subprocess exits nonzero.
    """
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_type = step['type']
        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' % (
            step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # Spark-based steps and plain Hadoop steps are launched by
        # different helpers but both yield (returncode, interpretation)
        if self._step_type_uses_spark(step_type):
            returncode, step_interpretation = self._run_spark_submit(
                step_args, env, record_callback=_log_log4j_record)
        else:
            returncode, step_interpretation = self._run_hadoop(
                step_args, env,
                record_callback=_log_record_from_hadoop)

        # make sure output_dir is filled (used for history log)
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation

        self._log_counters(log_interpretation, step_num)

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                _log_probable_cause_of_failure(log, error)

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
def _run_step_on_spark(self, step, step_num):
    """Run one step via spark-submit.

    :param step: the step dict (not read directly by this method;
                 kept for interface parity with other step runners)
    :param step_num: 0-indexed step number

    Raises :py:class:`StepFailedException` if spark-submit exits
    nonzero, after logging the probable cause of failure if one can
    be found.
    """
    # Bug fix: only warn when the master isn't YARN. Previously this
    # warned whenever upload_archives was set, even on YARN — where
    # archives ARE supported — and the sibling Spark step runner in
    # this codebase already uses this exact guard.
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num)

    # spark-submit inherits our environment plus step-specific cmdenv
    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    if returncode:
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        # use CalledProcessError's well-known message format
        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())