def test_hadoop_error(self):
    self.assertEqual(
        _format_error(dict(hadoop_error=dict(message="DevastatingJavaException"))),
        "DevastatingJavaException"
    )
    self.assertEqual(
        _format_error(
            dict(hadoop_error=dict(message="DevastatingJavaException",
                                   path="history.jhist"))
        ),
        "DevastatingJavaException\n\n(from history.jhist)",
    )
    self.assertEqual(
        _format_error(
            dict(
                hadoop_error=dict(
                    message="DevastatingJavaException",
                    path="history.jhist",
                    start_line=23,
                    num_lines=1
                )
            )
        ),
        "DevastatingJavaException\n\n(from line 24 of history.jhist)",
    )
    self.assertEqual(
        _format_error(
            dict(
                hadoop_error=dict(
                    message="DevastatingJavaException",
                    path="history.jhist",
                    start_line=23,
                    num_lines=3
                )
            )
        ),
        "DevastatingJavaException\n\n(from lines 24-26 of history.jhist)",
    )
def test_hadoop_error(self):
    self.assertEqual(
        _format_error(
            dict(hadoop_error=dict(message='DevastatingJavaException'))),
        'DevastatingJavaException')

    self.assertEqual(
        _format_error(
            dict(hadoop_error=dict(message='DevastatingJavaException',
                                   path='history.jhist'))),
        'DevastatingJavaException\n\n(from history.jhist)')

    self.assertEqual(
        _format_error(
            dict(hadoop_error=dict(message='DevastatingJavaException',
                                   path='history.jhist',
                                   start_line=23,
                                   num_lines=1))),
        'DevastatingJavaException\n\n(from line 24 of history.jhist)')

    self.assertEqual(
        _format_error(
            dict(hadoop_error=dict(message='DevastatingJavaException',
                                   path='history.jhist',
                                   start_line=23,
                                   num_lines=3))),
        'DevastatingJavaException\n\n(from lines 24-26 of history.jhist)')
def test_hadoop_error(self):
    self.assertEqual(
        _format_error(dict(hadoop_error=dict(
            message='DevastatingJavaException')
        )),
        'DevastatingJavaException')

    self.assertEqual(
        _format_error(dict(hadoop_error=dict(
            message='DevastatingJavaException',
            path='history.jhist'
        ))),
        'DevastatingJavaException\n\n(from history.jhist)')

    self.assertEqual(
        _format_error(dict(hadoop_error=dict(
            message='DevastatingJavaException',
            path='history.jhist',
            start_line=23,
            num_lines=1
        ))),
        'DevastatingJavaException\n\n(from line 24 of history.jhist)')

    self.assertEqual(
        _format_error(dict(hadoop_error=dict(
            message='DevastatingJavaException',
            path='history.jhist',
            start_line=23,
            num_lines=3
        ))),
        'DevastatingJavaException\n\n(from lines 24-26 of history.jhist)')
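# The variants of test_hadoop_error above all pin down the same
# "(from line N of PATH)" / "(from lines N-M of PATH)" suffix. As a rough
# illustration of the arithmetic those expected strings imply (a sketch only,
# not mrjob's actual implementation), the suffix could be built like this,
# assuming start_line is 0-indexed and num_lines is an inclusive count:
def _describe_source_sketch(path=None, start_line=None, num_lines=None):
    if path is None:
        return ''
    if start_line is None or num_lines is None:
        return '\n\n(from %s)' % path
    first = start_line + 1           # 0-indexed -> 1-indexed display
    last = start_line + num_lines    # last display line, inclusive
    if num_lines == 1:
        return '\n\n(from line %d of %s)' % (first, path)
    else:
        return '\n\n(from lines %d-%d of %s)' % (first, last, path)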
def test_split(self):
    self.assertEqual(
        _format_error(
            dict(split=dict(path='very_troubling.log',
                            start_line=665,
                            num_lines=334))),
        '\n\nwhile reading input from lines 666-999 of very_troubling.log')
def test_split(self):
    self.assertEqual(
        _format_error(dict(split=dict(
            path='very_troubling.log',
            start_line=665,
            num_lines=334))),
        '\n\nwhile reading input from lines 666-999 of very_troubling.log')
def _log_cause_of_error(self, ex):
    if not isinstance(ex, _TaskFailedException):
        # if something went wrong inside mrjob, the stacktrace
        # will bubble up to the top level
        return

    # not using LogInterpretationMixin because it would be overkill

    input_path = self._task_input_path(
        ex.task_type, ex.step_num, ex.task_num)
    stderr_path = self._task_stderr_path(
        ex.task_type, ex.step_num, ex.task_num)

    if self.fs.exists(stderr_path):  # it should, but just to be safe
        # log-parsing code expects "str", not bytes; open in text mode
        with open(stderr_path) as stderr:
            task_error = _parse_task_stderr(stderr)

        if task_error:
            task_error['path'] = stderr_path
            log.error('Cause of failure:\n\n%s\n\n' % _format_error(dict(
                split=dict(path=input_path),
                task_error=task_error)))
            return

    # fallback if we can't find the error (e.g. the job does something
    # weird to stderr or stack traces)
    log.error('Error while reading from %s:\n' % input_path)
def _log_cause_of_error(self, ex):
    if not isinstance(ex, _TaskFailedException):
        # if something went wrong inside mrjob, the stacktrace
        # will bubble up to the top level
        return

    # not using LogInterpretationMixin because it would be overkill

    if not self._opts['read_logs']:
        return

    input_path = self._task_input_path(
        ex.task_type, ex.step_num, ex.task_num)
    stderr_path = self._task_stderr_path(
        ex.task_type, ex.step_num, ex.task_num)

    if self.fs.exists(stderr_path):  # it should, but just to be safe
        # log-parsing code expects "str", not bytes; open in text mode
        with open(stderr_path) as stderr:
            task_error = _parse_task_stderr(stderr)

        if task_error:
            task_error['path'] = stderr_path
            log.error('Cause of failure:\n\n%s\n\n' % _format_error(dict(
                split=dict(path=input_path),
                task_error=task_error)))
            return

    # fallback if we can't find the error (e.g. the job does something
    # weird to stderr or stack traces)
    log.error('Error while reading from %s:\n' % input_path)
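# For illustration only: the task_error handed to _format_error() above is a
# plain dict. Based on the keys used in this module and its tests (message,
# start_line, num_lines, plus the path attached by the caller), it might look
# roughly like the following. The traceback text and paths are made up, and
# the exact fields _parse_task_stderr() fills in are an assumption here.
task_error = dict(
    message=('Traceback (most recent call last):\n'
             '  File "mr_your_job.py", line 42, in mapper\n'
             'ValueError: bad input line'),
    start_line=0,   # 0-indexed first line of the error within the stderr file
    num_lines=3,    # how many lines the error spans
)
task_error['path'] = '/tmp/step/00000/task-stderr'  # attached by the caller

error = dict(split=dict(path='/tmp/step/00000/input'), task_error=task_error)
# _format_error(error) then yields the "Cause of failure:" text logged above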
def test_trim_spark_stacktrace(self):
    self.assertEqual(
        _format_error(
            dict(spark_error=dict(
                message=_MULTI_LINE_ERROR[37:],
                start_line=0,
                num_lines=10,
            ))),
        _MULTI_LINE_ERROR[37:423])
def test_task_error(self):
    self.assertEqual(
        _format_error(dict(task_error=dict(
            message='system will self-destruct in 5s'
        ))),
        'system will self-destruct in 5s')

    # everything uses the same code to format path + line range, so
    # don't worry about testing all the options each time
    self.assertEqual(
        _format_error(dict(task_error=dict(
            message='system will self-destruct in 5s',
            path='/path/to/stderr',
            start_line=0,
            num_lines=1))),
        'system will self-destruct in 5s'
        '\n\n(from line 1 of /path/to/stderr)')
def test_task_error(self):
    self.assertEqual(
        _format_error(
            dict(task_error=dict(
                message='system will self-destruct in 5s'))),
        'system will self-destruct in 5s')

    # everything uses the same code to format path + line range, so
    # don't worry about testing all the options each time
    self.assertEqual(
        _format_error(
            dict(task_error=dict(message='system will self-destruct in 5s',
                                 path='/path/to/stderr',
                                 start_line=0,
                                 num_lines=1))),
        'system will self-destruct in 5s'
        '\n\n(from line 1 of /path/to/stderr)')
def test_task_error(self):
    self.assertEqual(
        _format_error(
            dict(task_error=dict(message="system will self-destruct in 5s"))
        ),
        "\n\ncaused by:\n\n"
        "system will self-destruct in 5s",
    )

    # everything uses the same code to format path + line range, so
    # don't worry about testing all the options each time
    self.assertEqual(
        _format_error(
            dict(
                task_error=dict(
                    message="system will self-destruct in 5s",
                    path="/path/to/stderr",
                    start_line=0,
                    num_lines=1
                )
            )
        ),
        "\n\ncaused by:\n\n"
        "system will self-destruct in 5s"
        "\n\n(from line 1 of /path/to/stderr)",
    )
def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    log_interpretation['step'] = {}
    step_type = self._get_step(step_num)['type']

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job = self._get_job(job_id)

        job_state = job.status.State.Name(job.status.state)

        log.info('%s => %s' % (job_id, job_state))

        log_interpretation['step']['driver_output_uri'] = (
            job.driver_output_resource_uri)

        self._interpret_step_logs(log_interpretation, step_type)

        progress = log_interpretation['step'].get('progress')
        if progress:
            log.info(' ' + progress['message'])

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # print counters if job wasn't CANCELLED
        if job_state != 'CANCELLED':
            self._log_counters(log_interpretation, step_num)

        if job_state == 'ERROR':
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n\n' %
                          _format_error(error))

        # we're done, will return at the end of this
        if job_state == 'DONE':
            break
        else:
            raise StepFailedException(step_num=step_num, num_steps=num_steps)
def _wait_for_step_to_complete(self, job_id, step_num, num_steps):
    """Wait for the step with the given job ID to complete, and fetch
    counters. If it fails, attempt to diagnose the error, and raise an
    exception.

    This also adds an item to self._log_interpretations
    """
    log_interpretation = dict(job_id=job_id)
    self._log_interpretations.append(log_interpretation)

    log_interpretation['step'] = {}
    step_type = self._get_step(step_num)['type']

    while True:
        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#JobStatus  # noqa
        job = self._get_job(job_id)

        job_state = job.status.State.Name(job.status.state)

        log.info('%s => %s' % (job_id, job_state))

        log_interpretation['step']['driver_output_uri'] = (
            job.driver_output_resource_uri)

        self._interpret_step_logs(log_interpretation, step_type)

        progress = log_interpretation['step'].get('progress')
        if progress:
            log.info(' ' + progress['message'])

        # https://cloud.google.com/dataproc/reference/rest/v1/projects.regions.jobs#State  # noqa
        # these are the states covered by the ACTIVE job state matcher,
        # plus SETUP_DONE
        if job_state in (
                'PENDING', 'RUNNING', 'CANCEL_PENDING', 'SETUP_DONE'):
            self._wait_for_api('job completion')
            continue

        # print counters if job wasn't CANCELLED
        if job_state != 'CANCELLED':
            self._log_counters(log_interpretation, step_num)

        if job_state == 'ERROR':
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n\n' %
                          _format_error(error))

        # we're done, will return at the end of this
        if job_state == 'DONE':
            break
        else:
            raise StepFailedException(
                step_num=step_num, num_steps=num_steps)
def test_spark_error(self):
    self.assertEqual(
        _format_error(
            dict(spark_error=dict(
                message=(
                    'Task attempt_20190829211242_0004_m_000000_0 aborted.'),
                start_line=0,
                num_lines=1,
            ))),
        'Task attempt_20190829211242_0004_m_000000_0 aborted.')

    self.assertEqual(
        _format_error(
            dict(spark_error=dict(
                message=(
                    'Task attempt_20190829211242_0004_m_000000_0 aborted.'),
                start_line=0,
                num_lines=1,
                path='/path/to/log'))),
        ('Task attempt_20190829211242_0004_m_000000_0 aborted.'
         '\n\n(from line 1 of /path/to/log)'))
def test_spark_error_hides_other_errors(self):
    self.assertEqual(
        _format_error(
            dict(
                hadoop_error=dict(
                    message='DevastatingJavaException',
                    path='history.jhist',
                    start_line=23,
                    num_lines=1,
                ),
                spark_error=dict(
                    message='Aborting task',
                    start_line=99,
                    num_lines=1,
                ),
                task_error=dict(message='system will self-destruct in 5s'),
            )),
        'Aborting task')
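# A sketch (not mrjob's actual code) of the precedence the test above
# demonstrates: a Spark error, when present, becomes the whole message and the
# Hadoop/task errors are not shown; otherwise the Hadoop error is the primary
# message (with the task error appended as a "caused by" section, as in
# test_task_error above).
def _pick_primary_cause_sketch(error):
    if error.get('spark_error'):
        return error['spark_error']
    return error.get('hadoop_error')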
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = {
        k: v for k, v in options.__dict__.items()
        if k not in ('quiet', 'verbose', 'step_id')
    }

    runner = EMRJobRunner(**runner_kwargs)
    emr_client = runner.make_emr_client()

    # pick step
    step = _get_step(emr_client, options.cluster_id, options.step_id)

    if not step:
        raise SystemExit(1)

    if step['Status']['State'] != 'FAILED':
        log.warning('step %s has state %s, not FAILED' %
                    (step['Id'], step['Status']['State']))

    # interpret logs
    log.info('Diagnosing step %s (%s)' % (step['Id'], step['Name']))

    log_interpretation = dict(step_id=step['Id'])
    step_type = _infer_step_type(step)

    error = runner._pick_error(log_interpretation, step_type)

    # print error
    if error:
        log.error('Probable cause of failure:\n\n%s\n\n' %
                  _format_error(error))
    else:
        log.warning('No error detected')
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = {k: v for k, v in options.__dict__.items()
                     if k not in ('quiet', 'verbose', 'step_id')}

    runner = EMRJobRunner(**runner_kwargs)
    emr_client = runner.make_emr_client()

    # pick step
    step = _get_step(emr_client, options.cluster_id, options.step_id)

    if not step:
        raise SystemExit(1)

    if step['Status']['State'] != 'FAILED':
        log.warning('step %s has state %s, not FAILED' %
                    (step['Id'], step['Status']['State']))

    # interpret logs
    log.info('Diagnosing step %s (%s)' % (step['Id'], step['Name']))

    log_interpretation = dict(step_id=step['Id'])
    step_type = _infer_step_type(step)

    error = runner._pick_error(log_interpretation, step_type)

    # print error
    if error:
        log.error('Probable cause of failure:\n\n%s\n\n' %
                  _format_error(error))
    else:
        log.warning('No error detected')
def test_fall_back_to_json(self):
    self.assertEqual(_format_error([]), '[]')
def test_empty(self):
    self.assertEqual(_format_error({}), '')
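# Sketch of the contract the two tests above pin down (not mrjob's code):
# non-dict input falls back to a JSON dump, and a dict with no recognized
# sub-errors formats to the empty string.
import json

def _format_error_fallback_sketch(error):
    if not isinstance(error, dict):
        return json.dumps(error)   # e.g. [] -> '[]'
    return ''  # no hadoop_error / spark_error / task_error / split present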
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = self._env_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation
        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(reason=reason, step_num=step_num,
                                      num_steps=self._num_steps())
def _run_job_in_hadoop(self):
    for step_num in range(self._num_steps()):
        step_args = self._args_for_step(step_num)

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_string(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvp(step_args[0], step_args)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._hdfs_step_output_dir(step_num))

        log_interpretation['step'] = step_interpretation

        if 'counters' not in step_interpretation:
            log.info('Attempting to read counters from history log')
            self._interpret_history_log(log_interpretation)

        # just print counters for this one step
        self._print_counters(step_nums=[step_num])

        if returncode:
            error = self._pick_error(log_interpretation)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            raise CalledProcessError(returncode, step_args)
def _run_job_in_hadoop(self):
    for step_num, step in enumerate(self._get_steps()):
        self._warn_about_spark_archives(step)

        step_args = self._args_for_step(step_num)
        env = _fix_env(self._env_for_step(step_num))

        # log this *after* _args_for_step(), which can start a search
        # for the Hadoop streaming jar
        log.info('Running step %d of %d...' %
                 (step_num + 1, self._num_steps()))
        log.debug('> %s' % cmd_line(step_args))
        log.debug(' with environment: %r' % sorted(env.items()))

        log_interpretation = {}
        self._log_interpretations.append(log_interpretation)

        # try to use a PTY if it's available
        try:
            pid, master_fd = pty.fork()
        except (AttributeError, OSError):
            # no PTYs, just use Popen

            # user won't get much feedback for a while, so tell them
            # Hadoop is running
            log.debug('No PTY available, using Popen() to invoke Hadoop')

            step_proc = Popen(step_args, stdout=PIPE, stderr=PIPE, env=env)

            step_interpretation = _interpret_hadoop_jar_command_stderr(
                step_proc.stderr,
                record_callback=_log_record_from_hadoop)

            # there shouldn't be much output to STDOUT
            for line in step_proc.stdout:
                _log_line_from_hadoop(to_unicode(line).strip('\r\n'))

            step_proc.stdout.close()
            step_proc.stderr.close()

            returncode = step_proc.wait()
        else:
            # we have PTYs
            if pid == 0:  # we are the child process
                os.execvpe(step_args[0], step_args, env)
            else:
                log.debug('Invoking Hadoop via PTY')

                with os.fdopen(master_fd, 'rb') as master:
                    # reading from master gives us the subprocess's
                    # stderr and stdout (it's a fake terminal)
                    step_interpretation = (
                        _interpret_hadoop_jar_command_stderr(
                            master,
                            record_callback=_log_record_from_hadoop))
                    _, returncode = os.waitpid(pid, 0)

        # make sure output_dir is filled
        if 'output_dir' not in step_interpretation:
            step_interpretation['output_dir'] = (
                self._step_output_uri(step_num))

        log_interpretation['step'] = step_interpretation
        step_type = step['type']

        if not _is_spark_step_type(step_type):
            counters = self._pick_counters(log_interpretation, step_type)
            if counters:
                log.info(_format_counters(counters))
            else:
                log.warning('No counters found')

        if returncode:
            error = self._pick_error(log_interpretation, step_type)
            if error:
                log.error('Probable cause of failure:\n\n%s\n' %
                          _format_error(error))

            # use CalledProcessError's well-known message format
            reason = str(CalledProcessError(returncode, step_args))
            raise StepFailedException(
                reason=reason, step_num=step_num,
                num_steps=self._num_steps())
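# For reference, a minimal standalone example of the `reason` string built
# above via CalledProcessError's standard message format; the command line
# here is hypothetical.
from subprocess import CalledProcessError

reason = str(CalledProcessError(1, ['hadoop', 'jar', 'streaming.jar']))
print(reason)
# on Python 3 this prints something like:
# Command '['hadoop', 'jar', 'streaming.jar']' returned non-zero exit status 1.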