def _interpret_task_logs(self, log_interpretation, partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    if yarn:
        if not application_id:
            log.warning("Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            log.warning("Can't fetch task logs; missing job ID")
            return

    log_interpretation['task'] = _interpret_task_logs(
        self.fs,
        self._ls_task_syslogs(
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir),
        partial=partial,
        stderr_callback=_log_parsing_task_stderr)
def _interpret_task_logs(self, log_interpretation, partial=True):
    """Fetch task syslogs and stderr, and add 'task' to interpretation."""
    if 'task' in log_interpretation and (
            partial or not log_interpretation['task'].get('partial')):
        return  # already interpreted

    step_interpretation = log_interpretation.get('step') or {}

    application_id = step_interpretation.get('application_id')
    job_id = step_interpretation.get('job_id')
    output_dir = step_interpretation.get('output_dir')

    yarn = uses_yarn(self.get_hadoop_version())

    if yarn:
        if not application_id:
            # don't warn if we already know no job was started
            if not log_interpretation.get('no_job'):
                log.warning(
                    "Can't fetch task logs; missing application ID")
            return
    else:
        if not job_id:
            if not log_interpretation.get('no_job'):
                log.warning("Can't fetch task logs; missing job ID")
            return

    log_interpretation['task'] = _interpret_task_logs(
        self.fs,
        self._ls_task_syslogs(
            application_id=application_id,
            job_id=job_id,
            output_dir=output_dir),
        partial=partial,
        stderr_callback=_log_parsing_task_stderr)
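# Illustrative sketch (not from the source above): the early-return guard in
# both versions treats an existing interpretation as reusable unless a full
# scan (partial=False) is requested and the cached result was itself only a
# partial scan. The helper below is hypothetical and mirrors just that guard:
def _already_interpreted(log_interpretation, partial=True):
    return 'task' in log_interpretation and (
        partial or not log_interpretation['task'].get('partial'))

assert _already_interpreted({'task': {}}, partial=True)
assert _already_interpreted({'task': {'partial': True}}, partial=True)
assert not _already_interpreted({'task': {'partial': True}}, partial=False)
assert not _already_interpreted({}, partial=True)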
def _interpret_task_logs(self, log_interpretation): """Find and interpret the task logs, storing the interpretation in ``log_interpretation['task']``.""" if 'task' not in log_interpretation: # get job/application ID from output of hadoop command step_interpretation = log_interpretation.get('step') or {} application_id = step_interpretation.get('application_id') job_id = step_interpretation.get('job_id') yarn = uses_yarn(self.get_hadoop_version()) if yarn and application_id is None: log.warning("Can't fetch task logs without application ID") return {} elif not yarn and job_id is None: log.warning("Can't fetch task logs without job ID") return {} # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. def stream_task_log_dirs(): for log_dir in unique( self._hadoop_log_dirs( output_dir=step_interpretation.get('output_dir'))): if yarn: path = self.fs.join(log_dir, 'userlogs', application_id) else: # sometimes pre-YARN attempt logs are organized by # job_id, # sometimes not. Play it safe path = self.fs.join(log_dir, 'userlogs') if self.fs.exists(path): log.info('Scanning task syslogs in %s' % path) yield [path] # wrap _ls_task_syslogs() to add logging def ls_task_syslogs(): # there should be at most one history log for match in _ls_task_syslogs(self.fs, stream_task_log_dirs(), application_id=application_id, job_id=job_id): # TODO: this isn't really correct because # _interpret_task_logs() sorts the logs paths and # scans starting at the most recent one. Probably # should have _ls_task_syslogs() do the sorting. log.info(' Scanning for errors: %s' % match['path']) yield match log_interpretation['task'] = _interpret_task_logs( self.fs, ls_task_syslogs()) return log_interpretation['task']
def _interpret_task_logs(self, log_interpretation): """Find and interpret the task logs, storing the interpretation in ``log_interpretation['task']``.""" if 'task' not in log_interpretation: # get job/application ID from output of hadoop command step_interpretation = log_interpretation.get('step') or {} application_id = step_interpretation.get('application_id') job_id = step_interpretation.get('job_id') yarn = uses_yarn(self.get_hadoop_version()) if yarn and application_id is None: log.warning("Can't fetch task logs without application ID") return {} elif not yarn and job_id is None: log.warning("Can't fetch task logs without job ID") return {} # Note: this is unlikely to be super-helpful on "real" (multi-node) # pre-YARN Hadoop because task logs aren't generally shipped to a # local directory. It's a start, anyways. See #1201. def stream_task_log_dirs(): for log_dir in unique( self._hadoop_log_dirs( output_dir=step_interpretation.get('output_dir'))): if yarn: path = self.fs.join( log_dir, 'userlogs', application_id) else: # sometimes pre-YARN attempt logs are organized by # job_id, # sometimes not. Play it safe path = self.fs.join(log_dir, 'userlogs') if self.fs.exists(path): log.info('Scanning task syslogs in %s' % path) yield [path] # wrap _ls_task_syslogs() to add logging def ls_task_syslogs(): # there should be at most one history log for match in _ls_task_syslogs( self.fs, stream_task_log_dirs(), application_id=application_id, job_id=job_id): # TODO: this isn't really correct because # _interpret_task_logs() sorts the logs paths and # scans starting at the most recent one. Probably # should have _ls_task_syslogs() do the sorting. log.info(' Scanning for errors: %s' % match['path']) yield match log_interpretation['task'] = _interpret_task_logs( self.fs, ls_task_syslogs()) return log_interpretation['task']
def interpret_task_logs(self, **kwargs):
    return _interpret_task_logs(
        self.mock_fs, self.mock_path_matches(),
        log_callback=self.mock_log_callback, **kwargs)
def interpret_task_logs(self, **kwargs):
    return _interpret_task_logs(
        self.mock_fs, self.mock_path_matches(), **kwargs)
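# Illustrative sketch (hypothetical names, not the real test API): the
# wrappers above inject a mock filesystem and canned path matches so the log
# interpretation logic can be exercised without a real cluster. The same
# pattern in miniature:
class MockFS(object):
    def __init__(self, files):
        self._files = files  # map of path -> list of log lines

    def cat(self, path):
        return iter(self._files.get(path, []))

def interpret(fs, matches):
    # stand-in for _interpret_task_logs(): read each matched syslog
    return [line for m in matches for line in fs.cat(m['path'])]

fs = MockFS({'/userlogs/app_0001/syslog': ['ERROR: oops']})
print(interpret(fs, [{'path': '/userlogs/app_0001/syslog'}]))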