def test_container_to_attempt_id(self):
    """History's container->attempt mapping should let task-log errors
    (keyed by container_id) merge with history errors (keyed by attempt)."""
    container_id = "container_1449525218032_0005_01_000010"
    attempt_id = "attempt_1449525218032_0005_m_000000_3"
    task_id = _attempt_id_to_task_id(attempt_id)

    log_interpretation = {
        'history': {
            'container_to_attempt_id': {container_id: attempt_id},
            'errors': [{
                'attempt_id': attempt_id,
                'hadoop_error': {'message': "SwordsMischiefException"},
                'task_id': task_id,
            }],
        },
        'task': {
            'errors': [{
                'container_id': container_id,
                'hadoop_error': {'message': "SwordsMischiefException"},
                'task_error': {'message': "en garde!"},
            }],
        },
    }

    # the picked error should carry fields from both log types
    expected = {
        'attempt_id': attempt_id,
        'container_id': container_id,
        'hadoop_error': {'message': "SwordsMischiefException"},
        'task_error': {'message': "en garde!"},
        'task_id': task_id,
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_task_error_beats_timestamp(self):
    """An error that includes a task_error should win, even over an
    error from a more recent container."""
    log_interpretation = {
        'history': {
            'errors': [
                {
                    'container_id': 'container_1450486922681_0005_01_000003',
                    'hadoop_error': {'message': 'BOOM'},
                    'task_error': {'message': 'things exploding'},
                },
                {
                    # later timestamp, but no task_error
                    'container_id': 'container_1450489999999_0005_01_000004',
                    'hadoop_error': {'message': 'elephant problems'},
                },
            ],
        },
    }

    expected = {
        'container_id': 'container_1450486922681_0005_01_000003',
        'hadoop_error': {'message': 'BOOM'},
        'task_error': {'message': 'things exploding'},
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_merge_order(self):
    """Fields from task logs should win when the same error appears in
    several log types."""
    # task logs usually have the best info and should be merged last
    cid = "container_1450486922681_0005_01_000004"

    log_interpretation = {
        'step': {
            'errors': [
                {'container_id': cid, 'hadoop_error': {'message': "BOOM"}},
            ],
        },
        'history': {
            'errors': [{
                'container_id': cid,
                'hadoop_error': {'message': "BOOM", 'path': "history.jhist"},
                'split': {'path': "snake_facts.txt"},
            }],
        },
        'task': {
            'errors': [{
                'container_id': cid,
                'hadoop_error': {'message': "BOOM", 'path': "some_syslog"},
                'task_error': {
                    'message': "exploding snakes, now?!",
                    'path': "some_stderr",
                },
            }],
        },
    }

    # hadoop_error comes from task logs; split survives from history
    expected = {
        'container_id': cid,
        'hadoop_error': {'message': "BOOM", 'path': "some_syslog"},
        'split': {'path': "snake_facts.txt"},
        'task_error': {
            'message': "exploding snakes, now?!",
            'path': "some_stderr",
        },
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_spark_error_beats_task_error(self):
    """A spark_error should be picked over a plain task_error."""
    spark_err = {
        'container_id': 'container_1450486922681_0001_01_000001',
        'spark_error': {
            'message': _MULTI_LINE_ERROR[37:],
            'start_line': 1,
            'num_lines': 10,
        },
    }
    task_err = {
        'container_id': 'container_1450486922681_0005_01_000004',
        'task_error': {
            'message': 'exploding snakes, now?!',
            'path': 'some_stderr',
        },
    }

    log_interpretation = {
        'task': {
            'application_id': 'application_1566607039137_0001',
            'errors': [spark_err, task_err],
        },
    }

    expected = {
        'container_id': 'container_1450486922681_0001_01_000001',
        'spark_error': {
            'message': _MULTI_LINE_ERROR[37:],
            'start_line': 1,
            'num_lines': 10,
        },
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def _pick_error(self, log_interpretation):
    """Pick probable cause of failure (only call this if job fails).

    Scans step, history, and task logs (once) if they have not already
    been interpreted, then delegates to the module-level
    :py:func:`_pick_error`.
    """
    # Bug fix: the guard previously tested for a 'job' key, which none
    # of the _interpret_*() helpers below sets, so the (expensive) log
    # scan re-ran on every call. Test for the three log types these
    # helpers actually fill in, matching the other _pick_error() method
    # variant in this codebase.
    if not all(log_type in log_interpretation
               for log_type in ('step', 'history', 'task')):
        log.info('Scanning logs for probable cause of failure...')
        self._interpret_step_logs(log_interpretation)
        self._interpret_history_log(log_interpretation)
        self._interpret_task_logs(log_interpretation)

    return _pick_error(log_interpretation)
def test_merge_order(self):
    """Verify the merge precedence of step, history, and task errors."""
    # task logs usually have the best info and should be merged last
    container_id = 'container_1450486922681_0005_01_000004'

    step_error = {
        'container_id': container_id,
        'hadoop_error': {'message': 'BOOM'},
    }
    history_error = {
        'container_id': container_id,
        'hadoop_error': {'message': 'BOOM', 'path': 'history.jhist'},
        'split': {'path': 'snake_facts.txt'},
    }
    task_error = {
        'container_id': container_id,
        'hadoop_error': {'message': 'BOOM', 'path': 'some_syslog'},
        'task_error': {
            'message': 'exploding snakes, now?!',
            'path': 'some_stderr',
        },
    }

    log_interpretation = {
        'step': {'errors': [step_error]},
        'history': {'errors': [history_error]},
        'task': {'errors': [task_error]},
    }

    # task logs' hadoop_error wins; history contributes the split
    self.assertEqual(
        _pick_error(log_interpretation),
        {
            'container_id': container_id,
            'hadoop_error': {'message': 'BOOM', 'path': 'some_syslog'},
            'split': {'path': 'snake_facts.txt'},
            'task_error': {
                'message': 'exploding snakes, now?!',
                'path': 'some_stderr',
            },
        },
    )
def _run_step_on_spark(self, step, step_num, last_step_num=None):
    """Run one (possibly multi-step) job via spark-submit, collect
    counters for streaming steps, and raise StepFailedException with a
    probable cause of failure on nonzero exit.

    :param step: step description dict; only ``step['type']`` is read here
    :param step_num: 0-indexed step number
    :param last_step_num: 0-indexed last step when several steps are
        fused into one spark-submit run (``None`` means just *step_num*)
    """
    # archives presumably only work on YARN; warn (but still run)
    # on other masters
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

    # run spark-submit with our env vars layered over the current ones
    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    counters = None
    if step['type'] == 'streaming':
        # streaming steps write counters as JSON part files
        counter_file = self.fs.join(
            self._counter_output_dir(step_num), 'part-*')
        counter_json = b''.join(self.fs.cat(counter_file))
        if counter_json.strip():
            # json.loads() on Python 3.4/3.5 can't take bytes
            counters = json.loads(to_unicode(counter_json))

    if isinstance(counters, list):
        self._counters.extend(counters)

        # desc_num is 1-indexed user-readable step num
        for desc_num, counter_dict in enumerate(
                counters, start=(step_num + 1)):
            if counter_dict:
                log.info(_format_counters(
                    counter_dict,
                    desc=('Counters for step %d' % desc_num)))

    # for non-streaming steps, there are no counters.
    # pad self._counters to match number of steps
    while len(self._counters) < (last_step_num or step_num) + 1:
        self._counters.append({})

    if returncode:
        # try to surface the probable cause before failing the step
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            last_step_num=last_step_num, num_steps=self._num_steps())
def _pick_error(self, log_interpretation, step_type):
    """Pick probable cause of failure (only call this if job fails)."""
    needed_log_types = ('step', 'history', 'task')

    if self._read_logs() and not all(
            lt in log_interpretation for lt in needed_log_types):
        log.info('Scanning logs for probable cause of failure...')

        # step and history logs first; they tell us which attempts failed
        self._interpret_step_logs(log_interpretation, step_type)
        self._interpret_history_log(log_interpretation)

        # then pull task logs only for the failed attempts
        error_attempt_ids = _pick_error_attempt_ids(log_interpretation)
        self._interpret_task_logs(
            log_interpretation, step_type, error_attempt_ids)

    return _pick_error(log_interpretation)
def test_pick_most_recent_error(self):
    """Given two otherwise-equal errors, the later container wins."""
    older = {
        'container_id': 'container_1450486922681_0005_01_000003',
        'hadoop_error': {'message': 'BOOM'},
    }
    newer = {
        'container_id': 'container_1450486922681_0005_01_000004',
        'hadoop_error': {'message': 'elephant problems'},
    }
    log_interpretation = {'history': {'errors': [older, newer]}}

    self.assertEqual(
        _pick_error(log_interpretation),
        {
            'container_id': 'container_1450486922681_0005_01_000004',
            'hadoop_error': {'message': 'elephant problems'},
        },
    )
def test_timestamp_beats_task_error(self):
    """A more recent container's error wins even when an older error
    carries a task_error."""
    log_interpretation = {
        'history': {
            'errors': [
                {
                    'container_id': 'container_1450486922681_0005_01_000003',
                    'hadoop_error': {'message': 'BOOM'},
                    'task_error': {'message': 'things exploding'},
                },
                {
                    'container_id': 'container_1450489999999_0005_01_000004',
                    'hadoop_error': {'message': 'elephant problems'},
                },
            ],
        },
    }

    expected = {
        'container_id': 'container_1450489999999_0005_01_000004',
        'hadoop_error': {'message': 'elephant problems'},
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_pick_most_recent_error(self):
    """The most recent container's error should be picked."""
    first_error = {
        'container_id': "container_1450486922681_0005_01_000003",
        'hadoop_error': {'message': "BOOM"},
        'task_error': {'message': "things exploding"},
    }
    second_error = {
        'container_id': "container_1450486922681_0005_01_000004",
        'hadoop_error': {'message': "elephant problems"},
    }

    log_interpretation = {
        'history': {'errors': [first_error, second_error]},
    }

    expected = {
        'container_id': "container_1450486922681_0005_01_000004",
        'hadoop_error': {'message': "elephant problems"},
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_pick_shortest_spark_error(self):
    """Of two spark errors, the one spanning fewer lines should win."""
    short_error = {
        'spark_error': {
            'message': _MULTI_LINE_ERROR[37:],
            'start_line': 1,
            'num_lines': 10,
        },
    }
    long_error = {
        'spark_error': {
            'message': _MULTI_LINE_WARNING[180:],
            'start_line': 12,
            'num_lines': 13,
        },
    }

    log_interpretation = {'step': {'errors': [short_error, long_error]}}

    expected = {
        'spark_error': {
            'message': _MULTI_LINE_ERROR[37:],
            'start_line': 1,
            'num_lines': 10,
        },
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_multiline_spark_error_beats_single_line(self):
    """A multi-line spark error should be preferred over a one-liner."""
    single_line = {
        'spark_error': {
            'message': _SINGLE_LINE_ERROR[49:],
            'start_line': 0,
            'num_lines': 1,
        },
    }
    multi_line = {
        'spark_error': {
            'message': _MULTI_LINE_WARNING[180:],
            'start_line': 12,
            'num_lines': 13,
        },
    }

    log_interpretation = {'step': {'errors': [single_line, multi_line]}}

    expected = {
        'spark_error': {
            'message': _MULTI_LINE_WARNING[180:],
            'start_line': 12,
            'num_lines': 13,
        },
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def _run_step_on_spark(self, step, step_num):
    """Run a single step via spark-submit.

    :param step: step description dict (unused here beyond dispatch)
    :param step_num: 0-indexed step number

    Raises :py:class:`StepFailedException` on nonzero exit, after
    logging the probable cause of failure gleaned from step logs.
    """
    # Bug fix: previously we warned about archives being ignored no
    # matter what the Spark master was, even on YARN where archives
    # are supported. Match the other _run_step_on_spark() variant in
    # this codebase and only warn for non-YARN masters.
    if self._opts['upload_archives'] and self._spark_master() != 'yarn':
        log.warning('Spark master %r will probably ignore archives' %
                    self._spark_master())

    spark_submit_args = self._args_for_spark_step(step_num)

    # run spark-submit with our env vars layered over the current ones
    env = dict(os.environ)
    env.update(self._spark_cmdenv(step_num))

    returncode, step_interpretation = self._run_spark_submit(
        spark_submit_args, env, record_callback=_log_log4j_record)

    if returncode:
        # try to surface the probable cause before failing the step
        error = _pick_error(dict(step=step_interpretation))
        if error:
            _log_probable_cause_of_failure(log, error)

        reason = str(CalledProcessError(returncode, spark_submit_args))
        raise StepFailedException(
            reason=reason, step_num=step_num,
            num_steps=self._num_steps())
def test_can_get_spark_errors_from_task_logs(self):
    """Spark errors found in task logs should be picked up."""
    log_interpretation = {
        'task': {
            'application_id': 'application_1566607039137_0001',
            'errors': [{
                'container_id': 'container_1450486922681_0005_01_000004',
                'spark_error': {
                    'message': _MULTI_LINE_ERROR[37:],
                    'start_line': 1,
                    'num_lines': 10,
                },
            }],
        },
    }

    expected = {
        'container_id': 'container_1450486922681_0005_01_000004',
        'spark_error': {
            'message': _MULTI_LINE_ERROR[37:],
            'start_line': 1,
            'num_lines': 10,
        },
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def test_container_to_attempt_id(self):
    """Errors keyed by container_id (task logs) and by attempt_id
    (history) should merge via history's container->attempt map."""
    container_id = 'container_1449525218032_0005_01_000010'
    attempt_id = 'attempt_1449525218032_0005_m_000000_3'
    task_id = _attempt_id_to_task_id(attempt_id)

    history_error = {
        'attempt_id': attempt_id,
        'hadoop_error': {'message': 'SwordsMischiefException'},
        'task_id': task_id,
    }
    task_log_error = {
        'container_id': container_id,
        'hadoop_error': {'message': 'SwordsMischiefException'},
        'task_error': {'message': 'en garde!'},
    }

    log_interpretation = {
        'history': {
            'container_to_attempt_id': {container_id: attempt_id},
            'errors': [history_error],
        },
        'task': {'errors': [task_log_error]},
    }

    self.assertEqual(
        _pick_error(log_interpretation),
        {
            'attempt_id': attempt_id,
            'container_id': container_id,
            'hadoop_error': {'message': 'SwordsMischiefException'},
            'task_error': {'message': 'en garde!'},
            'task_id': task_id,
        },
    )
def test_pick_most_recent_error(self):
    """With two plain hadoop errors, the later container's wins."""
    errors = [
        {
            'container_id': 'container_1450486922681_0005_01_000003',
            'hadoop_error': {'message': 'BOOM'},
        },
        {
            'container_id': 'container_1450486922681_0005_01_000004',
            'hadoop_error': {'message': 'elephant problems'},
        },
    ]
    log_interpretation = {'history': {'errors': errors}}

    expected = {
        'container_id': 'container_1450486922681_0005_01_000004',
        'hadoop_error': {'message': 'elephant problems'},
    }
    self.assertEqual(_pick_error(log_interpretation), expected)
def _pick_error(self, log_interpretation):
    """Find probable cause of failure, and return it."""
    # fill in history and task log interpretations before delegating
    # to the module-level _pick_error()
    for interpret in (self._interpret_history_log,
                      self._interpret_task_logs):
        interpret(log_interpretation)

    return _pick_error(log_interpretation)
def test_empty(self):
    """No logs, or logs without errors, should yield no probable cause."""
    # make sure we can handle log interpretations without error
    for log_interpretation in ({}, {'history': {}}):
        self.assertEqual(_pick_error(log_interpretation), None)