Example #1
    def test_container_to_attempt_id(self):
        container_id = "container_1449525218032_0005_01_000010"
        attempt_id = "attempt_1449525218032_0005_m_000000_3"
        task_id = _attempt_id_to_task_id(attempt_id)

        container_to_attempt_id = {container_id: attempt_id}

        log_interpretation = dict(
            history=dict(
                container_to_attempt_id=container_to_attempt_id,
                errors=[
                    dict(attempt_id=attempt_id, hadoop_error=dict(message="SwordsMischiefException"), task_id=task_id)
                ],
            ),
            task=dict(
                errors=[
                    dict(
                        container_id=container_id,
                        hadoop_error=dict(message="SwordsMischiefException"),
                        task_error=dict(message="en garde!"),
                    )
                ]
            ),
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                attempt_id=attempt_id,
                container_id=container_id,
                hadoop_error=dict(message="SwordsMischiefException"),
                task_error=dict(message="en garde!"),
                task_id=task_id,
            ),
        )
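This test leans on _attempt_id_to_task_id() to derive the task ID from the attempt ID. As a rough sketch (assuming the standard Hadoop ID layout, not necessarily mrjob's exact implementation), the conversion just drops the trailing attempt number and swaps the prefix:

    def _attempt_id_to_task_id(attempt_id):
        # illustrative sketch:
        # 'attempt_1449525218032_0005_m_000000_3'
        #   -> 'task_1449525218032_0005_m_000000'
        return 'task_' + '_'.join(attempt_id.split('_')[1:-1])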
Example #2
    def test_task_error_beats_timestamp(self):
        log_interpretation = dict(
            history=dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000003',
                        hadoop_error=dict(message='BOOM'),
                        task_error=dict(message='things exploding'),
                    ),
                    dict(
                        container_id='container_1450489999999_0005_01_000004',
                        hadoop_error=dict(message='elephant problems'),
                    ),
                ],
            ),
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id='container_1450486922681_0005_01_000003',
                hadoop_error=dict(message='BOOM'),
                task_error=dict(message='things exploding'),
            ),
        )
Example #3
    def test_merge_order(self):
        # task logs usually have the best info and should be merged last
        log_interpretation = dict(
            step=dict(
                errors=[dict(container_id="container_1450486922681_0005_01_000004", hadoop_error=dict(message="BOOM"))]
            ),
            history=dict(
                errors=[
                    dict(
                        container_id="container_1450486922681_0005_01_000004",
                        hadoop_error=dict(message="BOOM", path="history.jhist"),
                        split=dict(path="snake_facts.txt"),
                    )
                ]
            ),
            task=dict(
                errors=[
                    dict(
                        container_id="container_1450486922681_0005_01_000004",
                        hadoop_error=dict(message="BOOM", path="some_syslog"),
                        task_error=dict(message="exploding snakes, now?!", path="some_stderr"),
                    )
                ]
            ),
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id="container_1450486922681_0005_01_000004",
                hadoop_error=dict(message="BOOM", path="some_syslog"),
                split=dict(path="snake_facts.txt"),
                task_error=dict(message="exploding snakes, now?!", path="some_stderr"),
            ),
        )
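The merge order matters because fields from sources merged later overwrite the same fields from earlier sources. A minimal sketch of that kind of shallow merge (a simplification for illustration, not mrjob's actual code):

    def _merge_errors(*errors):
        # later dicts (task logs) win field-by-field over earlier
        # ones (step, then history); fields only one source has survive
        merged = {}
        for error in errors:
            merged.update(error)
        return merged

Merging the step, history, and task errors in that order explains the expected result above: hadoop_error comes from the task syslog, while split survives from the history log.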
Example #4
    def test_spark_error_beats_task_error(self):
        log_interpretation = dict(task=dict(
            application_id='application_1566607039137_0001',
            errors=[
                dict(
                    container_id='container_1450486922681_0001_01_000001',
                    spark_error=dict(
                        message=_MULTI_LINE_ERROR[37:],
                        start_line=1,
                        num_lines=10,
                    ),
                ),
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    task_error=dict(
                        message='exploding snakes, now?!',
                        path='some_stderr',
                    ),
                ),
            ]))

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(container_id='container_1450486922681_0001_01_000001',
                 spark_error=dict(
                     message=_MULTI_LINE_ERROR[37:],
                     start_line=1,
                     num_lines=10,
                 )))
Example #5
    def _pick_error(self, log_interpretation):
        """Pick probable cause of failure (only call this if job fails)."""
        if not all(log_type in log_interpretation for
                   log_type in ('job', 'step', 'task')):
            log.info('Scanning logs for probable cause of failure...')
            self._interpret_step_logs(log_interpretation)
            self._interpret_history_log(log_interpretation)
            self._interpret_task_logs(log_interpretation)

        return _pick_error(log_interpretation)
Example #6
    def test_merge_order(self):
        # task logs usually have the best info and should be merged last
        log_interpretation = dict(
            step=dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(message='BOOM'),
                    ),
                ],
            ),
            history=dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(
                            message='BOOM',
                            path='history.jhist',
                        ),
                        split=dict(path='snake_facts.txt'),
                    ),
                ],
            ),
            task=dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(
                            message='BOOM',
                            path='some_syslog',
                        ),
                        task_error=dict(
                            message='exploding snakes, now?!',
                            path='some_stderr',
                        ),
                    ),
                ],
            )
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id='container_1450486922681_0005_01_000004',
                hadoop_error=dict(
                    message='BOOM',
                    path='some_syslog',
                ),
                split=dict(path='snake_facts.txt'),
                task_error=dict(
                    message='exploding snakes, now?!',
                    path='some_stderr',
                ),
            ),
        )
Example #7
    def _run_step_on_spark(self, step, step_num, last_step_num=None):
        if self._opts['upload_archives'] and self._spark_master() != 'yarn':
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num, last_step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        counters = None
        if step['type'] == 'streaming':
            counter_file = self.fs.join(self._counter_output_dir(step_num),
                                        'part-*')
            counter_json = b''.join(self.fs.cat(counter_file))
            if counter_json.strip():
                # json.loads() on Python 3.4/3.5 can't take bytes
                counters = json.loads(to_unicode(counter_json))

        if isinstance(counters, list):
            self._counters.extend(counters)

            # desc_num is 1-indexed user-readable step num
            for desc_num, counter_dict in enumerate(counters,
                                                    start=(step_num + 1)):
                if counter_dict:
                    log.info(
                        _format_counters(counter_dict,
                                         desc=('Counters for step %d' %
                                               desc_num)))

        # for non-streaming steps, there are no counters.
        # pad self._counters to match number of steps
        while len(self._counters) < (last_step_num or step_num) + 1:
            self._counters.append({})

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      last_step_num=last_step_num,
                                      num_steps=self._num_steps())
Example #8
    def _pick_error(self, log_interpretation, step_type):
        """Pick probable cause of failure (only call this if job fails)."""
        if self._read_logs() and not all(
                log_type in log_interpretation for
                log_type in ('step', 'history', 'task')):
            log.info('Scanning logs for probable cause of failure...')
            self._interpret_step_logs(log_interpretation, step_type)
            self._interpret_history_log(log_interpretation)

            error_attempt_ids = _pick_error_attempt_ids(log_interpretation)

            self._interpret_task_logs(
                log_interpretation, step_type, error_attempt_ids)

        return _pick_error(log_interpretation)
Example #9
    def test_pick_most_recent_error(self):
        log_interpretation = dict(history=dict(errors=[
            dict(
                container_id='container_1450486922681_0005_01_000003',
                hadoop_error=dict(message='BOOM'),
            ),
            dict(
                container_id='container_1450486922681_0005_01_000004',
                hadoop_error=dict(message='elephant problems'),
            ),
        ], ), )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id='container_1450486922681_0005_01_000004',
                hadoop_error=dict(message='elephant problems'),
            ))
Example #10
    def test_timestamp_beats_task_error(self):
        log_interpretation = dict(history=dict(errors=[
            dict(
                container_id='container_1450486922681_0005_01_000003',
                hadoop_error=dict(message='BOOM'),
                task_error=dict(message='things exploding'),
            ),
            dict(
                container_id='container_1450489999999_0005_01_000004',
                hadoop_error=dict(message='elephant problems'),
            ),
        ], ), )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id='container_1450489999999_0005_01_000004',
                hadoop_error=dict(message='elephant problems'),
            ))
Example #11
    def test_pick_most_recent_error(self):
        log_interpretation = dict(
            history=dict(
                errors=[
                    dict(
                        container_id="container_1450486922681_0005_01_000003",
                        hadoop_error=dict(message="BOOM"),
                        task_error=dict(message="things exploding"),
                    ),
                    dict(
                        container_id="container_1450486922681_0005_01_000004",
                        hadoop_error=dict(message="elephant problems"),
                    ),
                ]
            )
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(container_id="container_1450486922681_0005_01_000004", hadoop_error=dict(message="elephant problems")),
        )
Example #12
    def test_pick_shortest_spark_error(self):
        log_interpretation = dict(step=dict(errors=[
            dict(spark_error=dict(
                message=_MULTI_LINE_ERROR[37:],
                start_line=1,
                num_lines=10,
            )),
            dict(spark_error=dict(
                message=_MULTI_LINE_WARNING[180:],
                start_line=12,
                num_lines=13,
            )),
        ]))

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(spark_error=dict(
                message=_MULTI_LINE_ERROR[37:],
                start_line=1,
                num_lines=10,
            )))
Example #13
    def test_multiline_spark_error_beats_single_line(self):
        log_interpretation = dict(step=dict(errors=[
            dict(spark_error=dict(
                message=_SINGLE_LINE_ERROR[49:],
                start_line=0,
                num_lines=1,
            )),
            dict(spark_error=dict(
                message=_MULTI_LINE_WARNING[180:],
                start_line=12,
                num_lines=13,
            )),
        ]))

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(spark_error=dict(
                message=_MULTI_LINE_WARNING[180:],
                start_line=12,
                num_lines=13,
            )))
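Taken together with the previous example, this suggests how competing Spark errors might be ranked: a single-line error (usually just a summary) loses to a multi-line traceback, and among multi-line errors the shorter one is preferred. A hedged sketch of such a sort key (an illustration, not necessarily mrjob's exact logic):

    def _spark_error_rank(error):
        # lower tuple sorts first (preferred)
        num_lines = error['spark_error']['num_lines']
        if num_lines <= 1:
            return (1, 0)           # single-line errors rank last
        return (0, num_lines)       # shorter multi-line errors rank first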
Example #14
    def _run_step_on_spark(self, step, step_num):
        if self._opts['upload_archives']:
            log.warning('Spark master %r will probably ignore archives' %
                        self._spark_master())

        spark_submit_args = self._args_for_spark_step(step_num)

        env = dict(os.environ)
        env.update(self._spark_cmdenv(step_num))

        returncode, step_interpretation = self._run_spark_submit(
            spark_submit_args, env, record_callback=_log_log4j_record)

        if returncode:
            error = _pick_error(dict(step=step_interpretation))
            if error:
                _log_probable_cause_of_failure(log, error)

            reason = str(CalledProcessError(returncode, spark_submit_args))
            raise StepFailedException(reason=reason,
                                      step_num=step_num,
                                      num_steps=self._num_steps())
Example #15
    def test_can_get_spark_errors_from_task_logs(self):
        log_interpretation = dict(task=dict(
            application_id='application_1566607039137_0001',
            errors=[
                dict(
                    container_id='container_1450486922681_0005_01_000004',
                    spark_error=dict(
                        message=_MULTI_LINE_ERROR[37:],
                        start_line=1,
                        num_lines=10,
                    ),
                ),
            ]))

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(container_id='container_1450486922681_0005_01_000004',
                 spark_error=dict(
                     message=_MULTI_LINE_ERROR[37:],
                     start_line=1,
                     num_lines=10,
                 )))
Example #16
    def test_container_to_attempt_id(self):
        container_id = 'container_1449525218032_0005_01_000010'
        attempt_id = 'attempt_1449525218032_0005_m_000000_3'
        task_id = _attempt_id_to_task_id(attempt_id)

        container_to_attempt_id = {container_id: attempt_id}

        log_interpretation = dict(
            history=dict(
                container_to_attempt_id=container_to_attempt_id,
                errors=[
                    dict(
                        attempt_id=attempt_id,
                        hadoop_error=dict(message='SwordsMischiefException'),
                        task_id=task_id,
                    ),
                ],
            ),
            task=dict(
                errors=[
                    dict(
                        container_id=container_id,
                        hadoop_error=dict(message='SwordsMischiefException'),
                        task_error=dict(message='en garde!'),
                    ),
                ],
            ),
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                attempt_id=attempt_id,
                container_id=container_id,
                hadoop_error=dict(message='SwordsMischiefException'),
                task_error=dict(message='en garde!'),
                task_id=task_id,
            ))
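The interesting piece here is container_to_attempt_id: it lets an error found in the task logs (keyed by container_id) be matched with the corresponding history-log error (keyed by attempt_id) before the two are merged. A rough sketch of that lookup (hypothetical helper, for illustration only):

    def _attempt_id_for_container(history_interpretation, container_id):
        # translate a task-log container_id back to the attempt_id
        # recorded in the history log, if one was seen
        mapping = history_interpretation.get('container_to_attempt_id', {})
        return mapping.get(container_id)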
Example #17
    def test_pick_most_recent_error(self):
        log_interpretation = dict(
            history=dict(
                errors=[
                    dict(
                        container_id='container_1450486922681_0005_01_000003',
                        hadoop_error=dict(message='BOOM'),
                    ),
                    dict(
                        container_id='container_1450486922681_0005_01_000004',
                        hadoop_error=dict(message='elephant problems'),
                    ),
                ],
            ),
        )

        self.assertEqual(
            _pick_error(log_interpretation),
            dict(
                container_id='container_1450486922681_0005_01_000004',
                hadoop_error=dict(message='elephant problems'),
            )
        )
Example #18
    def _pick_error(self, log_interpretation):
        """Find probable cause of failure, and return it."""
        self._interpret_history_log(log_interpretation)
        self._interpret_task_logs(log_interpretation)

        return _pick_error(log_interpretation)
Example #19
    def test_empty(self):
        self.assertEqual(_pick_error({}), None)
        # make sure we can handle log interpretations without error
        self.assertEqual(_pick_error(dict(history={})), None)