Code example #1
0
class LogDigest:
    def __init__(
        self,
        logs_file: Path,
        logger: logging.Logger,
    ) -> None:
        """Prepare an empty digest for the given log file.

        Args:
            logs_file: path of the log file that analyze_logs() will read.
            logger: logger used for diagnostic output.
        """
        self.logs_file = logs_file
        self.logger = logger
        # Aggregated results; populated by analyze_logs().
        self.run_study: RunStudy = RunStudy(0)
        # Epoch time parsed from the first log line (empty until parsed).
        self.start_epoch_time: str = ""
        # instance_id -> set of container ids observed for that instance.
        self.container_ids: Dict[str, Set[str]] = {}

    def analyze_logs(self, ) -> RunStudy:
        """Scan the whole log file line by line and return the RunStudy."""
        parser_state: Optional[ParsingState] = None
        line_num = 0
        with open(self.logs_file) as infile:
            for line_num, raw_line in enumerate(infile, start=1):
                # Strip trailing whitespace/newline before handing the line
                # to the parser; parser_state threads multi-line parses.
                parser_state = self._parse_one_line(
                    line_num, raw_line.rstrip(), parser_state)

        # line_num is still 0 for an empty file.
        self.run_study.total_line_num = line_num
        self._aggregate_summary()
        return self.run_study

    def _aggregate_summary(self, ) -> None:
        """Build per-stage and per-instance summary lines on run_study."""
        for instance_id, containers in self.container_ids.items():
            flow = self.run_study.instances[instance_id]
            flow.instance_container_count = len(containers)
            # Summarize every stage recorded for this instance.
            stage_ids = []
            for stage in flow.stages:
                stage_ids.append(stage.stage_id)
                elapsed_hms = "n/a"
                if stage.context.elapsed_second:
                    total = int(float(stage.context.elapsed_second))
                    elapsed_hms = (
                        f"{total // 3600:d}h"
                        f"{total % 3600 // 60:02d}m"
                        f"{total % 60:02d}s"
                    )
                flow.summary_stages.append(
                    f"{stage.stage_id}: failed={stage.failed_container_count},"
                    f" end_elapsed={elapsed_hms}, end_update_line={stage.context.line_num}"
                )
            # One line per instance, keeping only the last three stage ids.
            self.run_study.summary_instances.append(
                f"i={instance_id}/o={flow.objective_id}/c={flow.cell_id}: failed_container_count={flow.instance_failed_container_count}, last_stages={stage_ids[-3:]}"
            )

    def _parse_one_line(
        self,
        line_num: int,
        log_line: str,
        parsing_state: Optional[ParsingState],
    ) -> Optional[ParsingState]:
        """Parse one (already rstripped) log line.

        When a multi-line parse is in progress (parsing_state is not None),
        the whole line is fed to the pending handler; otherwise the line is
        matched against the pattern table and the first matching handler is
        invoked. Returns the parsing state that the next line should start
        from (None when no multi-line parse is active).
        """
        # Pattern table: the first regex that matches the line wins; the
        # handler receives the line's parsed context and the regex match.
        pattern_handlers = [
            PatternAndHandler(
                # E.g. Created instance 252502207342908 for cell 451002203420028 and objective 159502204793395
                r"Created instance ([^ ]+) for cell ([^ ]+) and objective ([^ ]+)$",
                self._add_created_instance_objective_cell,
            ),
            PatternAndHandler(
                # E.g.: ... Instances to run for cell-obj pairs:
                # {
                #     "7595610074714724": {
                #         "25065264566973790": {
                #             "input_path": "https://fbpcs-github-e2e.s3.us-west-2.amazonaws.com/lift/inputs/partner_e2e_input.csv",
                #             "instance_id": "7540993020268572",
                #             "latest_data_ts": 1647202674,
                #             "num_shards": 1,
                #             "status": "CREATED"
                #         }
                #     }
                # }
                r"Instances to run for cell-obj pairs:",
                self._add_existing_instance,
            ),
            PatternAndHandler(
                # E.g. [252502207342908] Valid stage found: PrivateComputationStageFlow.PID_SHARD
                r"\[([^ ]+)\] Valid stage found: PrivateComputationStageFlow\.([^ ]+)$",
                self._add_flow_stage,
            ),
            PatternAndHandler(
                # E.g. [4547351303806882] {"input_path": ... "status_update_ts": 1648146505, ... }
                # Also have to contain like: "role": "PARTNER"
                r"\[([^ ]+)\] {(?=.*\"role\": \"PARTNER\".*)(\".*status_update_ts\": (\d+).+)}$",
                self._add_containers_from_status_update,
            ),
        ]

        if line_num == 1:
            # The first line seeds the study's first-log text and start time.
            context = self._parse_line_context(log_line)
            self.run_study.first_log = log_line
            self.run_study.start_epoch_time = context.epoch_time

        # ERROR-level lines are recorded regardless of which (if any)
        # pattern handler later consumes the line.
        # E.g. any of the following cases (incomplete list):
        # 2022-06-06 20:12:54,535Z ERROR t:MainThread n:__main__ ! [7540993020268572] Error: type: ...
        # 2022-06-06 20:16:23,432Z ERROR t:MainThread n:root ! instance_id='7540993020268572' FAILED.
        # ERROR:__main__:[15398047007316153] Error: type: ...
        # ERROR:__main__:instance_id='15398047007316153' FAILED.
        if match := re.search(
                r"^(.{16}:\d{2},\d{3}Z ERROR t:[^!]+! |ERROR:[^:]+:)(.+)$",
                log_line):
            self._add_line_with_error_log_level(line_num, match)

        if parsing_state:
            # A multi-line parse is in progress: hand the whole raw line to
            # the pending handler and let it decide whether to keep going.
            # Match the whole log line
            # NOTE: r".*" matches any string, so the `or Match()` fallback is
            # defensive and effectively unreachable.
            match = re.search(r".*", log_line)
            return parsing_state.handler(parsing_state.context, match
                                         or Match(), parsing_state.last_lines)

        for pattern_handler in pattern_handlers:
            if match := re.search(pattern_handler.pattern, log_line):
                context = self._parse_line_context(log_line)
                context.line_num = line_num
                return pattern_handler.handler(context, match, None)