class LogDigest:
    """Digests a run's log file into a RunStudy summary of instances,
    stages, and container failures."""

    def __init__(
        self,
        logs_file: Path,
        logger: logging.Logger,
    ) -> None:
        self.logger = logger
        self.logs_file = logs_file
        self.run_study: RunStudy = RunStudy(0)
        self.start_epoch_time: str = ""
        self.container_ids: Dict[str, Set[str]] = {}

    def analyze_logs(self) -> RunStudy:
        line_num = 0
        parser_state: Optional[ParsingState] = None
        with open(self.logs_file) as infile:
            for log_line in infile:
                # Read a line and remove any trailing whitespace or newline chars
                line_num += 1
                parser_state = self._parse_one_line(
                    line_num, log_line.rstrip(), parser_state
                )
        self.run_study.total_line_num = line_num
        self._aggregate_summary()
        return self.run_study

    def _aggregate_summary(self) -> None:
        for instance_id in self.container_ids:
            instance_flow = self.run_study.instances[instance_id]
            instance_flow.instance_container_count = len(
                self.container_ids[instance_id]
            )
            # Make the summary of the stages in the instance
            stage_ids = []
            for stage in instance_flow.stages:
                elapsed_hms = "n/a"
                if stage.context.elapsed_second:
                    s = int(float(stage.context.elapsed_second))
                    elapsed_hms = "{:d}h{:02d}m{:02d}s".format(
                        s // 3600, s % 3600 // 60, s % 60
                    )
                instance_flow.summary_stages.append(
                    f"{stage.stage_id}: failed={stage.failed_container_count},"
                    f" end_elapsed={elapsed_hms}, end_update_line={stage.context.line_num}"
                )
                stage_ids.append(stage.stage_id)
            # Make the summary of the instance
            self.run_study.summary_instances.append(
                f"i={instance_id}/o={instance_flow.objective_id}/c={instance_flow.cell_id}:"
                f" failed_container_count={instance_flow.instance_failed_container_count},"
                f" last_stages={stage_ids[-3:]}"
            )

    def _parse_one_line(
        self,
        line_num: int,
        log_line: str,
        parsing_state: Optional[ParsingState],
    ) -> Optional[ParsingState]:
        pattern_handlers = [
            PatternAndHandler(
                # E.g. Created instance 252502207342908 for cell 451002203420028 and objective 159502204793395
                r"Created instance ([^ ]+) for cell ([^ ]+) and objective ([^ ]+)$",
                self._add_created_instance_objective_cell,
            ),
            PatternAndHandler(
                # E.g.: ... Instances to run for cell-obj pairs:
                # {
                #     "7595610074714724": {
                #         "25065264566973790": {
                #             "input_path": "https://fbpcs-github-e2e.s3.us-west-2.amazonaws.com/lift/inputs/partner_e2e_input.csv",
                #             "instance_id": "7540993020268572",
                #             "latest_data_ts": 1647202674,
                #             "num_shards": 1,
                #             "status": "CREATED"
                #         }
                #     }
                # }
                r"Instances to run for cell-obj pairs:",
                self._add_existing_instance,
            ),
            PatternAndHandler(
                # E.g. [252502207342908] Valid stage found: PrivateComputationStageFlow.PID_SHARD
                r"\[([^ ]+)\] Valid stage found: PrivateComputationStageFlow\.([^ ]+)$",
                self._add_flow_stage,
            ),
            PatternAndHandler(
                # E.g. [4547351303806882] {"input_path": ... "status_update_ts": 1648146505, ... }
                # The line must also contain something like: "role": "PARTNER"
                r"\[([^ ]+)\] {(?=.*\"role\": \"PARTNER\".*)(\".*status_update_ts\": (\d+).+)}$",
                self._add_containers_from_status_update,
            ),
        ]

        if line_num == 1:
            context = self._parse_line_context(log_line)
            self.run_study.first_log = log_line
            self.run_study.start_epoch_time = context.epoch_time

        # E.g. any of the following cases (incomplete list):
        #   2022-06-06 20:12:54,535Z ERROR t:MainThread n:__main__ ! [7540993020268572] Error: type: ...
        #   2022-06-06 20:16:23,432Z ERROR t:MainThread n:root ! instance_id='7540993020268572' FAILED.
        #   ERROR:__main__:[15398047007316153] Error: type: ...
        #   ERROR:__main__:instance_id='15398047007316153' FAILED.
        if match := re.search(
            r"^(.{16}:\d{2},\d{3}Z ERROR t:[^!]+! |ERROR:[^:]+:)(.+)$", log_line
        ):
            self._add_line_with_error_log_level(line_num, match)

        if parsing_state:
            # Match the whole log line. r".*" always matches, so the result is
            # never None; the assert keeps type checkers happy.
            match = re.search(r".*", log_line)
            assert match is not None
            return parsing_state.handler(
                parsing_state.context, match, parsing_state.last_lines
            )

        for pattern_handler in pattern_handlers:
            if match := re.search(pattern_handler.pattern, log_line):
                context = self._parse_line_context(log_line)
                context.line_num = line_num
                return pattern_handler.handler(context, match, None)

        return None