def analyze_log(parser):
    """Build the full measurement tree for one Luigi worker log.

    Runs the scheduling-phase analyzer followed by the execution-phase
    analyzer (the parser is consumed in that order) and hangs both
    results off a single root Measurement.

    :param parser: log-message parser positioned at the start of the log
    :return: root ``Measurement`` named 'Luigi Worker' with two children
    """
    worker_root = Measurement('Luigi Worker')
    # Order matters: scheduling messages precede execution messages in the log.
    worker_root.add_child(analyze_overall_scheduling(parser))
    worker_root.add_child(analyze_overall_execution(parser))
    return worker_root
def analyze_output_move(parser):
    """Measure the time Luigi spends moving a job's temp output into place.

    Peeks at the current message; if it is an 'Output:' line pointing at a
    '-temp-' path, consumes it and measures until the next message's
    timestamp. Otherwise leaves the parser untouched.

    :param parser: log-message parser
    :return: a 'Moving Job Output' ``Measurement``, or ``None`` when the
        current message is not a temp-output line
    """
    candidate = parser.peek_message()
    # Guard clause: not an output-move line -> nothing to measure.
    if not re.match(r'.*?Output:.*-temp-', candidate.content):
        return None
    move_started = candidate.timestamp
    parser.next_message()
    following = parser.peek_message()
    return Measurement('Moving Job Output', following.timestamp - move_started)
def analyze_overall_execution(parser):
    """Measure the task-execution phase of a Luigi worker log.

    Consumes messages until a literal 'Done' message, producing one child
    Measurement per executed task (timed from its 'running' line to its
    'done'/'failed' line) and nesting any Hadoop-job measurements under
    the task that was running when the job started.

    :param parser: log-message parser positioned at the start of the
        execution phase
    :return: 'Executing Tasks' ``Measurement`` with per-task children
    """
    all_execution = Measurement('Executing Tasks')
    pattern = r'.*? Worker Worker.* (?P<state>running|done|failed)\s+(?P<task_id>.*)'
    message = True
    overall_start_timestamp = None
    running_measurement = None
    while message:
        message = parser.next_message()
        if message.content == 'Done':
            if overall_start_timestamp:
                all_execution.set_time_from_range(message.timestamp, overall_start_timestamp)
            return all_execution
        match = re.match(pattern, message.content, (re.MULTILINE | re.DOTALL))
        if not match:
            if 'Running job:' in message.content or 'Starting Job =' in message.content:
                # BUG FIX: a Hadoop job line can appear before any task has
                # reached the 'running' state, in which case the original
                # code dereferenced None. Fall back to the phase root so the
                # measurement is kept rather than crashing.
                parent = running_measurement if running_measurement is not None else all_execution
                for measurement in analyze_hadoop_job(message, parser):
                    parent.add_child(measurement)
            continue
        task = LuigiTaskDescription.from_string(match.group('task_id'))
        state = match.group('state')
        if state == 'running':
            start_timestamp = message.timestamp
            running_measurement = Measurement('Executing {}'.format(task))
            if not overall_start_timestamp:
                overall_start_timestamp = start_timestamp
        else:
            # 'done' or 'failed': close out the task started above.
            # NOTE(review): assumes every done/failed line is preceded by a
            # 'running' line (otherwise start_timestamp/running_measurement
            # are stale or unbound) — confirm against the log format.
            running_measurement.set_time_from_range(message.timestamp, start_timestamp)
            all_execution.add_child(running_measurement)
def analyze_hadoop_job(starting_message, parser):
    """Yield measurements for one Hadoop job found in the log.

    Scans forward from ``starting_message`` (which contains the
    ``job_<12 digits>_<4 digits>`` id) until a completion line. On a
    'Job complete:' line an output-move measurement may be yielded first;
    the overall 'Hadoop <job_id>' measurement is always yielded last.

    :param starting_message: message announcing the Hadoop job
    :param parser: log-message parser
    """
    id_match = re.match(r'.*?(?P<job_id>job_\d{12}_\d{4})', starting_message.content)
    hadoop_job_id = id_match.group('job_id')
    job_started = starting_message.timestamp
    current = starting_message
    while current:
        current = parser.next_message()
        completed = 'Job complete:' in current.content
        if not (completed or 'Ended Job = ' in current.content):
            continue
        if completed:
            # Only 'Job complete:' jobs are followed by an output move.
            output_move = analyze_output_move(parser)
            if output_move:
                yield output_move
        yield Measurement('Hadoop ' + hadoop_job_id, current.timestamp - job_started)
        return
def analyze_task_scheduling(start_message, start_match, parser):
    """Measure how long scheduling a single task takes.

    Consumes messages until either the next 'Scheduled <task> (<status>)'
    line or the 'Done scheduling tasks' sentinel is about to be read
    (both are left un-consumed for the caller).

    :param start_message: the 'Checking if ... is complete' message
    :param start_match: regex match on ``start_message`` carrying 'task_id'
    :param parser: log-message parser
    :return: a 'Scheduling <task>' ``Measurement``, or ``None`` for
        UncheckedExternalURL tasks (too cheap/noisy to report)
    """
    scheduled_pattern = r'Scheduled (?P<task_id>.*?) \((?P<status>\w+)\)'
    scheduled_task = LuigiTaskDescription.from_string(start_match.group('task_id'))
    began_at = start_message.timestamp
    upcoming = start_message
    while upcoming:
        upcoming = parser.peek_message()
        if upcoming.content == 'Done scheduling tasks':
            break
        if re.match(scheduled_pattern, upcoming.content, (re.MULTILINE | re.DOTALL)):
            break
        parser.next_message()
    if scheduled_task.name == 'UncheckedExternalURL':
        return None
    return Measurement('Scheduling {}'.format(scheduled_task), upcoming.timestamp - began_at)
def analyze_overall_scheduling(parser):
    """Measure the task-scheduling phase of a Luigi worker log.

    Consumes messages until the 'Done scheduling tasks' sentinel,
    creating one child Measurement per scheduled task via
    :func:`analyze_task_scheduling`. The overall phase is timed from the
    first 'Checking if ... is complete' line to the sentinel.

    :param parser: log-message parser positioned at the start of the log
    :return: 'Scheduling Tasks' ``Measurement`` with per-task children
    """
    phase = Measurement('Scheduling Tasks')
    check_pattern = r'Checking if (?P<task_id>.*?) is complete'
    current = True
    phase_began = None
    while current:
        current = parser.next_message()
        if current.content == 'Done scheduling tasks':
            phase.set_time_from_range(current.timestamp, phase_began)
            return phase
        check_match = re.match(check_pattern, current.content, (re.MULTILINE | re.DOTALL))
        if not check_match:
            continue
        if phase_began is None:
            # First scheduling line anchors the whole phase.
            phase_began = current.timestamp
        task_measurement = analyze_task_scheduling(current, check_match, parser)
        if task_measurement:
            phase.add_child(task_measurement)