Esempio n. 1
0
def analyze_log(parser):
    root = Measurement('Luigi Worker')
    scheduling_time = analyze_overall_scheduling(parser)
    root.add_child(scheduling_time)
    execution_time = analyze_overall_execution(parser)
    root.add_child(execution_time)
    return root
Esempio n. 2
0
def analyze_output_move(parser):
    output_message = parser.peek_message()
    if re.match(r'.*?Output:.*-temp-', output_message.content):
        start_timestamp = output_message.timestamp
        parser.next_message()
        next_message = parser.peek_message()
        return Measurement('Moving Job Output',
                           next_message.timestamp - start_timestamp)
Esempio n. 3
0
def analyze_overall_execution(parser):
    all_execution = Measurement('Executing Tasks')

    pattern = r'.*? Worker Worker.* (?P<state>running|done|failed)\s+(?P<task_id>.*)'
    message = True
    overall_start_timestamp = None
    running_measurement = None

    while message:
        message = parser.next_message()

        if message.content == 'Done':
            if overall_start_timestamp:
                all_execution.set_time_from_range(message.timestamp,
                                                  overall_start_timestamp)
            return all_execution

        match = re.match(pattern, message.content, (re.MULTILINE | re.DOTALL))
        if not match:
            if 'Running job:' in message.content or 'Starting Job =' in message.content:
                for measurement in analyze_hadoop_job(message, parser):
                    running_measurement.add_child(measurement)

            continue

        task = LuigiTaskDescription.from_string(match.group('task_id'))
        state = match.group('state')
        if state == 'running':
            start_timestamp = message.timestamp
            running_measurement = Measurement('Executing {}'.format(task))
            if not overall_start_timestamp:
                overall_start_timestamp = start_timestamp
        else:
            running_measurement.set_time_from_range(message.timestamp,
                                                    start_timestamp)
            all_execution.add_child(running_measurement)
Esempio n. 4
0
def analyze_hadoop_job(starting_message, parser):
    match = re.match(r'.*?(?P<job_id>job_\d{12}_\d{4})',
                     starting_message.content)
    job_id = match.group('job_id')
    start_timestamp = starting_message.timestamp

    message = starting_message
    while message:
        message = parser.next_message()

        if 'Job complete:' in message.content or 'Ended Job = ' in message.content:
            if 'Job complete:' in message.content:
                move_measure = analyze_output_move(parser)
                if move_measure:
                    yield move_measure

            yield Measurement('Hadoop ' + job_id,
                              message.timestamp - start_timestamp)
            return
Esempio n. 5
0
def analyze_task_scheduling(start_message, start_match, parser):
    scheduled_pattern = r'Scheduled (?P<task_id>.*?) \((?P<status>\w+)\)'

    task = LuigiTaskDescription.from_string(start_match.group('task_id'))
    start_timestamp = start_message.timestamp
    message = start_message
    while message:
        message = parser.peek_message()

        if message.content == 'Done scheduling tasks':
            break

        if re.match(scheduled_pattern, message.content,
                    (re.MULTILINE | re.DOTALL)):
            break

        parser.next_message()

    if not task.name == 'UncheckedExternalURL':
        return Measurement('Scheduling {}'.format(task),
                           message.timestamp - start_timestamp)
    else:
        return None
Esempio n. 6
0
def analyze_overall_scheduling(parser):
    all_scheduling = Measurement('Scheduling Tasks')

    start_scheduling_pattern = r'Checking if (?P<task_id>.*?) is complete'
    message = True
    start_scheduling_timestamp = None

    while message:
        message = parser.next_message()

        if message.content == 'Done scheduling tasks':
            all_scheduling.set_time_from_range(message.timestamp,
                                               start_scheduling_timestamp)
            return all_scheduling

        start_match = re.match(start_scheduling_pattern, message.content,
                               (re.MULTILINE | re.DOTALL))
        if start_match:
            if start_scheduling_timestamp is None:
                start_scheduling_timestamp = message.timestamp

            measurement = analyze_task_scheduling(message, start_match, parser)
            if measurement:
                all_scheduling.add_child(measurement)