Example 1
0
def analyze_log(parser):
    """Build the root measurement tree for a Luigi worker log.

    Measures the scheduling phase first, then the execution phase, and
    attaches both (in that order) under a single 'Luigi Worker' node.
    """
    root = Measurement('Luigi Worker')
    root.add_child(analyze_overall_scheduling(parser))
    root.add_child(analyze_overall_execution(parser))
    return root
Example 2
0
def analyze():
    """Command-line entry point: parse options, obtain a measurement tree,
    optionally persist it, then emit the requested report to stdout."""
    arg_parser = argparse.ArgumentParser(description='Analyze log files.')
    arg_parser.add_argument(
        '-o', '--output',
        help='Save the results of the analysis to a file that can be read later.')
    arg_parser.add_argument(
        '-r', '--report',
        choices=['text', 'json', 'html'],
        help='Generate a report in the requested format and stream it to stdout.')
    arg_parser.add_argument(
        '--threshold-percent',
        type=float,
        default=None,
        help="Don't include entries that accounted for less than this amount of time relative to the total execution"
             " time in the report.")
    # Exactly one input source may be selected.
    group = arg_parser.add_argument_group('Input File')
    group_ex = group.add_mutually_exclusive_group()
    group_ex.add_argument(
        '-l', '--log',
        default='edx_analytics.log',
        help='A log file to analyze. Defaults to "edx_analytics.log" in the current directory.')
    group_ex.add_argument(
        '-i', '--input',
        help='Reads a previously saved result file.')
    group_ex.add_argument(
        '-t', '--trace',
        help='Path to an execution trace of the launch-task process captured by pyinstrument and saved as JSON.')

    args = arg_parser.parse_args()

    # Pick the measurement source: saved JSON, pyinstrument trace, or raw log.
    if args.input:
        root = Measurement.from_json(args.input)
    elif args.trace:
        root = Measurement.from_pyinstrument_trace(args.trace)
    else:
        root = analyze_log_file(args.log)

    # Persist first, so the analysis can be reloaded later with --input.
    if args.output:
        json_report(root, args.output, pretty=False,
                    threshold_percent=args.threshold_percent)

    # Dispatch the requested stdout report; no --report means no report.
    reporters = {'json': json_report, 'text': text_report, 'html': html_report}
    reporter = reporters.get(args.report)
    if reporter is not None:
        reporter(root, threshold_percent=args.threshold_percent)
Example 3
0
def analyze():
    """Command-line entry point for the log analyzer.

    Parses CLI options, builds a measurement tree from one of three input
    sources (saved JSON, pyinstrument trace, or raw log), optionally saves
    the result, and streams the requested report format to stdout.
    """
    arg_parser = argparse.ArgumentParser(description='Analyze log files.')
    arg_parser.add_argument(
        '-o', '--output',
        help='Save the results of the analysis to a file that can be read later.'
    )
    arg_parser.add_argument(
        '-r', '--report',
        choices=['text', 'json', 'html'],
        help='Generate a report in the requested format and stream it to stdout.'
    )
    arg_parser.add_argument(
        '--threshold-percent',
        type=float,
        default=None,
        help="Don't include entries that accounted for less than this amount of time relative to the total execution"
             " time in the report."
    )
    # The three input sources are mutually exclusive; --log has a default,
    # so omitting all of them analyzes "edx_analytics.log".
    group = arg_parser.add_argument_group('Input File')
    group_ex = group.add_mutually_exclusive_group()
    group_ex.add_argument(
        '-l', '--log',
        default='edx_analytics.log',
        help='A log file to analyze. Defaults to "edx_analytics.log" in the current directory.'
    )
    group_ex.add_argument(
        '-i', '--input',
        help='Reads a previously saved result file.'
    )
    group_ex.add_argument(
        '-t', '--trace',
        help='Path to an execution trace of the launch-task process captured by pyinstrument and saved as JSON.'
    )

    args = arg_parser.parse_args()

    # Choose the measurement source in priority order: saved result file,
    # pyinstrument trace, then raw log analysis.
    if args.input:
        root = Measurement.from_json(args.input)
    elif args.trace:
        root = Measurement.from_pyinstrument_trace(args.trace)
    else:
        root = analyze_log_file(args.log)

    # Persist the (unfiltered-format) result so it can be reloaded via --input.
    if args.output:
        json_report(root, args.output, pretty=False, threshold_percent=args.threshold_percent)

    # Emit the requested report format to stdout; no --report emits nothing.
    if args.report == 'json':
        json_report(root, threshold_percent=args.threshold_percent)
    elif args.report == 'text':
        text_report(root, threshold_percent=args.threshold_percent)
    elif args.report == 'html':
        html_report(root, threshold_percent=args.threshold_percent)
Example 4
0
def analyze_overall_execution(parser):
    """Measure the time spent executing Luigi tasks.

    Scans messages for worker state transitions (running/done/failed),
    creating one child Measurement per executed task (plus nested Hadoop
    job measurements) until the terminal 'Done' message is seen.

    Returns the aggregate 'Executing Tasks' Measurement.
    """
    all_execution = Measurement('Executing Tasks')

    pattern = r'.*? Worker Worker.* (?P<state>running|done|failed)\s+(?P<task_id>.*)'
    overall_start_timestamp = None
    running_measurement = None
    start_timestamp = None

    while True:
        message = parser.next_message()
        if not message:
            # Log ended without a terminal 'Done'. The original tested the
            # sentinel only after dereferencing message.content, which raised
            # AttributeError on None; return what was gathered instead.
            break

        if message.content == 'Done':
            if overall_start_timestamp:
                all_execution.set_time_from_range(message.timestamp, overall_start_timestamp)
            return all_execution

        match = re.match(pattern, message.content, (re.MULTILINE | re.DOTALL))
        if not match:
            # Hadoop job lines appear while a task is running; attach their
            # measurements to the currently running task.
            # NOTE(review): assumes a 'running' line always precedes a Hadoop
            # job line, so running_measurement is not None here — confirm.
            if 'Running job:' in message.content or 'Starting Job =' in message.content:
                for measurement in analyze_hadoop_job(message, parser):
                    running_measurement.add_child(measurement)
            continue

        task = LuigiTaskDescription.from_string(match.group('task_id'))
        state = match.group('state')
        if state == 'running':
            start_timestamp = message.timestamp
            running_measurement = Measurement('Executing {}'.format(task))
            if not overall_start_timestamp:
                overall_start_timestamp = start_timestamp
        else:
            # 'done' or 'failed': close out the task opened by 'running'.
            # NOTE(review): assumes 'running' always precedes 'done'/'failed'.
            running_measurement.set_time_from_range(message.timestamp, start_timestamp)
            all_execution.add_child(running_measurement)

    return all_execution
Example 5
0
def analyze_output_move(parser):
    """Detect a Hadoop temp-output move step and measure its duration.

    Returns a 'Moving Job Output' Measurement spanning from the
    'Output: ...-temp-...' message to the message after it, or None when
    the next message is not an output-move line (the parser position is
    then left untouched).
    """
    candidate = parser.peek_message()
    if not re.match(r'.*?Output:.*-temp-', candidate.content):
        return None
    move_start = candidate.timestamp
    parser.next_message()  # consume the output line
    following = parser.peek_message()
    return Measurement('Moving Job Output', following.timestamp - move_start)
Example 6
0
def analyze_log(parser):
    """Assemble the top-level measurement tree from a parsed Luigi log.

    Scheduling is analyzed before execution, matching the order the
    phases appear in the log.
    """
    root = Measurement('Luigi Worker')
    for phase in (analyze_overall_scheduling(parser),
                  analyze_overall_execution(parser)):
        root.add_child(phase)
    return root
Example 7
0
def analyze_overall_scheduling(parser):
    """Measure the time spent scheduling Luigi tasks.

    Scans messages for 'Checking if <task> is complete' lines, measuring
    each task's scheduling span, until 'Done scheduling tasks' is seen.

    Returns the aggregate 'Scheduling Tasks' Measurement.
    """
    all_scheduling = Measurement('Scheduling Tasks')

    start_scheduling_pattern = r'Checking if (?P<task_id>.*?) is complete'
    start_scheduling_timestamp = None

    while True:
        message = parser.next_message()
        if not message:
            # Log ended before 'Done scheduling tasks'. The original tested
            # the sentinel only after dereferencing message.content, which
            # raised AttributeError on None; return what was gathered instead.
            break

        if message.content == 'Done scheduling tasks':
            # Guard against a log with no scheduling entries: the original
            # passed None to set_time_from_range here. Mirrors the matching
            # guard in analyze_overall_execution.
            if start_scheduling_timestamp:
                all_scheduling.set_time_from_range(message.timestamp, start_scheduling_timestamp)
            return all_scheduling

        start_match = re.match(start_scheduling_pattern, message.content, (re.MULTILINE | re.DOTALL))
        if start_match:
            # Only the first checking line marks the start of the overall span.
            if start_scheduling_timestamp is None:
                start_scheduling_timestamp = message.timestamp

            measurement = analyze_task_scheduling(message, start_match, parser)
            if measurement:
                all_scheduling.add_child(measurement)

    return all_scheduling
Example 8
0
def analyze_overall_execution(parser):
    """Measure the time spent executing Luigi tasks.

    Scans messages for worker state transitions (running/done/failed),
    creating one child Measurement per executed task (plus nested Hadoop
    job measurements) until the terminal 'Done' message is seen.

    Returns the aggregate 'Executing Tasks' Measurement.
    """
    all_execution = Measurement('Executing Tasks')

    pattern = r'.*? Worker Worker.* (?P<state>running|done|failed)\s+(?P<task_id>.*)'
    # Sentinel start value; the loop body fetches the real first message.
    message = True
    overall_start_timestamp = None
    running_measurement = None

    while message:
        message = parser.next_message()
        # NOTE(review): if next_message() can return None, the attribute
        # access below raises AttributeError before the loop test — confirm
        # the parser always yields a terminal 'Done' message.

        if message.content == 'Done':
            if overall_start_timestamp:
                all_execution.set_time_from_range(message.timestamp,
                                                  overall_start_timestamp)
            return all_execution

        match = re.match(pattern, message.content, (re.MULTILINE | re.DOTALL))
        if not match:
            # Hadoop job lines appear while a task is running; attach their
            # measurements to the currently running task.
            # NOTE(review): assumes a 'running' line always precedes a Hadoop
            # job line, so running_measurement is not None here — confirm.
            if 'Running job:' in message.content or 'Starting Job =' in message.content:
                for measurement in analyze_hadoop_job(message, parser):
                    running_measurement.add_child(measurement)

            continue

        task = LuigiTaskDescription.from_string(match.group('task_id'))
        state = match.group('state')
        if state == 'running':
            start_timestamp = message.timestamp
            running_measurement = Measurement('Executing {}'.format(task))
            if not overall_start_timestamp:
                overall_start_timestamp = start_timestamp
        else:
            # 'done' or 'failed': close out the task opened by 'running'.
            running_measurement.set_time_from_range(message.timestamp,
                                                    start_timestamp)
            all_execution.add_child(running_measurement)
Example 9
0
def analyze_hadoop_job(starting_message, parser):
    """Yield measurements for a single Hadoop job run.

    Consumes messages until the job-completion line is seen, yielding an
    optional 'Moving Job Output' measurement (only for 'Job complete:'
    style logs) followed by the overall 'Hadoop <job_id>' measurement.
    """
    job_match = re.match(r'.*?(?P<job_id>job_\d{12}_\d{4})',
                         starting_message.content)
    job_id = job_match.group('job_id')
    job_start = starting_message.timestamp

    message = starting_message
    while message:
        message = parser.next_message()

        completed = 'Job complete:' in message.content
        if not completed and 'Ended Job = ' not in message.content:
            continue

        if completed:
            # Only the 'Job complete:' variant is followed by a temp-output
            # move that is worth measuring separately.
            move_measure = analyze_output_move(parser)
            if move_measure:
                yield move_measure

        yield Measurement('Hadoop ' + job_id, message.timestamp - job_start)
        return
Example 10
0
def analyze_task_scheduling(start_message, start_match, parser):
    """Measure the scheduling span of a single task.

    Peeks forward until the task's 'Scheduled <task> (<status>)' line or
    the end of scheduling is reached, consuming intermediate messages but
    leaving the terminating message for the caller.

    Returns a Measurement for the task, or None for UncheckedExternalURL
    tasks, which are excluded from the report.
    """
    scheduled_pattern = r'Scheduled (?P<task_id>.*?) \((?P<status>\w+)\)'

    task = LuigiTaskDescription.from_string(start_match.group('task_id'))
    start_timestamp = start_message.timestamp
    message = start_message
    while message:
        message = parser.peek_message()
        # NOTE(review): a None from peek_message() would raise AttributeError
        # on .content below; assumes the log always contains a terminating
        # 'Done scheduling tasks' or 'Scheduled ...' line — confirm.

        if message.content == 'Done scheduling tasks':
            break

        if re.match(scheduled_pattern, message.content,
                    (re.MULTILINE | re.DOTALL)):
            break

        parser.next_message()

    # Idiomatic comparison (was `not task.name == ...`).
    if task.name != 'UncheckedExternalURL':
        return Measurement('Scheduling {}'.format(task),
                           message.timestamp - start_timestamp)
    else:
        return None
Example 11
0
def analyze_overall_scheduling(parser):
    """Measure the time spent scheduling Luigi tasks.

    Scans messages for 'Checking if <task> is complete' lines, measuring
    each task's scheduling span, until 'Done scheduling tasks' is seen.

    Returns the aggregate 'Scheduling Tasks' Measurement.
    """
    all_scheduling = Measurement('Scheduling Tasks')

    start_scheduling_pattern = r'Checking if (?P<task_id>.*?) is complete'
    # Sentinel start value; the loop body fetches the real first message.
    message = True
    start_scheduling_timestamp = None

    while message:
        message = parser.next_message()
        # NOTE(review): if next_message() can return None, the attribute
        # access below raises AttributeError before the loop test — confirm
        # the parser always yields a 'Done scheduling tasks' message.

        if message.content == 'Done scheduling tasks':
            # NOTE(review): start_scheduling_timestamp may still be None here
            # if no 'Checking if ...' line preceded this one — confirm
            # set_time_from_range tolerates a None start.
            all_scheduling.set_time_from_range(message.timestamp,
                                               start_scheduling_timestamp)
            return all_scheduling

        start_match = re.match(start_scheduling_pattern, message.content,
                               (re.MULTILINE | re.DOTALL))
        if start_match:
            # Only the first checking line marks the start of the overall span.
            if start_scheduling_timestamp is None:
                start_scheduling_timestamp = message.timestamp

            measurement = analyze_task_scheduling(message, start_match, parser)
            if measurement:
                all_scheduling.add_child(measurement)