Esempio n. 1
0
def get_freeze_locations(scan_log_filename, scan):
    """
    Detect "freezes": pairs of consecutive scan log lines whose timestamps
    are more than 5 seconds apart, e.g.:

        [Wed Nov  1 23:43:16 2017 - debug] ...
        [Wed Nov  1 23:43:19 2017 - debug] ...

    :param scan_log_filename: The scan log filename (unused here)
    :param scan: The file handler for the scan log
    :return: A KeyValueOutput holding the (stripped) log lines that were
             written more than 5 seconds after the previous line
    """
    scan.seek(0)
    freezes = []

    # Timestamp of the very first log line is the baseline
    previous_line_time = get_line_epoch(scan.readline())

    for line in scan:
        try:
            current_line_epoch = get_line_epoch(line)
        except InvalidTimeStamp:
            # Lines without a parseable timestamp are skipped
            continue

        time_spent = current_line_epoch - previous_line_time

        if time_spent > 5:
            line = line.strip()
            freezes.append(line)

        previous_line_time = current_line_epoch

    return KeyValueOutput(
        'debug_log_freeze',
        'Delays greater than 5 seconds between two log lines', freezes)
Esempio n. 2
0
def get_freeze_locations(scan_log_filename, scan):
    """
    Collect scan log lines that were written more than 5 seconds after the
    previous line, which usually indicates the scanner froze.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: A KeyValueOutput with the offending (stripped) log lines
    """
    scan.seek(0)

    delayed_lines = []

    # Baseline is the timestamp of the first log line
    last_epoch = get_line_epoch(scan.readline())

    for current_line in scan:
        try:
            this_epoch = get_line_epoch(current_line)
        except InvalidTimeStamp:
            # Skip lines without a parseable timestamp
            continue

        if this_epoch - last_epoch > 5:
            delayed_lines.append(current_line.strip())

        last_epoch = this_epoch

    return KeyValueOutput('debug_log_freeze',
                          'Delays greater than 5 seconds between two log lines',
                          delayed_lines)
Esempio n. 3
0
def get_http_requests_over_time_data(scan_log_filename, scan):
    """
    Count HTTP requests per ~minute bucket of the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: A list with the number of HTTP requests seen in each minute
             bucket. Requests from the trailing, incomplete bucket are not
             included in the result.
    """
    scan.seek(0)

    per_minute_counts = []
    current_count = 0

    # The first line's timestamp opens the first bucket
    bucket_start_epoch = get_line_epoch(scan.readline())

    for log_line in scan:
        if HTTP_CODE_RE.search(log_line) is not None:
            current_count += 1

        try:
            line_epoch = get_line_epoch(log_line)
        except InvalidTimeStamp:
            continue

        if line_epoch - bucket_start_epoch > 60:
            # Close the current bucket and open a new one
            bucket_start_epoch = line_epoch
            per_minute_counts.append(current_count)
            current_count = 0

    return per_minute_counts
Esempio n. 4
0
def get_consumer_pool_size_data(scan_log_filename, scan):
    """
    Extract idle-worker percentages for the audit and crawl consumer pools
    and for the core worker pool, together with their timestamps.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: A 6-tuple (audit %, audit ts, crawl %, crawl ts,
             worker %, worker ts)
    """
    scan.seek(0)

    audit_perc, audit_ts = [], []
    crawl_perc, crawl_ts = [], []
    core_perc, core_ts = [], []

    for log_line in scan:
        match = IDLE_CONSUMER_WORKERS.search(log_line)
        if match is None:
            continue

        idle_percent = int(match.group(1))
        pool_name = match.group(2).lower()

        # Classify the sample by the pool it belongs to; anything that is
        # neither audit nor crawl is attributed to the core worker pool
        if 'audit' in pool_name:
            target_perc, target_ts = audit_perc, audit_ts
        elif 'crawl' in pool_name:
            target_perc, target_ts = crawl_perc, crawl_ts
        else:
            target_perc, target_ts = core_perc, core_ts

        target_perc.append(idle_percent)
        target_ts.append(get_line_epoch(log_line))

    return (audit_perc, audit_ts,
            crawl_perc, crawl_ts,
            core_perc, core_ts)
Esempio n. 5
0
def get_consumer_pool_size_data(scan_log_filename, scan):
    """
    Read the idle-worker percentage samples from the scan log.

    Lines matching IDLE_CONSUMER_WORKERS are classified by the pool they
    refer to: the audit consumer pool, the crawl consumer pool, or (by
    default) the core worker pool.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (audit percentages, audit timestamps,
              crawl percentages, crawl timestamps,
              worker percentages, worker timestamps)
    """
    scan.seek(0)

    # pool name -> (percentages, timestamps)
    pools = {
        'audit': ([], []),
        'crawl': ([], []),
        'worker': ([], []),
    }

    for raw_line in scan:
        found = IDLE_CONSUMER_WORKERS.search(raw_line)
        if not found:
            continue

        idle_percent = int(found.group(1))

        name = found.group(2).lower()
        if 'audit' in name:
            key = 'audit'
        elif 'crawl' in name:
            key = 'crawl'
        else:
            key = 'worker'

        percentages, timestamps = pools[key]
        percentages.append(idle_percent)
        timestamps.append(get_line_epoch(raw_line))

    return (pools['audit'][0], pools['audit'][1],
            pools['crawl'][0], pools['crawl'][1],
            pools['worker'][0], pools['worker'][1])
Esempio n. 6
0
def show_connection_pool_wait(scan_log_filename, scan):
    """
    Plot the time worker threads spent waiting for an available TCP/IP
    connection from the connection pool: first as a time series, then as a
    histogram.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, all output is printed
    """
    scan.seek(0)

    connection_pool_waits = []
    connection_pool_timestamps = []

    for line in scan:
        match = CONNECTION_POOL_WAIT.search(line)
        if match:
            connection_pool_waits.append(float(match.group(1)))
            connection_pool_timestamps.append(get_line_epoch(line))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_time_epoch = last_timestamp - first_timestamp
    connection_pool_timestamps = [ts - first_timestamp for ts in connection_pool_timestamps]

    if not connection_pool_waits:
        print('No connection pool wait data found')
        return

    print('Time waited for worker threads for an available TCP/IP connection')
    print('    Total: %.2f sec' % sum(connection_pool_waits))
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Waited time'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(connection_pool_timestamps,
             connection_pool_waits)

    print(fig.show())
    print('')
    print('')

    # Fix: a second plotille.Figure used to be created and configured here
    # but was never rendered (dead code); the histogram below is drawn
    # directly by plotille.hist()
    print('Time waiting for available TCP/IP connection')
    print('')
    print(plotille.hist(connection_pool_waits, bins=25))
    print('')
    print('')
Esempio n. 7
0
def show_http_requests_over_time(scan_log_filename, scan):
    """
    Print a plot with the number of HTTP requests sent per minute.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, the plot is printed to the output
    """
    scan.seek(0)

    per_minute = []
    count_this_minute = 0

    # The first line's timestamp opens the first minute bucket
    minute_start = get_line_epoch(scan.readline())

    for log_line in scan:
        if HTTP_CODE_RE.search(log_line):
            count_this_minute += 1

        try:
            log_line_epoch = get_line_epoch(log_line)
        except InvalidTimeStamp:
            continue

        if log_line_epoch - minute_start > 60:
            minute_start = log_line_epoch
            per_minute.append(count_this_minute)
            count_this_minute = 0

    print('HTTP requests sent by minute')
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'HTTP requests'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=None)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(xrange(len(per_minute)), per_minute)

    print(fig.show())
    print('')
    print('')
Esempio n. 8
0
def show_freeze_locations(scan_log_filename, scan):
    """
    Find "freezes" in the scan log: pairs of consecutive log lines whose
    timestamps are more than 5 seconds apart, e.g.:

        [Wed Nov  1 23:43:16 2017 - debug] ...
        [Wed Nov  1 23:43:19 2017 - debug] ...

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, all printed to the output
    """
    scan.seek(0)
    freezes = []

    previous_line_time = get_line_epoch(scan.readline())

    for line in scan:
        try:
            current_line_epoch = get_line_epoch(line)
        except InvalidTimeStamp:
            continue

        time_spent = current_line_epoch - previous_line_time

        if time_spent > 5:
            line = line.strip()

            # Truncate very long lines so the report stays readable
            if len(line) >= 80:
                msg = 'Found %s second freeze at: %s...' % (time_spent, line[:80])
            else:
                msg = 'Found %s second freeze at: %s' % (time_spent, line)

            freezes.append(msg)

        previous_line_time = current_line_epoch

    # Bug fix: the messages below used to say "3 seconds" while the check
    # above (and the sibling get_freeze_locations) uses a 5 second threshold
    if not freezes:
        print('No delays greater than 5 seconds were found between two scan log lines')
        return

    print('Found the delays greater than 5 seconds around these scan log lines:')
    for freeze in freezes:
        print('    - %s' % freeze)
Esempio n. 9
0
def get_error_rate_data(scan_log_filename, scan):
    """
    Extract the extended urllib error rate samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (error rates, timestamps) as two parallel lists
    """
    # Bug fix: rewind the file handler before reading, as every other
    # get_*_data() helper does; otherwise a previous consumer leaves the
    # handler at EOF and no data is ever found
    scan.seek(0)

    error_rate = []
    error_rate_timestamps = []

    for line in scan:
        match = EXTENDED_URLLIB_ERRORS_RE.search(line)
        if match:
            error_rate.append(int(match.group(1)))
            error_rate_timestamps.append(get_line_epoch(line))

    return error_rate, error_rate_timestamps
Esempio n. 10
0
def get_rtt_data(scan_log_filename, scan):
    """
    Extract the RTT samples and their timestamps from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (rtt values, timestamps) as two parallel lists
    """
    scan.seek(0)

    values, stamps = [], []

    for raw in scan:
        hit = RTT_RE.search(raw)
        if hit is None:
            continue
        values.append(float(hit.group(1)))
        stamps.append(get_line_epoch(raw))

    return values, stamps
Esempio n. 11
0
def get_timeout_data(scan_log_filename, scan):
    """
    Extract the socket timeout samples and their timestamps from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (timeout values, timestamps) as two parallel lists
    """
    scan.seek(0)

    timeout_values = []
    timeout_stamps = []

    for entry in scan:
        found = SOCKET_TIMEOUT.search(entry)
        if found:
            timeout_values.append(float(found.group(1)))
            timeout_stamps.append(get_line_epoch(entry))

    return timeout_values, timeout_stamps
Esempio n. 12
0
def get_parser_process_memory_limit_data(scan_log_filename, scan):
    """
    Extract the parser process memory limit samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (memory limits, timestamps) as two parallel lists
    """
    scan.seek(0)

    limits = []
    limit_stamps = []

    for record in scan:
        found = PARSER_PROCESS_MEMORY_LIMIT.search(record)
        if found is None:
            continue
        limits.append(int(found.group(1)))
        limit_stamps.append(get_line_epoch(record))

    return limits, limit_stamps
Esempio n. 13
0
def get_queue_size_crawl_data(scan_log_filename, scan):
    """
    Extract the crawl-infra queue size samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    sizes, stamps = [], []

    for record in scan:
        found = CRAWLINFRA_DISK_DICT.search(record)
        if found:
            sizes.append(int(found.group(1)))
            stamps.append(get_line_epoch(record))

    return sizes, stamps
Esempio n. 14
0
def get_queue_size_audit_data(scan_log_filename, scan):
    """
    Extract the audit consumer queue size samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    sizes, stamps = [], []

    for record in scan:
        found = AUDITOR_DISK_DICT.search(record)
        if found is None:
            continue
        sizes.append(int(found.group(1)))
        stamps.append(get_line_epoch(record))

    return sizes, stamps
Esempio n. 15
0
def get_active_threads_data(scan_log_filename, scan):
    """
    Extract the active thread count samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (active thread counts, timestamps) as two parallel lists
    """
    scan.seek(0)

    thread_counts = []
    thread_stamps = []

    for entry in scan:
        found = ACTIVE_THREADS.search(entry)
        if found:
            thread_counts.append(float(found.group(1)))
            thread_stamps.append(get_line_epoch(entry))

    return thread_counts, thread_stamps
Esempio n. 16
0
def get_queue_size_audit_data(scan_log_filename, scan):
    """
    Read the audit consumer queue sizes, with timestamps, from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    queue_sizes = []
    queue_timestamps = []

    for log_line in scan:
        hit = AUDITOR_DISK_DICT.search(log_line)
        if hit:
            queue_sizes.append(int(hit.group(1)))
            queue_timestamps.append(get_line_epoch(log_line))

    return queue_sizes, queue_timestamps
Esempio n. 17
0
def get_queue_size_crawl_data(scan_log_filename, scan):
    """
    Read the crawl-infra queue sizes, with timestamps, from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    queue_sizes = []
    queue_timestamps = []

    for log_line in scan:
        hit = CRAWLINFRA_DISK_DICT.search(log_line)
        if hit is None:
            continue
        queue_sizes.append(int(hit.group(1)))
        queue_timestamps.append(get_line_epoch(log_line))

    return queue_sizes, queue_timestamps
Esempio n. 18
0
def get_queue_size_grep_data(scan_log_filename, scan):
    """
    Extract the grep queue size samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    sizes, stamps = [], []

    for record in scan:
        found = GREP_DISK_DICT.search(record)
        if found:
            sizes.append(int(found.group(1)))
            stamps.append(get_line_epoch(record))

    return sizes, stamps
Esempio n. 19
0
def get_connection_pool_wait_data(scan_log_filename, scan):
    """
    Extract the connection pool wait time samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (wait times, timestamps) as two parallel lists
    """
    scan.seek(0)

    waits = []
    wait_stamps = []

    for entry in scan:
        found = CONNECTION_POOL_WAIT.search(entry)
        if found is None:
            continue
        waits.append(float(found.group(1)))
        wait_stamps.append(get_line_epoch(entry))

    return waits, wait_stamps
Esempio n. 20
0
def get_queue_size_grep_data(scan_log_filename, scan):
    """
    Read the grep queue sizes, with timestamps, from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (queue sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    queue_sizes = []
    queue_timestamps = []

    for log_line in scan:
        hit = GREP_DISK_DICT.search(log_line)
        if hit is None:
            continue
        queue_sizes.append(int(hit.group(1)))
        queue_timestamps.append(get_line_epoch(log_line))

    return queue_sizes, queue_timestamps
Esempio n. 21
0
def get_worker_pool_size_data(scan_log_filename, scan):
    """
    Extract the worker pool size samples from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (pool sizes, timestamps) as two parallel lists
    """
    scan.seek(0)

    pool_sizes = []
    pool_stamps = []

    for entry in scan:
        found = WORKER_POOL_SIZE.search(entry)
        if found:
            pool_sizes.append(int(found.group(1)))
            pool_stamps.append(get_line_epoch(entry))

    return pool_sizes, pool_stamps
Esempio n. 22
0
def get_parser_errors_data(scan_log_filename, scan):
    """
    Count parser timeout and parser memory-limit errors over time.

    Each hit appends the running total together with the line timestamp,
    producing cumulative series suitable for plotting.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (timeout count, timeout series, timeout timestamps,
              memory count, memory series, memory timestamps)
    """
    scan.seek(0)

    timeout_total = 0
    timeout_series = []
    timeout_stamps = []

    memory_total = 0
    memory_series = []
    memory_stamps = []

    for entry in scan:
        if PARSER_TIMEOUT in entry:
            timeout_total += 1
            timeout_series.append(timeout_total)
            timeout_stamps.append(get_line_epoch(entry))

        if PARSER_MEMORY_LIMIT in entry:
            memory_total += 1
            memory_series.append(memory_total)
            memory_stamps.append(get_line_epoch(entry))

    return (timeout_total, timeout_series, timeout_stamps,
            memory_total, memory_series, memory_stamps)
Esempio n. 23
0
def get_not_found_requests_over_time_data(scan_log_filename, scan):
    """
    Count "not found" requests per ~minute bucket of the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: A list with the number of not-found requests per minute bucket.
             Requests from the trailing, incomplete bucket are not included.
    """
    scan.seek(0)

    per_minute = []
    bucket_count = 0

    # The first line's timestamp opens the first bucket
    bucket_start = get_line_epoch(scan.readline())

    for record in scan:
        if NOT_FOUND_REQUEST in record:
            bucket_count += 1

        try:
            record_epoch = get_line_epoch(record)
        except InvalidTimeStamp:
            continue

        if record_epoch - bucket_start > 60:
            bucket_start = record_epoch
            per_minute.append(bucket_count)
            bucket_count = 0

    return per_minute
Esempio n. 24
0
def show_queue_size_audit(scan_log_filename, scan):
    """
    Plot the audit consumer queue size over time.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, the plot is printed to the output
    """
    scan.seek(0)

    queue_sizes = []
    queue_stamps = []

    for entry in scan:
        found = AUDITOR_DISK_DICT.search(entry)
        if found is None:
            continue
        queue_sizes.append(int(found.group(1)))
        queue_stamps.append(get_line_epoch(entry))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_time_epoch = last_timestamp - first_timestamp
    queue_stamps = [stamp - first_timestamp for stamp in queue_stamps]

    if not queue_sizes:
        print('No audit consumer queue size data found')
        print('')
        return

    print('Audit consumer queue size')
    print('    Latest queue size value: %s' % queue_sizes[-1])
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Items in Audit queue'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(queue_stamps, queue_sizes, label='Audit')

    print(fig.show())
    print('')
    print('')
Esempio n. 25
0
def show_extended_urllib_error_rate(scan_log_filename, scan):
    """
    Plot the extended urllib error rate over time.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, all output is printed
    """
    # Consistency fix: rewind the handler before reading, like the other
    # show_*() helpers do; otherwise a previous consumer leaves it at EOF
    scan.seek(0)

    error_rate = []
    error_rate_timestamps = []

    for line in scan:
        match = EXTENDED_URLLIB_ERRORS_RE.search(line)
        if match:
            error_rate.append(int(match.group(1)))
            error_rate_timestamps.append(get_line_epoch(line))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_epoch = last_timestamp - first_timestamp
    error_rate_timestamps = [
        ts - first_timestamp for ts in error_rate_timestamps
    ]

    if not error_rate:
        print('No error rate information found')
        print('')
        return

    print('Extended URL library error rate')
    print('    Error rate exceeded 10%%: %s' % (max(error_rate) > 10, ))
    # Bug fix: this line used to compare against 10 (copy-paste error),
    # making the 20% check identical to the 10% one
    print('    Error rate exceeded 20%%: %s' % (max(error_rate) > 20, ))
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Error rate'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_epoch)
    fig.set_y_limits(min_=0, max_=max(error_rate) * 1.1)

    fig.plot(error_rate_timestamps, error_rate, label='Error rate')

    print(fig.show())
    print('')
    print('')
Esempio n. 26
0
def show_parser_process_memory_limit(scan_log_filename, scan):
    """
    Plot the parser process memory limit (MB) over time.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, the plot is printed to the output
    """
    scan.seek(0)

    limits = []
    limit_stamps = []

    for entry in scan:
        found = PARSER_PROCESS_MEMORY_LIMIT.search(entry)
        if found is None:
            continue
        limits.append(int(found.group(1)))
        limit_stamps.append(get_line_epoch(entry))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_epoch = last_timestamp - first_timestamp
    limit_stamps = [stamp - first_timestamp for stamp in limit_stamps]

    if not limits:
        print('No parser process memory limit information found')
        return

    print('Parser process memory limit')
    print('    Latest memory limit: %s MB' % limits[-1])
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Parser memory limit (MB)'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_epoch)
    fig.set_y_limits(min_=0, max_=max(limits) * 1.1)

    fig.plot(limit_stamps, limits, label='Memory limit')

    print(fig.show())
    print('')
    print('')
Esempio n. 27
0
def show_worker_pool_size(scan_log_filename, scan):
    """
    Plot the worker pool size over time.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, the plot is printed to the output
    """
    scan.seek(0)

    pool_sizes = []
    pool_stamps = []

    for entry in scan:
        found = WORKER_POOL_SIZE.search(entry)
        if found is None:
            continue
        pool_sizes.append(int(found.group(1)))
        pool_stamps.append(get_line_epoch(entry))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_time_epoch = last_timestamp - first_timestamp
    pool_stamps = [stamp - first_timestamp for stamp in pool_stamps]

    if not pool_sizes:
        print('No worker pool size data found')
        return

    print('Worker pool size over time')
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Worker pool size'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(pool_stamps, pool_sizes, label='Workers')

    print(fig.show())
    print('')
    print('')
Esempio n. 28
0
def get_should_grep_data(scan_log_filename, scan):
    """
    Extract the should_grep() statistics dictionaries from the scan log.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: (list of parsed stats dicts, list of timestamps)
    """
    scan.seek(0)

    should_grep_data = []
    should_grep_timestamps = []

    for line in scan:
        match = SHOULD_GREP_STATS.search(line)
        if not match:
            continue

        try:
            stats_dict = to_dict(match.group(1))
        # Bug fix: this used to be a bare `except:`, which also swallows
        # SystemExit and KeyboardInterrupt; catch Exception instead so the
        # best-effort parse still skips malformed entries
        except Exception:
            print('Warning: %s is not valid JSON' % match.group(1))
            continue
        else:
            should_grep_data.append(stats_dict)
            should_grep_timestamps.append(get_line_epoch(line))

    return should_grep_data, should_grep_timestamps
Esempio n. 29
0
def show_timeout(scan_log_filename, scan):
    """
    Plot the socket timeout value over time.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, the plot is printed to the output
    """
    scan.seek(0)

    timeout_values = []
    timeout_stamps = []

    for entry in scan:
        found = SOCKET_TIMEOUT.search(entry)
        if found is None:
            continue
        timeout_values.append(float(found.group(1)))
        timeout_stamps.append(get_line_epoch(entry))

    # Re-base the timestamps on the scan start so the x axis begins at zero
    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_time_epoch = last_timestamp - first_timestamp
    timeout_stamps = [stamp - first_timestamp for stamp in timeout_stamps]

    if not timeout_values:
        print('No socket timeout data found')
        return

    print('Socket timeout over time')
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Socket timeout'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(timeout_stamps, timeout_values, label='Timeout')

    print(fig.show())
    print('')
    print('')
Esempio n. 30
0
def show_progress_delta(scan_log_filename, scan):
    """
    Compare the estimated progress (derived from the logged ETA values)
    against the real progress (derived from the actual start and end
    timestamps) for the crawl, audit and grep phases, and for the overall
    scan.

    :param scan_log_filename: The scan log filename (unused)
    :param scan: The file handler for the scan log
    :return: None, all plots are printed to the output
    """
    first_timestamp = get_first_timestamp(scan)

    #
    # Find the end times for crawl, audit, grep
    #
    scan.seek(0)

    crawl_end_timestamp = None
    audit_end_timestamp = None
    grep_end_timestamp = None

    for line in scan:
        if CRAWL_INFRA_FINISHED in line:
            crawl_end_timestamp = get_line_epoch(line)

        # Audit / grep end times come from the consumer "join" messages
        if 'seconds to join' not in line:
            continue

        match = JOIN_TIMES.search(line)
        if match:
            if 'audit' in line.lower():
                audit_end_timestamp = get_line_epoch(line)
            if 'grep' in line.lower():
                grep_end_timestamp = get_line_epoch(line)

    #
    # Find the crawl, audit and grep progress estimations
    #
    scan.seek(0)

    crawl_progress = []
    crawl_progress_timestamps = []

    audit_progress = []
    audit_progress_timestamps = []

    grep_progress = []
    grep_progress_timestamps = []

    for line in scan:
        match = CALCULATED_ETA.search(line)
        if match:
            ts = get_line_epoch(line)

            eta = match.group(2)
            if eta == 'None':
                eta = '0.0'

            eta = float(eta)
            # Estimated completion: elapsed / (elapsed + remaining) * 100.
            # NOTE(review): divides by zero when ts == first_timestamp and
            # eta == 0 — presumably never happens in real logs; confirm
            percentage = (ts - first_timestamp) / (ts - first_timestamp + eta) * 100

            if 'crawl' in line.lower():
                crawl_progress_timestamps.append(ts)
                crawl_progress.append(percentage)

            if 'audit' in line.lower():
                audit_progress_timestamps.append(ts)
                audit_progress.append(percentage)

            if 'grep' in line.lower():
                grep_progress_timestamps.append(ts)
                grep_progress.append(percentage)

    # Make the timestamps relative to the scan start
    crawl_progress_timestamps = [ts - first_timestamp for ts in crawl_progress_timestamps]
    audit_progress_timestamps = [ts - first_timestamp for ts in audit_progress_timestamps]
    grep_progress_timestamps = [ts - first_timestamp for ts in grep_progress_timestamps]

    #
    # Find the overall progress estimations
    #
    scan.seek(0)

    progress = []
    progress_timestamps = []

    for line in scan:
        match = SCAN_PROGRESS.search(line)
        if match:
            progress.append(int(match.group(1)))
            progress_timestamps.append(get_line_epoch(line))

    # Make the overall progress timestamps relative to the scan start too
    progress_timestamps = [ts - first_timestamp for ts in progress_timestamps]

    scan.seek(0)

    first_timestamp = get_first_timestamp(scan)
    finished_timestamp = None

    for line in scan:
        match = SCAN_FINISHED_IN.search(line)
        if match:
            finished_timestamp = get_line_epoch(line)

    # Fall back to the last log timestamp when no "scan finished" line exists
    finished_timestamp = finished_timestamp or get_last_timestamp(scan)
    spent_time_epoch = finished_timestamp - first_timestamp

    print('Progress delta (estimated vs. real)')
    print('')

    # NOTE(review): the three plotting sections below use num_formatter
    # while every other function in this file uses _num_formatter — confirm
    # num_formatter is defined at module level

    if crawl_progress and crawl_end_timestamp is not None:
        fig = plotille.Figure()
        fig.width = 90
        fig.height = 20
        fig.register_label_formatter(float, num_formatter)
        fig.register_label_formatter(int, num_formatter)
        fig.y_label = 'Progress'
        fig.x_label = 'Time'
        fig.color_mode = 'byte'
        fig.set_x_limits(min_=0, max_=spent_time_epoch)
        fig.set_y_limits(min_=0, max_=None)

        fig.plot(crawl_progress_timestamps,
                 crawl_progress,
                 label='Crawl (estimated)')

        # "Real" progress: a straight line from 0% to 100% over the phase
        # duration, sampled once per second
        crawl_real_spent = int(crawl_end_timestamp) - int(first_timestamp)
        crawl_real_progress_timestamps = range(int(first_timestamp),
                                               int(crawl_end_timestamp),
                                               1)
        crawl_real_progress_timestamps = [ts - first_timestamp for ts in crawl_real_progress_timestamps]

        crawl_real_progress = []

        for ts in crawl_real_progress_timestamps:
            crawl_real_progress.append(float(ts) / crawl_real_spent * 100)

        fig.plot(crawl_real_progress_timestamps,
                 crawl_real_progress,
                 label='Crawl (real)')

        print(fig.show(legend=True))
        print('')
        print('')

    if audit_progress and audit_end_timestamp is not None:
        fig = plotille.Figure()
        fig.width = 90
        fig.height = 20
        fig.register_label_formatter(float, num_formatter)
        fig.register_label_formatter(int, num_formatter)
        fig.y_label = 'Progress'
        fig.x_label = 'Time'
        fig.color_mode = 'byte'
        fig.set_x_limits(min_=0, max_=spent_time_epoch)
        fig.set_y_limits(min_=0, max_=None)

        fig.plot(audit_progress_timestamps,
                 audit_progress,
                 label='Audit (estimated)')

        # Linear 0-100% reference line for the audit phase
        audit_real_spent = int(audit_end_timestamp) - int(first_timestamp)
        audit_real_progress_timestamps = range(int(first_timestamp),
                                               int(audit_end_timestamp),
                                               1)
        audit_real_progress_timestamps = [ts - first_timestamp for ts in audit_real_progress_timestamps]

        audit_real_progress = []

        for ts in audit_real_progress_timestamps:
            audit_real_progress.append(float(ts) / audit_real_spent * 100)

        fig.plot(audit_real_progress_timestamps,
                 audit_real_progress,
                 label='Audit (real)')

        print(fig.show(legend=True))
        print('')
        print('')

    if grep_progress and grep_end_timestamp is not None:
        fig = plotille.Figure()
        fig.width = 90
        fig.height = 20
        fig.register_label_formatter(float, num_formatter)
        fig.register_label_formatter(int, num_formatter)
        fig.y_label = 'Progress'
        fig.x_label = 'Time'
        fig.color_mode = 'byte'
        fig.set_x_limits(min_=0, max_=spent_time_epoch)
        fig.set_y_limits(min_=0, max_=None)

        fig.plot(grep_progress_timestamps,
                 grep_progress,
                 label='Grep (estimated)')

        # Linear 0-100% reference line for the grep phase
        grep_real_spent = int(grep_end_timestamp) - int(first_timestamp)
        grep_real_progress_timestamps = range(int(first_timestamp),
                                              int(grep_end_timestamp),
                                              1)
        grep_real_progress_timestamps = [ts - first_timestamp for ts in grep_real_progress_timestamps]

        grep_real_progress = []

        for ts in grep_real_progress_timestamps:
            grep_real_progress.append(float(ts) / grep_real_spent * 100)

        fig.plot(grep_real_progress_timestamps,
                 grep_real_progress,
                 label='Grep (real)')

        print(fig.show(legend=True))
        print('')
        print('')

    if not progress:
        print('No progress data to calculate deltas (requirement: enable xml_file plugin)')
        return

    if finished_timestamp is None:
        print('The scan did not finish. Can not show progress delta.')
        return

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, num_formatter)
    fig.register_label_formatter(int, num_formatter)
    fig.y_label = 'Progress'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)

    fig.plot(progress_timestamps,
             progress,
             label='Overall (estimated)')

    # Real overall progress: fraction of total scan time elapsed
    real_progress = []
    for ts in progress_timestamps:
        real_progress_i = ts / float(spent_time_epoch) * 100
        real_progress.append(real_progress_i)

    fig.plot(progress_timestamps,
             real_progress,
             label='Overall (real)')

    print(fig.show(legend=True))
    print('')
    print('')
Esempio n. 31
0
def show_consumer_pool_size(scan_log_filename, scan):
    """
    Plot the percentage of idle workers over time for the audit pool, the
    crawl-infra pool and the core worker pool, as extracted from the
    IDLE_CONSUMER_WORKERS lines in the scan log.

    :param scan_log_filename: Path to the scan log (unused, kept for the
                              common show_* function signature)
    :param scan: The open file handler for the scan log
    :return: None, everything is printed to the console
    """
    scan.seek(0)

    consumer_pool_perc_audit = []
    consumer_pool_timestamps_audit = []

    consumer_pool_perc_crawl = []
    consumer_pool_timestamps_crawl = []

    worker_pool_perc = []
    worker_pool_timestamps = []

    for line in scan:
        match = IDLE_CONSUMER_WORKERS.search(line)
        if not match:
            continue

        percent = int(match.group(1))
        pool_name = match.group(2).lower()

        if 'audit' in pool_name:
            consumer_pool_perc_audit.append(percent)
            consumer_pool_timestamps_audit.append(get_line_epoch(line))
        elif 'crawl' in pool_name:
            consumer_pool_perc_crawl.append(percent)
            consumer_pool_timestamps_crawl.append(get_line_epoch(line))
        else:
            # Lines which name neither pool belong to the core worker pool
            worker_pool_perc.append(percent)
            worker_pool_timestamps.append(get_line_epoch(line))

    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_time_epoch = last_timestamp - first_timestamp

    # Make the timestamps relative to the scan start
    consumer_pool_timestamps_audit = [
        ts - first_timestamp for ts in consumer_pool_timestamps_audit
    ]
    consumer_pool_timestamps_crawl = [
        ts - first_timestamp for ts in consumer_pool_timestamps_crawl
    ]
    worker_pool_timestamps = [
        ts - first_timestamp for ts in worker_pool_timestamps
    ]

    if not consumer_pool_perc_audit and not consumer_pool_perc_crawl:
        print('No thread pool data found')
        return

    print('Idle thread pool workers over time')

    # Bug fix: the core worker pool might have no samples even when the
    # audit / crawl pools do, so guard the [-1] access to avoid IndexError
    if worker_pool_perc:
        print('    Latest idle core workers %s%%' % worker_pool_perc[-1])

    if consumer_pool_perc_audit:
        print('    Latest idle audit workers %s%%' %
              consumer_pool_perc_audit[-1])

    if consumer_pool_perc_crawl:
        print('    Latest idle crawl-infra workers %s%%' %
              consumer_pool_perc_crawl[-1])

    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Idle worker (%)'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    # Idle percentages are 0-100; use 101 so a flat 100% line stays visible
    fig.set_y_limits(min_=0, max_=101)

    fig.plot(consumer_pool_timestamps_audit,
             consumer_pool_perc_audit,
             label='Idle audit workers',
             lc=50)

    fig.plot(consumer_pool_timestamps_crawl,
             consumer_pool_perc_crawl,
             label='Idle crawl workers',
             lc=170)

    fig.plot(worker_pool_timestamps,
             worker_pool_perc,
             label='Idle core workers',
             lc=250)

    print(fig.show(legend=True))
    print('')
    print('')
# --- Esempio n. 32 (0) ---
def _new_progress_figure(spent_time_epoch):
    """
    Create a plotille figure pre-configured for progress-over-time charts.

    :param spent_time_epoch: Total scan duration in seconds, used as x max
    :return: A configured plotille.Figure instance
    """
    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Progress'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_time_epoch)
    fig.set_y_limits(min_=0, max_=None)
    return fig


def _show_consumer_progress_delta(name, progress, progress_timestamps,
                                  end_timestamp, first_timestamp,
                                  spent_time_epoch):
    """
    Print one estimated vs. real progress chart for a single consumer
    (Crawl, Audit or Grep).

    The "real" progress line is a straight line from 0% at scan start to
    100% at the consumer's end timestamp.

    :param name: Consumer name used in the plot labels
    :param progress: Estimated progress percentages
    :param progress_timestamps: Timestamps (relative to scan start) for
                                each estimated progress sample
    :param end_timestamp: Absolute epoch when the consumer finished, or
                          None when it never did
    :param first_timestamp: Absolute epoch of the first scan log line
    :param spent_time_epoch: Total scan duration in seconds
    :return: None, the chart is printed to the console
    """
    if not progress or end_timestamp is None:
        return

    fig = _new_progress_figure(spent_time_epoch)

    fig.plot(progress_timestamps,
             progress,
             label='%s (estimated)' % name)

    real_spent = int(end_timestamp) - int(first_timestamp)

    # One sample per second between scan start and consumer end; the range
    # is empty when the consumer finished within the first second, which
    # also avoids dividing by a zero real_spent below
    real_timestamps = [ts - first_timestamp
                       for ts in range(int(first_timestamp),
                                       int(end_timestamp),
                                       1)]

    real_progress = [float(ts) / real_spent * 100 for ts in real_timestamps]

    fig.plot(real_timestamps,
             real_progress,
             label='%s (real)' % name)

    print(fig.show(legend=True))
    print('')
    print('')


def show_progress_delta(scan_log_filename, scan):
    """
    Show estimated vs. real progress charts for the crawl, audit and grep
    consumers and for the overall scan.

    The estimated progress is reconstructed from the ETA calculations that
    appear in the scan log; the real progress is derived from the actual
    start / end timestamps found in the log.

    :param scan_log_filename: Path to the scan log (unused, kept for the
                              common show_* function signature)
    :param scan: The open file handler for the scan log
    :return: None, everything is printed to the console
    """
    first_timestamp = get_first_timestamp(scan)

    #
    # Find the end times for crawl, audit, grep
    #
    scan.seek(0)

    crawl_end_timestamp = None
    audit_end_timestamp = None
    grep_end_timestamp = None

    for line in scan:
        if CRAWL_INFRA_FINISHED in line:
            crawl_end_timestamp = get_line_epoch(line)

        if 'seconds to join' not in line:
            continue

        match = JOIN_TIMES.search(line)
        if match:
            if 'audit' in line.lower():
                audit_end_timestamp = get_line_epoch(line)
            if 'grep' in line.lower():
                grep_end_timestamp = get_line_epoch(line)

    #
    # Find the crawl, audit and grep progress estimations
    #
    scan.seek(0)

    crawl_progress = []
    crawl_progress_timestamps = []

    audit_progress = []
    audit_progress_timestamps = []

    grep_progress = []
    grep_progress_timestamps = []

    for line in scan:
        match = CALCULATED_ETA.search(line)
        if not match:
            continue

        ts = get_line_epoch(line)

        eta = match.group(2)
        if eta == 'None':
            eta = '0.0'

        eta = float(eta)
        elapsed = ts - first_timestamp

        # Bug fix: when the ETA line appears in the very first second of
        # the scan with a zero ETA, elapsed + eta is 0 and the original
        # formula raised ZeroDivisionError; a zero ETA means "done"
        if elapsed + eta == 0:
            percentage = 100.0
        else:
            percentage = elapsed / (elapsed + eta) * 100

        if 'crawl' in line.lower():
            crawl_progress_timestamps.append(ts)
            crawl_progress.append(percentage)

        if 'audit' in line.lower():
            audit_progress_timestamps.append(ts)
            audit_progress.append(percentage)

        if 'grep' in line.lower():
            grep_progress_timestamps.append(ts)
            grep_progress.append(percentage)

    # Make the timestamps relative to the scan start
    crawl_progress_timestamps = [
        ts - first_timestamp for ts in crawl_progress_timestamps
    ]
    audit_progress_timestamps = [
        ts - first_timestamp for ts in audit_progress_timestamps
    ]
    grep_progress_timestamps = [
        ts - first_timestamp for ts in grep_progress_timestamps
    ]

    #
    # Find the overall progress estimations
    #
    scan.seek(0)

    progress = []
    progress_timestamps = []

    for line in scan:
        match = SCAN_PROGRESS.search(line)
        if match:
            progress.append(int(match.group(1)))
            progress_timestamps.append(get_line_epoch(line))

    progress_timestamps = [ts - first_timestamp for ts in progress_timestamps]

    #
    # Find the scan end time, falling back to the last log line when the
    # scan never printed its "finished in" message
    #
    scan.seek(0)

    finished_timestamp = None

    for line in scan:
        match = SCAN_FINISHED_IN.search(line)
        if match:
            finished_timestamp = get_line_epoch(line)

    finished_timestamp = finished_timestamp or get_last_timestamp(scan)
    spent_time_epoch = finished_timestamp - first_timestamp

    print('Progress delta (estimated vs. real)')
    print('')

    _show_consumer_progress_delta('Crawl', crawl_progress,
                                  crawl_progress_timestamps,
                                  crawl_end_timestamp,
                                  first_timestamp, spent_time_epoch)

    _show_consumer_progress_delta('Audit', audit_progress,
                                  audit_progress_timestamps,
                                  audit_end_timestamp,
                                  first_timestamp, spent_time_epoch)

    _show_consumer_progress_delta('Grep', grep_progress,
                                  grep_progress_timestamps,
                                  grep_end_timestamp,
                                  first_timestamp, spent_time_epoch)

    if not progress:
        print(
            'No progress data to calculate deltas (requirement: enable xml_file plugin)'
        )
        return

    if finished_timestamp is None:
        print('The scan did not finish. Can not show progress delta.')
        return

    fig = _new_progress_figure(spent_time_epoch)

    fig.plot(progress_timestamps, progress, label='Overall (estimated)')

    # Bug fix: guard against ZeroDivisionError for scans that start and
    # finish within the same second
    if spent_time_epoch:
        real_progress = [ts / float(spent_time_epoch) * 100
                         for ts in progress_timestamps]
        fig.plot(progress_timestamps, real_progress, label='Overall (real)')

    print(fig.show(legend=True))
    print('')
    print('')
# --- Esempio n. 33 (0) ---
def show_parser_errors(scan_log_filename, scan):
    """
    Plot the cumulative count of parser timeout and parser memory-limit
    errors over time, as found in the scan log.

    :param scan_log_filename: Path to the scan log (unused, kept for the
                              common show_* function signature)
    :param scan: The open file handler for the scan log
    :return: None, everything is printed to the console
    """
    scan.seek(0)

    timeout_count = 0
    timeout_errors = []
    timeout_errors_timestamps = []

    memory_count = 0
    memory_errors = []
    memory_errors_timestamps = []

    for line in scan:
        if PARSER_TIMEOUT in line:
            timeout_count += 1
            timeout_errors.append(timeout_count)
            timeout_errors_timestamps.append(get_line_epoch(line))

        if PARSER_MEMORY_LIMIT in line:
            memory_count += 1
            memory_errors.append(memory_count)
            memory_errors_timestamps.append(get_line_epoch(line))

    first_timestamp = get_first_timestamp(scan)
    last_timestamp = get_last_timestamp(scan)
    spent_epoch = last_timestamp - first_timestamp

    # Make the timestamps relative to the scan start
    timeout_errors_timestamps = [
        ts - first_timestamp for ts in timeout_errors_timestamps
    ]
    memory_errors_timestamps = [
        ts - first_timestamp for ts in memory_errors_timestamps
    ]

    if not memory_errors and not timeout_errors:
        print('No parser errors found')
        print('')
        return

    print('Parser errors')
    print('    Timeout errors: %s' % timeout_count)
    print('    Memory errors: %s' % memory_count)
    print('')

    fig = plotille.Figure()
    fig.width = 90
    fig.height = 20
    fig.register_label_formatter(float, _num_formatter)
    fig.register_label_formatter(int, _num_formatter)
    fig.y_label = 'Parser errors'
    fig.x_label = 'Time'
    fig.color_mode = 'byte'
    fig.set_x_limits(min_=0, max_=spent_epoch)
    fig.set_y_limits(min_=0, max_=max(memory_count, timeout_count))

    # Bug fix: plotille's Figure.plot() takes (x, y); the original call
    # passed the error counts as x and the timestamps as y, which put
    # time on the y axis despite x_label being 'Time'. With the axes
    # fixed, the x limit above can be enabled again.
    fig.plot(timeout_errors_timestamps,
             timeout_errors,
             label='Timeout errors',
             lc=50)

    fig.plot(memory_errors_timestamps,
             memory_errors,
             label='Memory errors',
             lc=200)

    print(fig.show(legend=True))
    print('')
    print('')