# Imports inferred from usage below; the project-local modules (copy_logs,
# metrics, parse_event_logs, shuffle_job_filterer, utils) are assumed to be
# importable from alongside these scripts.
import functools
import os
from os import path
import re
import sys

import numpy

import copy_logs
import metrics
import parse_event_logs
import shuffle_job_filterer
import utils


def main(argv):
    (local_event_log_file, continuous_monitor_file) = copy_logs.copy_logs(argv)
    analyzer = parse_event_logs.Analyzer(local_event_log_file,
                                         shuffle_job_filterer.filter)
    analyzer.output_utilizations(local_event_log_file)
    analyzer.output_load_balancing_badness(local_event_log_file)
    analyzer.output_runtimes(local_event_log_file)
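
# The shuffle_job_filterer module used above is not shown in this file. Based
# on its name, and on how filterers are used elsewhere in these scripts (a
# function from a dict of jobs to a filtered dict of jobs), a minimal sketch
# might look like the following; the has_shuffle_read() check is an assumption.
def _shuffle_job_filter_sketch(all_jobs_dict):
    return {job_id: job for job_id, job in all_jobs_dict.items()
            if any(stage.has_shuffle_read()
                   for stage in job.stages.values())}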
def __get_jcts_from_logs(log_dir, warmup_count):
    """
  Returns a tuple of (list of write job JCTs, list of read job JCTs) parsed from the event log
  contained in the provided directory.
  """
    event_log_filepath = path.join(log_dir, "event_log")
    sorted_job_pairs = sorted(
        parse_event_logs.Analyzer(event_log_filepath).jobs.iteritems())
    return (__get_jcts_for_phase(sorted_job_pairs, warmup_count,
                                 phase="write"),
            __get_jcts_for_phase(sorted_job_pairs, warmup_count, phase="read"))
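
# __get_jcts_for_phase is referenced above but not shown in this file. A
# minimal sketch, assuming that write and read jobs alternate (writes at even
# positions) and that the first warmup_count jobs of each phase are dropped;
# both assumptions are inferred from the call site rather than confirmed by
# the source.
def __get_jcts_for_phase(sorted_job_pairs, warmup_count, phase):
    start_index = 0 if phase == "write" else 1
    phase_jobs = [job for _, job in sorted_job_pairs][start_index::2]
    # Drop the warmup jobs and convert runtimes from milliseconds to seconds.
    return [float(job.runtime()) / 1000 for job in phase_jobs[warmup_count:]]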
def __get_num_tasks_to_jcts(log_dir, num_warmup_trials):
    """
  Returns a mapping from number of tasks to a list of the JCTs from the jobs that used that number
  of tasks.
  """
    num_tasks_to_event_log = __get_num_tasks_to_event_log(log_dir)
    partial_filterer = functools.partial(__filterer, num_warmup_trials)
    return {
        num_tasks: [
            float(job.runtime()) / 1000 for job in parse_event_logs.Analyzer(
                event_log, partial_filterer).jobs.itervalues()
        ]
        for num_tasks, event_log in num_tasks_to_event_log.iteritems()
    }
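
# Neither __get_num_tasks_to_event_log nor __filterer appears in this file.
# The sketches below are assumptions inferred from the call sites: the
# directory naming convention (experiment_log_<number>_...) is a guess modeled
# on the directories parsed in main() below, and filterers are assumed to map
# a dict of jobs to a filtered dict of jobs.
def __get_num_tasks_to_event_log(log_dir):
    num_tasks_to_event_log = {}
    for dirname in os.listdir(log_dir):
        match = re.search("experiment_log_([0-9]+)_", dirname)
        if match is not None:
            num_tasks_to_event_log[int(match.group(1))] = path.join(
                log_dir, dirname, "event_log")
    return num_tasks_to_event_log


def __filterer(num_warmup_trials, all_jobs_dict):
    # Keep only the jobs that come after the warmup trials (by job id order).
    sorted_job_ids = sorted(all_jobs_dict.keys())
    return {job_id: all_jobs_dict[job_id]
            for job_id in sorted_job_ids[num_warmup_trials:]}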
def __add_jct_results(data_file, event_log, query_name, num_warmup_trials,
                      x_coordinate, estimate):
    """
  Parses the provided event log, extracts the JCTs, and writes the min, median, and max JCTs to the
  provided data file.
  """
    # Each trial of queries 3abc and 4 consists of two jobs.,
    has_two_jobs_per_trial = ("3" in query_name) or ("4" in query_name)
    num_warmup_jobs = 2 * num_warmup_trials if has_two_jobs_per_trial else num_warmup_trials

    filterer = functools.partial(__drop_warmup_filterer, num_warmup_jobs)
    analyzer = parse_event_logs.Analyzer(event_log, filterer)
    if not estimate:
        analyzer.output_stage_resource_metrics(event_log)
        analyzer.output_job_resource_metrics(event_log)
        analyzer.output_utilizations(event_log)
        analyzer.output_ideal_time_metrics(event_log)
        jcts = [job.runtime() for _, job in sorted(analyzer.jobs.iteritems())]
    else:
        jcts = []
        for _, job in sorted(analyzer.jobs.iteritems()):
            job_runtime = job.runtime()
            for _, stage in job.stages.iteritems():
                (cpu, network, disk) = stage.get_ideal_times_from_metrics(10)
                ser = stage.get_ideal_ser_deser_time_s()
                if ser > 0:
                    disk_read = stage.get_disk_read_time_s()
                    print "ser time is", ser, "and disk read is", disk_read, "of", disk
                    # Estimate how much faster the stage's ideal time would be
                    # without the (de)serialization time and the associated
                    # disk reads, then scale the stage's actual runtime by the
                    # same ratio.
                    old_ideal = max(cpu, network, disk)
                    new_ideal = max(cpu - ser, network, disk - disk_read)
                    print "old ideal", old_ideal, "new ideal", new_ideal
                    multiplier = float(new_ideal) / old_ideal
                    stage_time = multiplier * stage.runtime()
                    # Adjusting the job's runtime by the change in this stage's
                    # runtime avoids needing to deal with concurrent stages.
                    print "Adjusting job runtime from", job_runtime
                    job_runtime = job_runtime - stage.runtime() + stage_time
                    print "to", job_runtime
            jcts.append(job_runtime)

    if has_two_jobs_per_trial:
        # We sum adjacent JCTs together in order to get the total JCT for each trial.
        jcts = __sum_adjacent_items(jcts)
    data_values = [numpy.median(jcts), min(jcts), max(jcts)]
    data_file.write(__build_data_line(query_name, x_coordinate, data_values))
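
# The helpers referenced above (__drop_warmup_filterer, __sum_adjacent_items,
# and __build_data_line) are not shown in this file. The sketches below are
# inferred from the call sites; in particular, the tab-separated line format
# in __build_data_line is an assumption.
def __drop_warmup_filterer(num_warmup_jobs, all_jobs_dict):
    # Discard the first num_warmup_jobs jobs (by job id order).
    sorted_job_ids = sorted(all_jobs_dict.keys())
    return {job_id: all_jobs_dict[job_id]
            for job_id in sorted_job_ids[num_warmup_jobs:]}


def __sum_adjacent_items(items):
    # [a, b, c, d] -> [a + b, c + d]; assumes len(items) is even.
    return [items[i] + items[i + 1] for i in xrange(0, len(items), 2)]


def __build_data_line(query_name, x_coordinate, data_values):
    return "%s\t%s\t%s\n" % (query_name, x_coordinate,
                             "\t".join(str(value) for value in data_values))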
def main(argv):
    if len(argv) < 2:
        print(
            "Usage: parse_vary_num_tasks.py output_directory [opt (to copy data): driver_hostname "
            + "identity_file num_experiments [opt username]]")
        sys.exit(1)

    output_prefix = argv[1]
    if not os.path.exists(output_prefix):
        os.mkdir(output_prefix)

    num_cores = 8

    if len(argv) >= 5:
        driver_hostname = argv[2]
        if "millennium" in driver_hostname:
            # The millennium machines have 16 cores.
            num_cores = 16
        identity_file = argv[3]
        num_experiments = argv[4]
        if len(argv) >= 6:
            username = argv[5]
        else:
            username = "******"
        utils.copy_latest_zipped_logs(driver_hostname, identity_file,
                                      output_prefix, num_experiments, username)

    all_dirnames = [
        d for d in os.listdir(output_prefix)
        if "experiment" in d and "tar.gz" not in d
    ]
    all_dirnames.sort(
        key=lambda d: int(re.search('experiment_log_([0-9]*)_', d).group(1)))

    output_filename = os.path.join(output_prefix, "actual_runtimes")
    output_file = open(output_filename, "w")

    for dirname in all_dirnames:
        local_event_log_filename = os.path.join(output_prefix, dirname,
                                                "event_log")
        print "Parsing event log in %s" % local_event_log_filename
        analyzer = parse_event_logs.Analyzer(local_event_log_filename,
                                             job_filterer=filter)

        all_jobs = analyzer.jobs.values()
        num_tasks_values = [
            len(stage.tasks) for job in all_jobs
            for (stage_id, stage) in job.stages.iteritems()
        ]
        # Assumes all of the map and reduce stages use the same number of tasks.
        num_tasks = num_tasks_values[0]

        ideal_runtimes_millis = []
        ideal_map_runtimes_millis = []
        actual_map_runtimes_millis = []
        ideal_reduce_runtimes_millis = []
        actual_reduce_runtimes_millis = []

        for job in all_jobs:
            job_ideal_millis = 0
            for (stage_id, stage) in job.stages.iteritems():
                stage_ideal_millis = 1000 * stage.ideal_time_s(
                    metrics.AWS_M24XLARGE_MAX_NETWORK_GIGABITS_PER_S,
                    num_cores_per_executor=num_cores)
                job_ideal_millis += stage_ideal_millis
                if stage.has_shuffle_read():
                    ideal_reduce_runtimes_millis.append(stage_ideal_millis)
                    actual_reduce_runtimes_millis.append(stage.runtime())
                else:
                    ideal_map_runtimes_millis.append(stage_ideal_millis)
                    actual_map_runtimes_millis.append(stage.runtime())
            ideal_runtimes_millis.append(job_ideal_millis)

        print "Ideal runtimes:", ideal_runtimes_millis
        print "Ideal map runtimes:", ideal_map_runtimes_millis
        print "Ideal reduce runtimes:", ideal_reduce_runtimes_millis

        actual_runtimes_millis = [job.runtime() for job in all_jobs]
        actual_over_ideal = [
            actual / ideal for actual, ideal in zip(actual_runtimes_millis,
                                                    ideal_runtimes_millis)
        ]

        print "Actual runtimes:", actual_runtimes_millis
        # The inline comments below mark 1-indexed column numbers of the
        # medians in the output file (num_tasks is column 1), presumably for
        # reference by the gnuplot scripts. Note that the ideal-runtime
        # min/median/max values appear twice (columns 5-7 and 11-13).
        data_to_write = [
            num_tasks,
            min(actual_runtimes_millis),
            numpy.percentile(actual_runtimes_millis, 50),  # 3
            max(actual_runtimes_millis),
            min(ideal_runtimes_millis),
            numpy.percentile(ideal_runtimes_millis, 50),  # 6
            max(ideal_runtimes_millis),
            min(actual_over_ideal),
            numpy.percentile(actual_over_ideal, 50),  # 9
            max(actual_over_ideal),
            min(ideal_runtimes_millis),
            numpy.percentile(ideal_runtimes_millis, 50),  # 12
            max(ideal_runtimes_millis),
            min(actual_map_runtimes_millis),
            numpy.percentile(actual_map_runtimes_millis, 50),  # 15
            max(actual_map_runtimes_millis),
            min(ideal_map_runtimes_millis),
            numpy.percentile(ideal_map_runtimes_millis, 50),  # 18
            max(ideal_map_runtimes_millis),
            min(actual_reduce_runtimes_millis),
            numpy.percentile(actual_reduce_runtimes_millis, 50),  # 21
            max(actual_reduce_runtimes_millis),
            min(ideal_reduce_runtimes_millis),
            numpy.percentile(ideal_reduce_runtimes_millis, 50),  # 24
            max(ideal_reduce_runtimes_millis)
        ]
        output_file.write("\t".join([str(x) for x in data_to_write]))
        output_file.write("\n")
    output_file.close()

    plot(output_prefix, "actual_runtimes", "actual_runtimes.gp",
         "gnuplot_files/plot_vary_num_tasks_base.gp")
    plot(output_prefix, "actual_runtimes", "actual_runtimes_map_reduce.gp",
         "gnuplot_files/plot_vary_num_tasks_map_reduce_base.gp")