import os

import numpy

import parse_logs  # companion log-parsing module used by these snippets


def main(argv):
    disk_utilizations = []
    cpu_utilizations = []
    network_utilizations = []
    dirname = argv[0]
    for filename in os.listdir(dirname):
        full_name = os.path.join(dirname, filename)
        if os.path.isfile(full_name) and filename.endswith("job_log"):
            print "Reading %s" % filename
            analyzer = parse_logs.Analyzer(full_name)

            for (id, stage) in analyzer.stages.iteritems():
                for task in stage.tasks:
                    cpu_utilizations.append(task.total_cpu_utilization / 8.)
                    network_utilizations.append(
                        task.network_bytes_transmitted_ps /
                        (1000 * 1000 * 1000))
                    network_utilizations.append(
                        task.network_bytes_received_ps / (1000 * 1000 * 1000))
                    for name, block_device_numbers in task.disk_utilization.iteritems():
                        if name in ["xvdb", "xvdf"]:
                            disk_utilizations.append(block_device_numbers[0])

    output_filename = os.path.join(dirname, "cpu_disk_utilization_cdf")
    f = open(output_filename, "w")
    print max(network_utilizations)
    for percent in range(100):
        f.write("%s\t%s\t%s\t%s\n" %
                (percent / 100., numpy.percentile(cpu_utilizations, percent),
                 numpy.percentile(disk_utilizations, percent),
                 numpy.percentile(network_utilizations, percent)))
    f.close()
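
# A minimal entry-point sketch (not part of the original excerpt): main()
# treats argv[0] as the log directory, so a driver would pass sys.argv[1:].
if __name__ == "__main__":
    import sys
    main(sys.argv[1:])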
Example #2
def main(argv):
    map_utilizations = []
    reduce_utilizations = []
    all_utilizations = []
    dirname = argv[0]
    for filename in os.listdir(dirname):
        full_name = os.path.join(dirname, filename)
        if os.path.isfile(full_name) and filename.endswith("job_log"):
            print "Reading %s" % filename
            analyzer = parse_logs.Analyzer(full_name)

            for (id, stage) in analyzer.stages.iteritems():
                for task in stage.tasks:
                    for name, block_device_numbers in task.disk_utilization.iteritems():
                        if name in ["xvdb", "xvdf"]:
                            effective_util = 0
                            if block_device_numbers[0] > 0:
                                effective_util = (block_device_numbers[1] +
                                                  block_device_numbers[2]
                                                  ) / block_device_numbers[0]
                            all_utilizations.append(effective_util)
                            if task.has_fetch:
                                reduce_utilizations.append(effective_util)
                            else:
                                map_utilizations.append(effective_util)

    output_filename = os.path.join(dirname, "disk_utilization_cdf")
    f = open(output_filename, "w")
    for percent in range(100):
        f.write("%s\t%s\t%s\t%s\n" %
                (percent / 100., numpy.percentile(map_utilizations, percent),
                 numpy.percentile(reduce_utilizations, percent),
                 numpy.percentile(all_utilizations, percent)))
    f.close()
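
# A hedged plotting sketch (not from the original source): the CDF file written
# by main() above has four tab-separated columns (fraction, map, reduce, all),
# so it can be visualized with numpy and matplotlib if they are available.
def plot_disk_utilization_cdf(cdf_filename):
    import numpy
    import matplotlib.pyplot as plt
    data = numpy.loadtxt(cdf_filename)
    plt.plot(data[:, 1], data[:, 0], label="map")
    plt.plot(data[:, 2], data[:, 0], label="reduce")
    plt.plot(data[:, 3], data[:, 0], label="all")
    plt.xlabel("Effective disk utilization")
    plt.ylabel("Cumulative fraction of tasks")
    plt.legend()
    plt.savefig(cdf_filename + ".pdf")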
# The enclosing class statement is missing from this excerpt; "JobStats" is a
# placeholder name.
class JobStats(object):
    def __init__(self, filename):
        analyzer = parse_logs.Analyzer(filename)
        self.total_disk_input_mb = 0
        self.total_input_mb = 0
        self.total_shuffle_write_mb = 0
        self.total_shuffle_read_mb = 0
        self.total_output_mb = 0
        self.runtime = 0
        self.total_shuffle_time = 0
        self.total_reduce_time = 0
        self.total_reduce_cpu_time = 0
        self.total_runtime = 0
        self.total_cpu_time = 0
        for stage in analyzer.stages.values():
            self.total_disk_input_mb += sum([
                t.input_mb for t in stage.tasks
                if t.input_read_method != "Memory"
            ])
            self.total_input_mb += sum([t.input_mb for t in stage.tasks])
            self.total_shuffle_write_mb += sum(
                [t.shuffle_mb_written for t in stage.tasks])
            self.total_shuffle_read_mb += sum([
                t.remote_mb_read + t.local_mb_read for t in stage.tasks
                if t.has_fetch
            ])
            self.total_output_mb += sum([t.output_mb for t in stage.tasks])
            self.runtime += stage.finish_time() - stage.start_time
            self.total_shuffle_time += sum(
                [t.fetch_wait for t in stage.tasks if t.has_fetch])
            self.total_reduce_time += sum(
                [t.runtime() for t in stage.tasks if t.has_fetch])
            self.total_runtime += sum([t.runtime() for t in stage.tasks])
            self.total_cpu_time += sum([
                t.process_cpu_utilization * t.executor_run_time
                for t in stage.tasks
            ])
            self.total_reduce_cpu_time += sum([
                t.process_cpu_utilization * t.executor_run_time
                for t in stage.tasks if t.has_fetch
            ])
            # Comment this line in to estimate the effective CPU time when
            # multiple tasks are running concurrently:
            # [t.compute_time_without_gc() for t in stage.tasks if t.has_fetch])

        # Get the SQL query for this file.
        self.sql = ""
        for line in open(filename, "r"):
            if line.startswith("STAGE_ID"):
                break
            self.sql += line

        self.filename = filename

        self.num_joins = self.sql.lower().count("join")
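
# A hedged usage sketch (not from the original source; JobStats is the
# placeholder class name used above): summarize every job log in a directory.
def summarize_directory(dirname):
    for filename in os.listdir(dirname):
        full_name = os.path.join(dirname, filename)
        if os.path.isfile(full_name) and filename.endswith("job_log"):
            job = JobStats(full_name)
            print "%s: runtime %s, input %s MB, %s join(s)" % (
                job.filename, job.runtime, job.total_input_mb, job.num_joins)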
def main(argv):
  filename = argv[0]
  analyzer = parse_logs.Analyzer(filename)

  start_time = min([x.start_time for x in analyzer.stages.values()])

  for (id, stage) in analyzer.stages.iteritems():
    stage_filename = "%s_%s_utilization" % (filename, id)
    f = open(stage_filename, "w")

    for task in stage.tasks:
      items = [task.start_time, task.executor_run_time, task.total_cpu_utilization]
      for block_device_numbers in task.disk_utilization.values():
        items.extend(block_device_numbers)
      # 125000000 bytes/sec = 1 Gbit/sec, so this normalizes throughput to a
      # 1 Gbit/sec link.
      items.append(task.network_bytes_transmitted_ps / 125000000)
      items.append(task.network_bytes_received_ps / 125000000)
      write_data_to_file(items, f)
    f.close()    

    plot_base_file = open("utilization_scatter_base.gp", "r")
    plot_file = open("%s_%s_utilization.gp" % (filename, id), "w")
    for line in plot_base_file:
      plot_file.write(line)
    plot_base_file.close()
    plot_file.write("set output \"%s_%s_utilization.pdf\"\n" % (filename, id))
    plot_file.write("plot \"%s\" using ($1-%s):4 with p title \"Disk1\",\\\n" %
      (stage_filename, start_time))
    plot_file.write("\"%s\" using ($1-%s):7 with p title \"Disk2\",\\\n" %
      (stage_filename, start_time))
    plot_file.write("\"%s\" using ($1-%s):13 with p title\"Network T\",\\\n" %
      (stage_filename, start_time))
    plot_file.write("\"%s\" using ($1-%s):14 with p title\"Network R\",\\\n" %
      (stage_filename, start_time))
    plot_file.write("\"%s\" using ($1-%s):($3/8) with p title \"CPU\"\n" %
      (stage_filename, start_time))
    plot_file.close()
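
# A hedged follow-up sketch (not part of the original excerpt): the .gp files
# written above can be rendered by invoking gnuplot, assuming it is on the PATH.
def render_stage_plots(filename, stage_ids):
  import subprocess
  for id in stage_ids:
    subprocess.check_call(["gnuplot", "%s_%s_utilization.gp" % (filename, id)])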
Example #5
# The enclosing class statement is missing from this excerpt; "JobSummary" is a
# placeholder name.
class JobSummary(object):
    def __init__(self, filename):
        analyzer = parse_logs.Analyzer(filename)
        self.total_input_size = 0
        self.total_shuffle_mb = 0
        self.total_output_mb = 0
        self.runtime = 0
        self.no_disk_runtime = analyzer.no_disk_speedup()[2]
        for stage in analyzer.stages.values():
            self.total_input_size += sum([t.input_mb for t in stage.tasks])
            self.total_shuffle_mb += sum(
                [t.shuffle_mb_written for t in stage.tasks])
            self.total_output_mb += sum([t.output_mb for t in stage.tasks])
            self.runtime += stage.finish_time() - stage.start_time

        # Get the SQL query for this file.
        self.sql = ""
        for line in open(filename, "r"):
            if line.startswith("STAGE_ID"):
                break
            self.sql += line

        self.filename = filename

        self.num_joins = self.sql.lower().count("join")
Example #6
def estimate(filename):
    analyzer = parse_logs.Analyzer(filename)

    total_job_runtime = 0
    actual_job_runtime = 0
    min_job_cpu_millis = 0
    total_job_cpu_millis = 0
    min_job_network_millis = 0
    min_job_disk_millis = 0

    # Used as a sanity check: shuffle write and shuffle read should be the same.
    all_stages_shuffle_write_mb = 0
    all_stages_shuffle_read_mb = 0
    all_stages_disk_input_mb = 0
    for id in sorted(analyzer.stages.keys(), reverse=True):
        stage = analyzer.stages[id]
        total_cpu_milliseconds = 0
        total_disk_input_data = 0
        total_hdfs_output_data = 0
        total_remote_mb_read = 0
        total_shuffle_read_mb = 0
        total_shuffle_write_mb = 0
        total_machine_time_spent = 0
        total_runtime_incl_delay = 0

        print "*****STAGE has %s tasks" % len(stage.tasks)
        for task in stage.tasks:
            total_cpu_milliseconds += task.process_cpu_utilization * task.executor_run_time
            if task.input_read_method != "Memory":
                total_disk_input_data += task.input_mb
            total_shuffle_write_mb += task.shuffle_mb_written
            total_machine_time_spent += task.executor_run_time
            total_runtime_incl_delay += task.runtime()
            total_hdfs_output_data += task.output_mb
            if task.has_fetch:
                total_remote_mb_read += task.remote_mb_read
                # Remote MB still need to be read from disk.
                shuffle_mb = task.local_mb_read + task.remote_mb_read
                total_disk_input_data += shuffle_mb
                total_shuffle_read_mb += shuffle_mb
        all_stages_shuffle_write_mb += total_shuffle_write_mb
        all_stages_shuffle_read_mb += total_shuffle_read_mb
        all_stages_disk_input_mb += total_disk_input_data
        print "*******************Stage runtime: ", stage.finish_time(
        ) - stage.start_time

        print "Total millis across all tasks: ", total_machine_time_spent
        print "Total millis including scheduler delay: ", total_runtime_incl_delay
        print "Total CPU millis: ", total_cpu_milliseconds
        min_cpu_milliseconds = total_cpu_milliseconds / (NUM_MACHINES *
                                                         CPUS_PER_MACHINE)
        print "Total input MB: ", total_disk_input_data
        print "Total remote MB: ", total_remote_mb_read
        print "Total shuffle read MB: ", total_shuffle_read_mb
        print "Total output MB: ", total_hdfs_output_data
        total_input_disk_milliseconds = 1000 * total_disk_input_data / DISK_MB_PER_SECOND
        total_output_disk_milliseconds = 1000 * (
            (total_shuffle_write_mb + total_hdfs_output_data) /
            DISK_MB_PER_SECOND)

        min_disk_milliseconds = (
            (total_input_disk_milliseconds + total_output_disk_milliseconds) /
            (NUM_MACHINES * DISKS_PER_MACHINE))

        print "Min disk millis: %s, min cpu millis: %s" % (
            min_disk_milliseconds, min_cpu_milliseconds)

        print "Total shuffle write MB: ", total_shuffle_write_mb
        # Add twice the amount of HDFS output data because the data needs to be
        # sent to two locations.
        total_network_mb = total_remote_mb_read + 2 * total_hdfs_output_data
        total_network_milliseconds = 1000 * total_network_mb / NETWORK_MB_PER_SECOND
        min_network_milliseconds = total_network_milliseconds / NUM_MACHINES
        print "Min network millis: %s" % (min_network_milliseconds)

        min_stage_runtime = max(min_disk_milliseconds, min_cpu_milliseconds,
                                min_network_milliseconds)
        print "Min stage runtime: ", min_stage_runtime
        total_job_runtime += min_stage_runtime
        actual_job_runtime += stage.finish_time() - stage.start_time

        min_job_cpu_millis += min_cpu_milliseconds
        total_job_cpu_millis += total_cpu_milliseconds
        min_job_network_millis += min_network_milliseconds
        min_job_disk_millis += min_disk_milliseconds

    print "--------------------------------------------------------------"
    print "Total pipelined job runtime:", total_job_runtime, "milliseconds"
    total_not_pipelined_runtime = min_job_cpu_millis + min_job_network_millis + min_job_disk_millis
    print "Total not pipelined job runtime:", total_not_pipelined_runtime, "milliseconds"
    print "Min CPU milliseconds for job: %s milliseconds (%s total)" % (
        min_job_cpu_millis, total_job_cpu_millis)
    print "Min network milliseconds for job", min_job_network_millis, "milliseconds"
    print "Min disk milliseconds for job", min_job_disk_millis, "milliseconds"
    print "Actual job runtime:", actual_job_runtime, "milliseconds"
    print("Shuffle write MB: %s, read MB: %s, all input: %s" %
          (all_stages_shuffle_write_mb, all_stages_shuffle_read_mb,
           all_stages_disk_input_mb))
    return (total_not_pipelined_runtime, total_job_runtime, min_job_cpu_millis,
            min_job_network_millis, min_job_disk_millis, total_job_cpu_millis)
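
# A hedged driver sketch (not part of the original excerpt): estimate() relies
# on module-level constants defined elsewhere in the original module
# (NUM_MACHINES, CPUS_PER_MACHINE, DISKS_PER_MACHINE, DISK_MB_PER_SECOND,
# NETWORK_MB_PER_SECOND); their values are cluster-specific and are not
# reproduced here.
if __name__ == "__main__":
    import sys
    for log_filename in sys.argv[1:]:
        estimate(log_filename)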