def main(argv):
    disk_utilizations = []
    cpu_utilizations = []
    network_utilizations = []
    dirname = argv[0]
    for filename in os.listdir(dirname):
        full_name = os.path.join(dirname, filename)
        if os.path.isfile(full_name) and filename.endswith("job_log"):
            print "Reading %s" % filename
            analyzer = parse_logs.Analyzer(full_name)
            for (id, stage) in analyzer.stages.iteritems():
                for task in stage.tasks:
                    cpu_utilizations.append(task.total_cpu_utilization / 8.)
                    # Convert network throughput from bytes/s to GB/s.
                    network_utilizations.append(
                        task.network_bytes_transmitted_ps / (1000 * 1000 * 1000))
                    network_utilizations.append(
                        task.network_bytes_received_ps / (1000 * 1000 * 1000))
                    for name, block_device_numbers in task.disk_utilization.iteritems():
                        if name in ["xvdb", "xvdf"]:
                            disk_utilizations.append(block_device_numbers[0])

    output_filename = os.path.join(dirname, "cpu_disk_utilization_cdf")
    f = open(output_filename, "w")
    print max(network_utilizations)
    # Write one row per percentile: percentile, CPU, disk, network.
    for percent in range(100):
        f.write("%s\t%s\t%s\t%s\n" %
                (percent / 100.,
                 numpy.percentile(cpu_utilizations, percent),
                 numpy.percentile(disk_utilizations, percent),
                 numpy.percentile(network_utilizations, percent)))
    f.close()
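# Minimal entry-point sketch (not in the original script): it assumes this function
# lives in its own executable file and that the first command-line argument is the
# directory containing the "*job_log" files.
if __name__ == "__main__":
    import sys
    main(sys.argv[1:])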
def main(argv):
    map_utilizations = []
    reduce_utilizations = []
    all_utilizations = []
    dirname = argv[0]
    for filename in os.listdir(dirname):
        full_name = os.path.join(dirname, filename)
        if os.path.isfile(full_name) and filename.endswith("job_log"):
            print "Reading %s" % filename
            analyzer = parse_logs.Analyzer(full_name)
            for (id, stage) in analyzer.stages.iteritems():
                for task in stage.tasks:
                    for name, block_device_numbers in task.disk_utilization.iteritems():
                        if name in ["xvdb", "xvdf"]:
                            effective_util = 0
                            if block_device_numbers[0] > 0:
                                effective_util = (
                                    (block_device_numbers[1] + block_device_numbers[2]) /
                                    block_device_numbers[0])
                            all_utilizations.append(effective_util)
                            # Tasks with a fetch phase are reduce tasks; all other
                            # tasks are treated as map tasks.
                            if task.has_fetch:
                                reduce_utilizations.append(effective_util)
                            else:
                                map_utilizations.append(effective_util)

    output_filename = os.path.join(dirname, "disk_utilization_cdf")
    f = open(output_filename, "w")
    for percent in range(100):
        f.write("%s\t%s\t%s\t%s\n" %
                (percent / 100.,
                 numpy.percentile(map_utilizations, percent),
                 numpy.percentile(reduce_utilizations, percent),
                 numpy.percentile(all_utilizations, percent)))
    f.close()
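# Hypothetical helper sketch (not in the original code) that names the arithmetic used
# above. It assumes each task.disk_utilization entry is a tuple whose first element is
# the fraction of time the device was busy and whose next two elements are read and
# write throughput, which is what the expression above implies.
def effective_disk_throughput(block_device_numbers):
    busy_fraction = block_device_numbers[0]
    if busy_fraction <= 0:
        return 0
    return (block_device_numbers[1] + block_device_numbers[2]) / busy_fraction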
def __init__(self, filename):
    analyzer = parse_logs.Analyzer(filename)
    self.total_disk_input_mb = 0
    self.total_input_mb = 0
    self.total_shuffle_write_mb = 0
    self.total_shuffle_read_mb = 0
    self.total_output_mb = 0
    self.runtime = 0
    self.total_shuffle_time = 0
    self.total_reduce_time = 0
    self.total_reduce_cpu_time = 0
    self.total_runtime = 0
    self.total_cpu_time = 0
    for stage in analyzer.stages.values():
        self.total_disk_input_mb += sum(
            [t.input_mb for t in stage.tasks if t.input_read_method != "Memory"])
        self.total_input_mb += sum([t.input_mb for t in stage.tasks])
        self.total_shuffle_write_mb += sum([t.shuffle_mb_written for t in stage.tasks])
        self.total_shuffle_read_mb += sum(
            [t.remote_mb_read + t.local_mb_read for t in stage.tasks if t.has_fetch])
        self.total_output_mb += sum([t.output_mb for t in stage.tasks])
        self.runtime += stage.finish_time() - stage.start_time
        self.total_shuffle_time += sum([t.fetch_wait for t in stage.tasks if t.has_fetch])
        self.total_reduce_time += sum([t.runtime() for t in stage.tasks if t.has_fetch])
        self.total_runtime += sum([t.runtime() for t in stage.tasks])
        self.total_cpu_time += sum(
            [t.process_cpu_utilization * t.executor_run_time for t in stage.tasks])
        self.total_reduce_cpu_time += sum(
            [t.process_cpu_utilization * t.executor_run_time
             for t in stage.tasks if t.has_fetch])
            # Comment this line in to estimate the effective CPU time when multiple tasks
            # are running concurrently.
            #[t.compute_time_without_gc() for t in stage.tasks if t.has_fetch])

    # Get the SQL query for this file.
    self.sql = ""
    for line in open(filename, "r"):
        if line.startswith("STAGE_ID"):
            break
        self.sql += line

    self.filename = filename
    self.num_joins = self.sql.lower().count("join")
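# Hypothetical convenience method (not in the original class): a sketch of one ratio
# the totals gathered above make easy to compute, the fraction of reduce-task time
# spent waiting on shuffle fetches. The method name is an assumption.
def shuffle_wait_fraction(self):
    if self.total_reduce_time == 0:
        return 0
    return float(self.total_shuffle_time) / self.total_reduce_time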
def main(argv):
    filename = argv[0]
    analyzer = parse_logs.Analyzer(filename)
    start_time = min([x.start_time for x in analyzer.stages.values()])
    for (id, stage) in analyzer.stages.iteritems():
        # Write one row of utilization data per task in the stage.
        stage_filename = "%s_%s_utilization" % (filename, id)
        f = open(stage_filename, "w")
        for task in stage.tasks:
            items = [task.start_time, task.executor_run_time, task.total_cpu_utilization]
            for block_device_numbers in task.disk_utilization.values():
                items.extend(block_device_numbers)
            # 125000000 bytes/s = 1 Gbit/s, so this expresses network throughput in Gbit/s.
            items.append(task.network_bytes_transmitted_ps / 125000000)
            items.append(task.network_bytes_received_ps / 125000000)
            write_data_to_file(items, f)
        f.close()

        # Build the stage's gnuplot script by copying the shared base file and then
        # appending stage-specific plot commands.
        plot_base_file = open("utilization_scatter_base.gp", "r")
        plot_file = open("%s_%s_utilization.gp" % (filename, id), "w")
        for line in plot_base_file:
            plot_file.write(line)
        plot_base_file.close()

        plot_file.write("set output \"%s_%s_utilization.pdf\"\n" % (filename, id))
        plot_file.write("plot \"%s\" using ($1-%s):4 with p title \"Disk1\",\\\n" %
                        (stage_filename, start_time))
        plot_file.write("\"%s\" using ($1-%s):7 with p title \"Disk2\",\\\n" %
                        (stage_filename, start_time))
        plot_file.write("\"%s\" using ($1-%s):13 with p title \"Network T\",\\\n" %
                        (stage_filename, start_time))
        plot_file.write("\"%s\" using ($1-%s):14 with p title \"Network R\",\\\n" %
                        (stage_filename, start_time))
        plot_file.write("\"%s\" using ($1-%s):($3/8) with p title \"CPU\"\n" %
                        (stage_filename, start_time))
        plot_file.close()
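# Optional rendering step (not part of the original function): each generated .gp
# script can be passed straight to gnuplot to produce the PDF, assuming gnuplot is
# installed. The helper below is illustrative only.
def render_plot(gp_filename):
    import subprocess
    subprocess.check_call(["gnuplot", gp_filename])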
def __init__(self, filename):
    analyzer = parse_logs.Analyzer(filename)
    self.total_input_size = 0
    self.total_shuffle_mb = 0
    self.total_output_mb = 0
    self.runtime = 0
    self.no_disk_runtime = analyzer.no_disk_speedup()[2]
    for stage in analyzer.stages.values():
        self.total_input_size += sum([t.input_mb for t in stage.tasks])
        self.total_shuffle_mb += sum([t.shuffle_mb_written for t in stage.tasks])
        self.total_output_mb += sum([t.output_mb for t in stage.tasks])
        self.runtime += stage.finish_time() - stage.start_time

    # Get the SQL query for this file.
    self.sql = ""
    for line in open(filename, "r"):
        if line.startswith("STAGE_ID"):
            break
        self.sql += line

    self.filename = filename
    self.num_joins = self.sql.lower().count("join")
def estimate(filename):
    analyzer = parse_logs.Analyzer(filename)
    total_job_runtime = 0
    actual_job_runtime = 0
    min_job_cpu_millis = 0
    total_job_cpu_millis = 0
    min_job_network_millis = 0
    min_job_disk_millis = 0
    # Used as a sanity check: shuffle write and shuffle read should be the same.
    all_stages_shuffle_write_mb = 0
    all_stages_shuffle_read_mb = 0
    all_stages_disk_input_mb = 0
    for id in sorted(analyzer.stages.keys(), reverse=True):
        stage = analyzer.stages[id]
        total_cpu_milliseconds = 0
        total_disk_input_data = 0
        total_hdfs_output_data = 0
        total_remote_mb_read = 0
        total_shuffle_read_mb = 0
        total_shuffle_write_mb = 0
        total_machine_time_spent = 0
        total_runtime_incl_delay = 0
        print "*****STAGE has %s tasks" % len(stage.tasks)
        for task in stage.tasks:
            total_cpu_milliseconds += task.process_cpu_utilization * task.executor_run_time
            if task.input_read_method != "Memory":
                total_disk_input_data += task.input_mb
            total_shuffle_write_mb += task.shuffle_mb_written
            total_machine_time_spent += task.executor_run_time
            total_runtime_incl_delay += task.runtime()
            total_hdfs_output_data += task.output_mb
            if task.has_fetch:
                total_remote_mb_read += task.remote_mb_read
                # Remote MB still need to be read from disk.
                shuffle_mb = task.local_mb_read + task.remote_mb_read
                total_disk_input_data += shuffle_mb
                total_shuffle_read_mb += shuffle_mb

        all_stages_shuffle_write_mb += total_shuffle_write_mb
        all_stages_shuffle_read_mb += total_shuffle_read_mb
        all_stages_disk_input_mb += total_disk_input_data

        print "*******************Stage runtime: ", stage.finish_time() - stage.start_time
        print "Total millis across all tasks: ", total_machine_time_spent
        print "Total millis including scheduler delay: ", total_runtime_incl_delay
        print "Total CPU millis: ", total_cpu_milliseconds
        min_cpu_milliseconds = total_cpu_milliseconds / (NUM_MACHINES * CPUS_PER_MACHINE)
        print "Total input MB: ", total_disk_input_data
        print "Total remote MB: ", total_remote_mb_read
        print "Total shuffle read MB: ", total_shuffle_read_mb
        print "Total output MB: ", total_hdfs_output_data

        total_input_disk_milliseconds = 1000 * total_disk_input_data / DISK_MB_PER_SECOND
        total_output_disk_milliseconds = 1000 * (
            (total_shuffle_write_mb + total_hdfs_output_data) / DISK_MB_PER_SECOND)
        min_disk_milliseconds = (
            (total_input_disk_milliseconds + total_output_disk_milliseconds) /
            (NUM_MACHINES * DISKS_PER_MACHINE))
        print "Min disk millis: %s, min cpu millis: %s" % (
            min_disk_milliseconds, min_cpu_milliseconds)

        print "Total shuffle write MB: ", total_shuffle_write_mb
        # Add twice the amount of HDFS output data because the data needs to be sent to
        # two locations.
        total_network_mb = total_remote_mb_read + 2 * total_hdfs_output_data
        total_network_milliseconds = 1000 * total_network_mb / NETWORK_MB_PER_SECOND
        min_network_milliseconds = total_network_milliseconds / NUM_MACHINES
        print "Min network millis: %s" % (min_network_milliseconds)

        min_stage_runtime = max(min_disk_milliseconds, min_cpu_milliseconds,
                                min_network_milliseconds)
        print "Min stage runtime: ", min_stage_runtime

        total_job_runtime += min_stage_runtime
        actual_job_runtime += stage.finish_time() - stage.start_time
        min_job_cpu_millis += min_cpu_milliseconds
        total_job_cpu_millis += total_cpu_milliseconds
        min_job_network_millis += min_network_milliseconds
        min_job_disk_millis += min_disk_milliseconds

    print "--------------------------------------------------------------"
    print "Total pipelined job runtime:", total_job_runtime, "milliseconds"
    total_not_pipelined_runtime = (
        min_job_cpu_millis + min_job_network_millis + min_job_disk_millis)
    print "Total not pipelined job runtime:", total_not_pipelined_runtime, "milliseconds"
    print "Min CPU milliseconds for job: %s milliseconds (%s total)" % (
        min_job_cpu_millis, total_job_cpu_millis)
    print "Min network milliseconds for job", min_job_network_millis, "milliseconds"
    print "Min disk milliseconds for job", min_job_disk_millis, "milliseconds"
    print "Actual job runtime:", actual_job_runtime, "milliseconds"
    print("Shuffle write MB: %s, read MB: %s, all input: %s" %
          (all_stages_shuffle_write_mb, all_stages_shuffle_read_mb,
           all_stages_disk_input_mb))
    return (total_not_pipelined_runtime, total_job_runtime, min_job_cpu_millis,
            min_job_network_millis, min_job_disk_millis, total_job_cpu_millis)