import os
import shutil
import sys
import time

import numpy
import yaml

# NOTE: This module targets Python 2 (print statements, xrange, 0777 octal
# literals). Project-internal helpers referenced below -- utils, Description,
# Duration, epoch_comparator, postprocess_epoch, read_conf_file, parallel_ssh,
# run_benchmark, upload_logs, download_logs, gather_runtime_info, PlotLayout,
# Plot, Bar, StackedBars -- are assumed to be imported from elsewhere in the
# package.


def postprocess(runtime_info, experiment_directory):
    postprocessed_data = []

    stats_info = runtime_info["stats"]
    stage_info = runtime_info["stage_info"]

    # Emit epochs in the order in which they were executed.
    job_sequence = utils.job_sequence(experiment_directory)

    descriptions = {}
    for job in job_sequence:
        description_dir = utils.job_description(experiment_directory, job)
        if os.path.exists(description_dir):
            descriptions[job] = Description(description_dir)

    phase_sequence = {}
    for job in job_sequence:
        if job in descriptions:
            phase_sequence[job] = descriptions[job].getPhaseList()
        else:
            phase_sequence[job] = []

    comp_function = epoch_comparator(job_sequence, phase_sequence)

    sorted_epochs = sorted(stats_info.keys(), key=comp_function)

    for (job, phase, epoch) in sorted_epochs:
        epoch_info = stats_info[(job, phase, epoch)]

        if job in descriptions:
            description = descriptions[job]
        else:
            description = None

        postprocessed_epoch_info = postprocess_epoch(
            epoch_info, stage_info[(job, phase, epoch)], phase, description)

        postprocessed_epoch_info["job"] = job
        postprocessed_epoch_info["phase"] = phase
        postprocessed_epoch_info["epoch"] = epoch

        postprocessed_data.append(postprocessed_epoch_info)

    return postprocessed_data
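
# Usage sketch (hypothetical): a minimal illustration of driving
# postprocess(). The directory path and the shape of runtime_info below are
# assumptions for illustration only; in this codebase runtime_info would
# normally come from gather_runtime_info().
def _example_postprocess_usage():
    experiment_directory = "/tmp/example_experiment"  # hypothetical path
    runtime_info = {"stats": {}, "stage_info": {}}    # empty but well-formed
    for epoch_record in postprocess(runtime_info, experiment_directory):
        print "%s/%s epoch %s" % (
            epoch_record["job"], epoch_record["phase"], epoch_record["epoch"])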
def run_benchmark_iterations(
        binary, log_directory, config, peer_ips, profiler, profiler_options,
        iterations, sleep, delete_output, per_peer_config,
        dump_core_directory, solo_mode, stage_stats, interfaces, params=""):
    # Get ssh username and themis directory.
    username, themis_directory = read_conf_file(
        "cluster.conf", "cluster", ["username", "themis_directory"])
    themis_directory = os.path.expanduser(themis_directory)

    # Get cloud provider if applicable.
    provider = read_conf_file("cluster.conf", "cluster", "provider")

    if interfaces is None:
        vnstat_interface = None
    else:
        interface_list = filter(lambda x: len(x) > 0, interfaces.split(","))
        vnstat_interface = interface_list[0]

    if not os.path.exists(config):
        sys.exit("Config file %s does not exist." % config)

    with open(config, "r") as fp:
        app_config = yaml.load(fp)

    # If we're using more than one network interface per peer, the peer list
    # looks like:
    #   Peer1_interface1, Peer1_interface2, Peer2_interface1, ...
    # In this case we only want to launch the benchmark once per peer, so
    # look at only the first interface for each peer and let the application
    # itself deal with the other interfaces.
    num_interfaces = 1
    if "NUM_INTERFACES" in app_config:
        num_interfaces = app_config["NUM_INTERFACES"]

    # Remove any trailing comma from the IP list. This is the string we pass
    # to the benchmark binary.
    peer_list = peer_ips.rstrip(",")

    # If we're using multiple interfaces, only launch the benchmark once per
    # node.
    node_list = peer_list.split(",")[::num_interfaces]

    # Look for description files in the same directory as the binary.
    binary_dir = os.path.dirname(binary)
    description_directory = os.path.join(binary_dir, "description")
    if not os.path.exists(description_directory):
        sys.exit(
            "Could not find description directory %s" % description_directory)

    # Check for the phase name. For simplicity we require that the benchmark
    # have exactly one phase.
    description = Description(description_directory)
    phases = description.getPhaseList()
    if len(phases) != 1:
        sys.exit("Benchmark must have exactly one phase. Got %s" % phases)
    phase_name = phases[0]

    data_size_per_node = int(
        app_config["BENCHMARK_DATA_SIZE_PER_NODE"][phase_name])
    data_size = data_size_per_node * len(node_list)

    total_throughputs = {}
    if stage_stats is not None:
        stage_stats = stage_stats.split(",")
        for stage in stage_stats:
            total_throughputs[stage] = 0.0

    node_benchmark_throughputs = []
    for i in xrange(iterations):
        # Pick a unique batch ID.
        batch = 0
        while os.path.exists(os.path.join(log_directory, "batch_%d" % batch)):
            batch += 1
        batch_directory = os.path.join(log_directory, "batch_%d" % batch)

        # Create directories.
        phase_directory = os.path.join(batch_directory, phase_name)
        parallel_ssh(
            None, "mkdir -p %s" % phase_directory, username, node_list,
            False, True, False)

        # Copy description files and create phase directory.
        if not os.path.exists(batch_directory):
            os.makedirs(batch_directory)
        shutil.copy(
            os.path.join(description_directory, "stages.json"),
            batch_directory)
        shutil.copy(
            os.path.join(description_directory, "structure.json"),
            batch_directory)
        os.chmod(os.path.join(batch_directory, "stages.json"), 0777)
        os.chmod(os.path.join(batch_directory, "structure.json"), 0777)

        # Copy config file.
        shutil.copyfile(config, os.path.join(batch_directory, "config.yaml"))

        print "\nLogging to %s" % batch_directory
        print "Running %s with batch ID %d on %d nodes..." % (
            phase_name, batch, len(node_list))

        (elapsed, elapsed_times, completed_ips) = run_benchmark(
            binary, config, batch_directory, phase_directory, profiler,
            profiler_options, peer_list, node_list, per_peer_config,
            dump_core_directory, solo_mode, vnstat_interface, params)

        # Compute overall throughput.
        throughput = (data_size / elapsed) / 1000000
        per_node_throughput = (data_size_per_node / elapsed) / 1000000
        print "Completed in %.2f seconds." % elapsed
        print " Throughput: %.2f MB/s" % throughput
        print " Per-server: %.2f MB/s" % per_node_throughput

        # Record individual throughputs.
        throughputs = [
            (data_size_per_node / x) / 1000000 for x in elapsed_times]
        node_benchmark_throughputs += throughputs

        # Dump these results to a file in the batch directory.
        results_file = open(os.path.join(batch_directory, "results"), "w")
        results_file.write(
            "Runtime: %.2f seconds\nThroughput: %.2f MB/s\nPer-server: "
            "%.2f MB/s\n\n" % (elapsed, throughput, per_node_throughput))
        results_file.write("Node throughputs: %s\n\n" % throughputs)
        for ip, elapsed_time, throughput in zip(
                completed_ips, elapsed_times, throughputs):
            results_file.write(
                "Node %s completed in %.2f seconds (%.2f MB/s)\n" % (
                    ip, elapsed_time, throughput))
        results_file.write("\n")

        if stage_stats is not None:
            # Compute runtime stat throughputs, retrying the log round-trip
            # until the runtime info script succeeds.
            done = False
            while not done:
                # Upload all logs, then download them locally.
                upload_logs()
                download_logs()
                try:
                    runtime_info = gather_runtime_info(batch_directory, False)
                    done = True
                except ValueError:
                    print "Runtime info script failed. Retrying log upload/downloads."

            stage_info = runtime_info[0]["stages"]

            node_throughputs = {}
            for worker_info in stage_info:
                stats_info = worker_info["stats_info"]
                # We only want the overall stats, which cover all nodes
                # (neither hostname nor worker ID will be specified).
                if len(stats_info) == 1:
                    stage_name = stats_info["stage"]
                    if stage_name in stage_stats:
                        # This is one of the stages we care about.
                        node_throughputs[stage_name] = \
                            worker_info["observed_processing_rate_per_node"]
                        total_throughputs[stage_name] += \
                            node_throughputs[stage_name]

            # Print throughputs in the correct order.
            for stage_name in stage_stats:
                print " %s throughput: %.2f MB/s/node" % (
                    stage_name, node_throughputs[stage_name])
                results_file.write("%s throughput: %.2f MB/s\n" % (
                    stage_name, node_throughputs[stage_name]))

        # Close the results file whether or not stage stats were gathered.
        results_file.close()

        if delete_output and "OUTPUT_DISK_LIST" in app_config and \
                phase_name in app_config["OUTPUT_DISK_LIST"]:
            output_disk_list = app_config["OUTPUT_DISK_LIST"][phase_name]
            output_disks = output_disk_list.split(",")
            for disk in output_disks:
                print "Clearing %s" % disk
                parallel_ssh(
                    None, "rm -rf %s" % disk, username, node_list,
                    False, False, False)

        if sleep > 0 and i != iterations - 1:
            print "Sleeping %d seconds" % sleep
            time.sleep(sleep)

    print "\nCompleted %d iterations\n" % iterations

    # Format node throughputs.
    node_benchmark_throughput_strings = [
        "%.2f" % x for x in node_benchmark_throughputs]
    print " Node throughputs (MB/s):"
    print " %s" % node_benchmark_throughput_strings
    print " Average node throughput: %.2f MB/s" % (
        numpy.mean(node_benchmark_throughputs))
    print " Standard deviation: %.2f MB/s" % (
        numpy.std(node_benchmark_throughputs))
    print " Min node throughput: %.2f MB/s" % (
        numpy.min(node_benchmark_throughputs))
    print " Max node throughput: %.2f MB/s\n" % (
        numpy.max(node_benchmark_throughputs))

    if stage_stats is not None:
        for stage_name in stage_stats:
            print " Average %s throughput: %.2f MB/s/node" % (
                stage_name, total_throughputs[stage_name] / iterations)
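
# Usage sketch (hypothetical): one way this entry point might be invoked.
# Every argument value below is an assumption for illustration; in the real
# scripts these values come from command-line parsing.
def _example_benchmark_invocation():
    run_benchmark_iterations(
        binary="/opt/themis/bin/Benchmark",   # hypothetical binary path
        log_directory="/tmp/benchmark_logs",  # hypothetical log root
        config="config.yaml",
        peer_ips="10.0.0.1,10.0.0.2",
        profiler=None, profiler_options="",
        iterations=3, sleep=10,
        delete_output=False, per_peer_config=False,
        dump_core_directory=None, solo_mode=False,
        stage_stats=None, interfaces=None)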
def plot_timeline_for_phase(log_directory, job, phase, phase_data):
    min_timestamp = phase_data["min_timestamp"]
    max_timestamp = phase_data["max_timestamp"]

    description = Description(os.path.join(log_directory, job, "description"))
    stage_ordering = description.getStageOrdering(phase)

    duration_lists = {}
    for stage in stage_ordering:
        duration_lists[stage] = []

    for key in phase_data:
        if key in ["min_timestamp", "max_timestamp"]:
            continue

        hostname, stage, worker_id = key

        worker_duration_info = Duration(
            hostname.split(".")[0], stage, worker_id,
            (phase_data[key][0] - min_timestamp) / 1000000.0,
            (phase_data[key][1] - min_timestamp) / 1000000.0)

        duration_lists[stage].append(worker_duration_info)

    def sort_function(x):
        return (x.hostname, x.worker_id, x.start_time, x.stop_time)

    layout = PlotLayout()

    for stage in stage_ordering:
        duration_list = duration_lists[stage]
        duration_list.sort(key=sort_function)

        bars = {}

        # Set up a "padding" bar that will appear to move bars up so that
        # they start when the worker starts.
        start_bar = Bar()
        start_bar.linewidth = 0
        start_bar.color = "white"

        for i, duration in enumerate(duration_list):
            if duration.hostname not in bars:
                bars[duration.hostname] = Bar()

            bars[duration.hostname].yValues.append(
                duration.stop_time - duration.start_time)
            start_bar.yValues.append(duration.start_time)

        # Make sure that all bars have the same number of y-axis values,
        # give them x-axis values and set their colors.
        start_bar.xValues = range(len(start_bar.yValues))
        start_bar.xTickLabelProperties = {"rotation": 90}

        bar_colors = [
            "red", "blue", "green", "orange", "gray", "pink", "purple",
            "black"]

        offset = 0
        for i, (hostname, bar) in enumerate(bars.items()):
            # Pad the y axis with zeroes so that bars can be laid out next
            # to each other with a StackedBars.
            num_y_values = len(bar.yValues)
            bar.yValues = (
                ([0] * offset) + bar.yValues +
                ([0] * (len(duration_list) - (num_y_values + offset))))

            # Put the label for this hostname roughly in the middle of its
            # bar cluster, subtracting 0.5 to account for half the width of
            # the bar.
            start_bar.xTickLabels.append(hostname)
            start_bar.xTickLabelPoints.append(
                offset + (num_y_values / 2.0) - 0.5)

            offset += num_y_values

            bar.xValues = range(len(bar.yValues))
            bar.color = bar_colors[i % len(bar_colors)]
            bar.label = hostname

        stacked_bars = StackedBars()
        stacked_bars.add(start_bar)
        for hostname in sorted(bars.keys()):
            stacked_bars.add(bars[hostname])

        plot = Plot()
        plot.setYLimits(
            0, ((max_timestamp - min_timestamp) / 1000000.0) * 1.05)
        plot.setXLabel("Worker")
        plot.setYLabel("Time (s)")
        plot.setTitle(stage)
        plot.add(stacked_bars)

        layout.addPlot(plot)

    return layout
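
# Usage sketch (hypothetical): building a timeline layout. The phase_data
# shape shown here (timestamps in microseconds, keyed by (hostname, stage,
# worker_id) tuples) is inferred from the function body; the job/phase names
# and the save() call are assumptions for illustration. For this to run, the
# description under /tmp/benchmark_logs/job_0/description would need to
# define a "reader" stage for "phase_zero".
def _example_timeline_plot():
    phase_data = {
        "min_timestamp": 0,
        "max_timestamp": 120000000,
        ("node-1.local", "reader", 0): (1000000, 60000000),
        ("node-1.local", "reader", 1): (2000000, 110000000)}
    layout = plot_timeline_for_phase(
        "/tmp/benchmark_logs", "job_0", "phase_zero", phase_data)
    layout.save("timeline.pdf")  # hypothetical PlotLayout API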