def graph_results(self, folder, logfile):
    cornflakes_repo = self.config_yaml["cornflakes_dir"]
    plotting_script = Path(cornflakes_repo) / \
        "experiments" / "plotting_scripts" / "sg_bench.R"
    plot_path = Path(folder) / "plots"
    plot_path.mkdir(exist_ok=True)
    full_log = Path(folder) / logfile
    # plot one graph per segment size
    for size in SEGMENT_SIZES_TO_LOOP:
        output_file = plot_path / "segsize_{}.pdf".format(size)
        args = [str(plotting_script), str(full_log), str(size),
                str(output_file)]
        try:
            sh.run(args)
        except Exception:
            utils.warn("Failed to run plot command: {}".format(args))
    # plot heatmap
    heatmap_script = Path(cornflakes_repo) / "experiments" / \
        "plotting_scripts" / "sg_bench_map.R"
    heatmap_file = plot_path / "heatmap.pdf"
    args = [str(heatmap_script), str(full_log), str(heatmap_file)]
    try:
        sh.run(args)
    except Exception:
        utils.warn("Failed to run heatmap plot command: {}".format(args))
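# Note: `sh` here appears to be a project-local subprocess helper rather than
# the PyPI `sh` package (whose API is attribute-based). A minimal sketch of
# what such a wrapper might look like, assuming it raises on a non-zero exit
# status so the try/except blocks above can log a warning; the name and
# behavior below are illustrative guesses, not taken from the repo.
import subprocess

def run(args):
    # Run the command, raising subprocess.CalledProcessError on failure.
    # The R plotting scripts are assumed to be directly executable
    # (e.g. via an `#!/usr/bin/env Rscript` shebang).
    return subprocess.run([str(a) for a in args], check=True)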
def run_iterations(self, total_args, iterations):
    """
    Loops over the iterations and runs each experiment.
    """
    program_version_info = self.get_program_version_info()
    folder_path = Path(total_args.folder)
    ct = 0
    total = len(iterations)
    start = time.time()
    # initial guess: roughly 25 seconds per iteration, reported in hours
    expected_time = 25 * total / 3600.0
    for iteration in iterations:
        ct += 1
        utils.info(
            "Running iteration {} of {} ({:.1f}% done); "
            "estimated total time: {:.2f} hours".format(
                ct, total, float(ct - 1) / float(total) * 100.0,
                expected_time))
        if iteration.get_folder_name(folder_path).exists():
            utils.info("Iteration already exists, skipping: "
                       "{}".format(iteration))
            continue
        if not total_args.pprint:
            iteration.create_folder(folder_path)
        utils.debug("Running iteration: ", iteration)
        status = iteration.run(iteration.get_folder_name(folder_path),
                               self.get_exp_config(),
                               self.get_machine_config(),
                               total_args.pprint,
                               program_version_info,
                               total_args.use_perf)
        if not status:
            time.sleep(5)
            for i in range(utils.NUM_RETRIES):
                utils.debug("Retrying failed iteration "
                            "(attempt {} of {}).".format(
                                i + 1, utils.NUM_RETRIES))
                # pass use_perf here as well so retries match the first attempt
                status = iteration.run(
                    iteration.get_folder_name(folder_path),
                    self.get_exp_config(),
                    self.get_machine_config(),
                    total_args.pprint,
                    program_version_info,
                    total_args.use_perf)
                if status:
                    break
            if not status:
                utils.warn(
                    "Failed to execute program after {} retries.".format(
                        utils.NUM_RETRIES))
                exit(1)
        # sleep before the next trial
        time.sleep(5)
        # refine the estimate from the average time per iteration so far
        now = time.time()
        total_so_far = now - start
        expected_time = (float(total_so_far) / ct * total) / 3600.0
def parse_log_info(log): if not (os.path.exists(log)): utils.warn("Path {} does not exist.".format(log)) return {} ret = {} with open(log, 'r') as f: raw_lines = f.readlines() lines = [line.strip() for line in raw_lines] for line in lines: if "High level sending stats" in line: (sent, recved, retries, total_time) = parse_client_time_and_pkts(line) ret["sent"] = sent ret["recved"] = recved ret["retries"] = retries ret["total_time"] = total_time return ret
def parse_log_info(log): if not (os.path.exists(log)): utils.warn("Path {} does not exist".format(log)) return {} ret = {} with open(log) as f: raw_lines = f.readlines() lines = [line.strip() for line in raw_lines] for line in lines: if line.startswith("Ran for"): (time, pkts_sent) = parse_client_time_and_pkts(line) ret["totaltime"] = float(time) ret["pkts_sent"] = int(pkts_sent) elif line.startswith("Num fully received"): pkts_received = parse_client_pkts_received(line) ret["pkts_received"] = int(pkts_received) return ret
def run_analysis_individual_trial(self,
                                  higher_level_folder,
                                  program_metadata,
                                  iteration,
                                  print_stats=False):
    exp_folder = iteration.get_folder_name(higher_level_folder)

    # parse each client log
    total_achieved_load = 0
    total_offered_rate = 0
    total_offered_load = 0
    total_achieved_rate = 0
    client_latency_lists = []
    clients = iteration.get_iteration_clients(
        program_metadata["start_client"]["hosts"])

    for host in clients:
        args = {"folder": str(exp_folder), "host": host}

        stdout_log = program_metadata["start_client"]["log"]["out"].format(
            **args)
        stdout_info = parse_log_info(stdout_log)
        if stdout_info == {}:
            utils.warn("Error parsing stdout log {}".format(stdout_log))
            return ""

        run_metadata_log = program_metadata["start_client"]["log"][
            "record"].format(**args)
        run_info = utils.parse_command_line_args(run_metadata_log)
        if run_info == {}:
            utils.warn("Error parsing yaml run info for {}".format(
                run_metadata_log))
            return ""

        latency_log = "{folder}/{host}.latency.log".format(**args)
        latencies = utils.parse_latency_log(latency_log, STRIP_THRESHOLD)
        if latencies == []:
            utils.warn("Error parsing latency log {}".format(latency_log))
            return ""
        client_latency_lists.append(latencies)

        host_offered_rate = float(run_info["args"]["rate"])
        total_offered_rate += host_offered_rate
        host_offered_load = float(
            utils.get_tput_gbps(host_offered_rate,
                                iteration.get_total_size()))
        total_offered_load += host_offered_load

        host_pkts_sent = stdout_info["pkts_sent"]
        host_total_time = stdout_info["totaltime"]
        host_achieved_rate = float(host_pkts_sent) / float(host_total_time)
        total_achieved_rate += host_achieved_rate
        host_achieved_load = utils.get_tput_gbps(host_achieved_rate,
                                                 iteration.get_total_size())
        total_achieved_load += host_achieved_load

        # convert to microseconds
        host_p99 = utils.p99_func(latencies) / 1000.0
        host_p999 = utils.p999_func(latencies) / 1000.0
        host_median = utils.median_func(latencies) / 1000.0
        host_avg = utils.mean_func(latencies) / 1000.0
        if print_stats:
            utils.info(
                "Client {}: "
                "offered load: {:.2f} req/s | {:.2f} Gbps, "
                "achieved load: {:.2f} req/s | {:.2f} Gbps, "
                "percentage achieved rate: {:.3f}, "
                "avg latency: {:.2f} us, p99: {:.2f} us, "
                "p999: {:.2f} us, median: {:.2f} us".format(
                    host, host_offered_rate, host_offered_load,
                    host_achieved_rate, host_achieved_load,
                    float(host_achieved_rate / host_offered_rate),
                    host_avg, host_p99, host_p999, host_median))

    # print total stats
    sorted_latencies = list(heapq.merge(*client_latency_lists))
    median = utils.median_func(sorted_latencies) / float(1000)
    p99 = utils.p99_func(sorted_latencies) / float(1000)
    p999 = utils.p999_func(sorted_latencies) / float(1000)
    avg = utils.mean_func(sorted_latencies) / float(1000)
    if print_stats:
        total_stats = "offered load: {:.2f} req/s | {:.2f} Gbps, " \
            "achieved load: {:.2f} req/s | {:.2f} Gbps, " \
            "percentage achieved rate: {:.3f}, " \
            "avg latency: {:.2f} us, p99: {:.2f} us, " \
            "p999: {:.2f} us, median: {:.2f} us".format(
                total_offered_rate, total_offered_load,
                total_achieved_rate, total_achieved_load,
                float(total_achieved_rate / total_offered_rate),
                avg, p99, p999, median)
        utils.info("Total Stats: ", total_stats)

    percent_achieved_rate = float(total_achieved_rate / total_offered_rate)
    csv_line = "{},{},{},{},{},{},{},{},{},{},{},{},{}".format(
        iteration.get_segment_size(), iteration.get_num_mbufs(),
        iteration.get_with_copy(), iteration.get_as_one(),
        total_offered_rate, total_offered_load,
        total_achieved_rate, total_achieved_load,
        percent_achieved_rate,
        avg * 1000, median * 1000, p99 * 1000, p999 * 1000)
    return csv_line
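# The CSV line above has 13 fields; a header row matching that order might look
# like the following. The column names are descriptive guesses based on the
# accessors and arithmetic above (latencies are converted back from us, so they
# are assumed to be nanoseconds), not names taken from the repo.
SG_BENCH_CSV_HEADER = ("segment_size,num_mbufs,with_copy,as_one,"
                       "offered_rate,offered_load_gbps,"
                       "achieved_rate,achieved_load_gbps,"
                       "percent_achieved_rate,"
                       "avg_ns,median_ns,p99_ns,p999_ns")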
def graph_results(self, folder, logfile):
    utils.warn("Graphing not implemented yet")
def run_analysis_individual_trial(self,
                                  higher_level_folder,
                                  program_metadata,
                                  iteration,
                                  print_stats=False):
    exp_folder = iteration.get_folder_name(higher_level_folder)

    # parse stdout logs
    total_offered_load_pps = 0
    total_offered_load_gbps = 0
    total_achieved_load_gbps = 0
    total_achieved_load_pps = 0
    total_retries = 0
    client_latency_lists = []
    clients = iteration.get_iteration_clients(
        program_metadata["start_client"]["hosts"])

    for host in clients:
        args = {"folder": str(exp_folder), "host": host}

        stdout_log = program_metadata["start_client"]["log"]["out"].format(
            **args)
        stdout_info = parse_log_info(stdout_log)
        if stdout_info == {}:
            utils.warn("Error parsing stdout log {}".format(stdout_log))
            return ""

        run_metadata_log = program_metadata["start_client"]["log"][
            "record"].format(**args)
        run_info = utils.parse_command_line_args(run_metadata_log)
        if run_info == {}:
            utils.warn("Error parsing yaml run info for {}".format(
                run_metadata_log))
            return ""

        latency_log = "{folder}/{host}.latency.log".format(**args)
        latencies = utils.parse_latency_log(latency_log, STRIP_THRESHOLD)
        if latencies == []:
            utils.warn("Error parsing latency log {}".format(latency_log))
            return ""
        client_latency_lists.append(latencies)

        host_offered_load = float(run_info["args"]["rate"])
        total_offered_load_pps += host_offered_load
        host_offered_load_gbps = float(
            utils.get_tput_gbps(host_offered_load,
                                iteration.get_server_size()))
        total_offered_load_gbps += host_offered_load_gbps

        host_pkts_recved = stdout_info["recved"]
        host_total_time = stdout_info["total_time"]
        host_achieved_load = float(host_pkts_recved) / float(host_total_time)
        total_achieved_load_pps += host_achieved_load
        host_achieved_load_gbps = float(
            utils.get_tput_gbps(host_achieved_load,
                                iteration.get_server_size()))
        total_achieved_load_gbps += host_achieved_load_gbps

        # convert to microseconds
        host_p99 = utils.p99_func(latencies) / 1000.0
        host_p999 = utils.p999_func(latencies) / 1000.0
        host_median = utils.median_func(latencies) / 1000.0
        host_avg = utils.mean_func(latencies) / 1000.0

        # add retries
        retries = int(stdout_info["retries"])
        total_retries += retries

        if print_stats:
            utils.info(
                "Client {}: "
                "offered load: {:.4f} req/s | {:.4f} Gbps, "
                "achieved load: {:.4f} req/s | {:.4f} Gbps, "
                "percentage achieved rate: {:.4f}, "
                "retries: {}, "
                "avg latency: {:.4f} \u03BCs, p99: {:.4f} \u03BCs, "
                "p999: {:.4f} \u03BCs, median: {:.4f} \u03BCs".format(
                    host, host_offered_load, host_offered_load_gbps,
                    host_achieved_load, host_achieved_load_gbps,
                    float(host_achieved_load / host_offered_load),
                    retries, host_avg, host_p99, host_p999, host_median))

    sorted_latencies = list(heapq.merge(*client_latency_lists))
    median = utils.median_func(sorted_latencies) / float(1000)
    p99 = utils.p99_func(sorted_latencies) / float(1000)
    p999 = utils.p999_func(sorted_latencies) / float(1000)
    avg = utils.mean_func(sorted_latencies) / float(1000)
    if print_stats:
        total_stats = "offered load: {:.4f} req/s | {:.4f} Gbps, " \
            "achieved load: {:.4f} req/s | {:.4f} Gbps, " \
            "percentage achieved rate: {:.4f}, " \
            "retries: {}, " \
            "avg latency: {:.4f} \u03BCs, p99: {:.4f} \u03BCs, " \
            "p999: {:.4f} \u03BCs, median: {:.4f} \u03BCs".format(
                total_offered_load_pps, total_offered_load_gbps,
                total_achieved_load_pps, total_achieved_load_gbps,
                float(total_achieved_load_pps / total_offered_load_pps),
                total_retries, avg, p99, p999, median)
        utils.info("Total Stats: ", total_stats)

    percent_achieved_load = float(total_achieved_load_pps /
                                  total_offered_load_pps)
    csv_line = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}".format(
        iteration.get_serialization(),
        iteration.get_server_message_type(),
        iteration.get_message_type(),
        iteration.get_server_size(),
        iteration.get_size(),
        iteration.get_recv_string(),
        total_offered_load_pps, total_offered_load_gbps,
        total_achieved_load_pps, total_achieved_load_gbps,
        percent_achieved_load, total_retries,
        avg, median, p99, p999)
    return csv_line
def run(self,
        folder,
        exp_config,
        machine_config,
        pprint,
        program_version_info,
        use_perf=False):
    """
    Runs the actual program.
    Arguments:
        * folder - Path where all logfiles from this iteration should go.
        * exp_config - Experiment yaml that contains command lines. Assumes
          this contains a set of programs to run, each with a list of
          corresponding hosts that can run that command line.
        * machine_config - Machine-level config yaml.
        * pprint - Instead of running, just print out the command lines.
        * program_version_info - Metadata about the commit version of the
          repo at the time of the experiment.
        * use_perf - Whether to run the server under perf.
    """
    programs_to_join_immediately = {}
    programs_to_kill = {}
    # map from start time (in seconds) to list of programs with that start time
    programs_by_start_time = defaultdict(list)
    # assumes program processes to be executed are in order in the yaml
    commands = exp_config["commands"]
    programs = exp_config["programs"]
    exp_time = exp_config["time"]
    record_paths = {}

    # map from a program id to the actual process
    program_counter = 0
    proc_map = {}
    manager = mp.Manager()
    status_dict = manager.dict()

    # spawn the commands
    for command in commands:
        program_name = command["program"]
        program = programs[program_name]
        kill_cmd = None
        if "stop" in program:
            kill_cmd = program["stop"]
        for host in self.get_relevant_hosts(program, program_name):
            program_cmd = program["start"]
            if "log" in program:
                if "out" in program["log"]:
                    stdout = program["log"]["out"]
                    program_cmd += " > {}".format(stdout)
                if "err" in program["log"]:
                    stderr = program["log"]["err"]
                    program_cmd += " 2> {}".format(stderr)
                if "record" in program["log"]:
                    record_path = program["log"]["record"]
            program_args = self.get_program_args(folder, program_name, host,
                                                 machine_config, programs,
                                                 exp_time)
            program_cmd = program_cmd.format(**program_args)
            if use_perf and "perf" in program:
                utils.debug("current program args: {}".format(program_args))
                perf_cmd = program["perf"].format(**program_args)
                program_cmd = "{} {}".format(perf_cmd, program_cmd)
            # every program is assumed to define a "record" log path
            record_path = record_path.format(**program_args)
            fail_ok = False
            if kill_cmd is not None:
                kill_cmd = kill_cmd.format(**program_args)
                fail_ok = True
            yaml_record = {
                "host": host,
                "args": program_args,
                "command": program_cmd,
                "stop_command": kill_cmd,
                "version_info": program_version_info
            }
            if pprint:
                utils.debug(
                    "Host {}: \n\t - Running Cmd: {}\n\t - Stopped by: {}"
                    .format(host, program_cmd, kill_cmd))
            else:
                record_paths[record_path] = yaml_record
                proc = mp.Process(target=self.run_cmd_sudo,
                                  args=(program_cmd, host, machine_config,
                                        fail_ok, status_dict,
                                        program_counter))
                start_time = int(command["begin"])
                proc_map[program_counter] = proc
                programs_by_start_time[start_time].append(
                    (kill_cmd, program_counter, program_name, host,
                     program_args))
                program_counter += 1

    # now start each program at its scheduled offset
    cur_time = 0
    program_start_times = sorted(programs_by_start_time.keys())
    for start_time in program_start_times:
        if start_time != cur_time:
            time.sleep(start_time - cur_time)
            cur_time = start_time
        progs = programs_by_start_time[start_time]
        for info in progs:
            (kill_cmd, program_counter, program_name, host,
             program_args) = info
            proc = proc_map[program_counter]
            utils.debug("Starting program {} on host {}, args: {}".format(
                program_name, host, program_args))
            proc.start()
            if kill_cmd is None:
                programs_to_join_immediately[host] = program_counter
            else:
                programs_to_kill[host] = (program_counter, kill_cmd)

    any_failed = False
    # join the programs that exit on their own
    for host in programs_to_join_immediately:
        prog_counter = programs_to_join_immediately[host]
        proc = proc_map[prog_counter]
        proc.join()
        status = status_dict[prog_counter]
        if not status:
            any_failed = True
        utils.debug("Host {} done; status: {}".format(host, status))

    # now kill the rest of the programs
    for host in programs_to_kill:
        (program_counter, kill_cmd) = programs_to_kill[host]
        kill_cmd_with_sleep = kill_cmd + "; /bin/sleep 3"
        try:
            utils.debug("Trying to run kill command: {} on host {}".format(
                kill_cmd, host))
            self.kill_remote_process(kill_cmd_with_sleep, host,
                                     machine_config)
        except Exception:
            utils.warn("Failed to run kill command: "
                       "{}".format(kill_cmd_with_sleep))
            exit(1)
        try:
            proc_map[program_counter].join()
        except Exception:
            utils.warn("Failed to join process {}".format(program_counter))

    # now the experiment is over, so record experiment metadata
    for record_path in record_paths:
        yaml_record = record_paths[record_path]
        with open(record_path, 'w') as file:
            yaml.dump(yaml_record, file)

    if any_failed:
        utils.error("One of the programs failed.")
        return False
    return True
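# Illustrative sketch (not taken from the repo) of the exp_config shape that
# run() above assumes: a "time" budget, a "commands" list giving per-program
# start offsets, and a "programs" map whose entries carry hosts, start/stop
# command templates, an optional perf prefix, and log paths. Every name, path,
# and placeholder below is a hypothetical example; the real templates are
# filled in from get_program_args().
_EXAMPLE_EXP_CONFIG = {
    "time": 30,  # experiment duration, passed through as exp_time
    "commands": [
        {"program": "start_server", "begin": 0},
        {"program": "start_client", "begin": 5},
    ],
    "programs": {
        "start_server": {
            "hosts": ["server0"],
            "start": "./server --config {config_file}",
            "stop": "sudo pkill -f server",      # present => killed at the end
            "perf": "perf record -o {folder}/server.perf.data",  # optional
            "log": {
                "out": "{folder}/server.out.log",
                "err": "{folder}/server.err.log",
                "record": "{folder}/server.record.yaml",  # run() assumes this exists
            },
        },
        "start_client": {
            "hosts": ["client0", "client1"],
            "start": "./client --rate {rate} --time {time}",
            # no "stop": the process is joined once it exits on its own
            "log": {
                "out": "{folder}/{host}.out.log",
                "err": "{folder}/{host}.err.log",
                "record": "{folder}/{host}.record.yaml",
            },
        },
    },
}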