def run_cmd_sudo(self, cmd, host, machine_config, fail_ok=False, return_dict=None, proc_counter=None):
    """Run `cmd` with sudo on `host` over a fresh connection.

    Arguments:
    * cmd - shell command line to execute remotely.
    * host - hostname key used to open the connection.
    * machine_config - machine-level config passed to new_connection.
    * fail_ok - if True, a failure is not logged as an error.
    * return_dict / proc_counter - when both are provided (used when this
      runs inside a multiprocessing.Process), success (True) or failure
      (False) is recorded at return_dict[proc_counter].

    Returns None; outcome is communicated only via return_dict.
    """
    cxn = self.new_connection(host, machine_config)
    try:
        # hide=False streams remote stdout/stderr to our terminal
        cxn.sudo(cmd, hide=False)
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
        # still propagate
        if not fail_ok:
            utils.error("Failed to run cmd {} on host {}.".format(
                cmd, host))
        if return_dict is not None and proc_counter is not None:
            return_dict[proc_counter] = False
        return
    # success path: report True exactly once (original duplicated this
    # block, leaving one copy unreachable)
    if return_dict is not None and proc_counter is not None:
        return_dict[proc_counter] = True
def get_hosts(self, program, programs_metadata):
    """Return the list of hosts that should run `program`.

    The server always runs on the first configured host; client hosts are
    selected by get_iteration_clients. Exits on an unknown program name.
    """
    if program == "start_server":
        # single server: first host listed in the metadata
        return [programs_metadata[program]["hosts"][0]]
    if program == "start_client":
        candidates = programs_metadata[program]["hosts"]
        return self.get_iteration_clients(candidates)
    utils.error("Unknown program name: {}".format(program))
    exit(1)
    return []
def find_rate(self, client_options, host):
    """Return the packet rate assigned to `host`.

    self.client_rates is a list of (rate, num_clients) pairs; it is
    expanded so the i-th entry of client_options gets the i-th rate slot.
    Exits with an error if the host is not a known client option (or if
    there are fewer rate slots than client options).
    """
    # expand (rate, count) pairs into one rate per client slot
    rates = [rate for rate, num in self.client_rates for _ in range(num)]
    try:
        rate_idx = client_options.index(host)
        return rates[rate_idx]
    except (ValueError, IndexError):
        # was a bare `except:`; only the lookup failures are expected here
        utils.error("Host {} not found in client options {}.".format(
            host, client_options))
        exit(1)
def get_iterations(self, total_args):
    """Return the list of ScatterGatherIterations to run.

    "individual" mode: one iteration at the requested rate / client count,
    with its trial number set past any already-completed trials.
    Sweep mode: for every (segment_size, num_mbufs) point, enqueue the
    plain, with-copy, and as-one variants at the capped line rate.
    """
    if total_args.exp_type == "individual":
        # cast for consistency with the other experiment classes, which
        # treat max_clients as possibly-string yaml data
        if total_args.num_clients > int(self.config_yaml["max_clients"]):
            utils.error(
                "Cannot have {} clients, greater than max {}".format(
                    total_args.num_clients,
                    self.config_yaml["max_clients"]))
            exit(1)
        client_rates = [(total_args.rate, total_args.num_clients)]
        it = ScatterGatherIteration(client_rates,
                                    total_args.segment_size,
                                    total_args.num_mbufs,
                                    total_args.with_copy,
                                    total_args.as_one)
        # resume numbering after trials already recorded on disk
        num_trials_finished = utils.parse_number_trials_done(
            it.get_parent_folder(total_args.folder))
        it.set_trial(num_trials_finished)
        return [it]
    ret = []
    for trial in range(utils.NUM_TRIALS):
        for segment_size in SEGMENT_SIZES_TO_LOOP:
            for num_mbufs in range(1, MBUFS_MAX + 1):
                # cap the pps rate implied by line rate for this total
                # payload size
                rate = utils.get_tput_pps(MAX_RATE_GBPS,
                                          segment_size * num_mbufs)
                rate = min(MAX_RATE_PPS, rate)
                # each sweep point runs plain, with-copy, and as-one
                # (same order as before: plain, wc, as_one)
                for with_copy, as_one in ((False, False),
                                          (True, False),
                                          (True, True)):
                    ret.append(ScatterGatherIteration(
                        [(rate, 1)], segment_size, num_mbufs,
                        with_copy, as_one, trial=trial))
    return ret
def get_program_args(self, folder, program, host, config_yaml, programs_metadata, exp_time):
    """Build the template-substitution dict for `program`'s command line.

    Keys produced depend on the program: the server needs its own IP and
    copy flag; the client additionally needs server address/MAC, its
    assigned rate, experiment time, and a latency log path. Exits on an
    unknown program name.
    """
    args = {}
    if program == "start_server":
        args["cornflakes_dir"] = config_yaml["cornflakes_dir"]
        args["server_ip"] = config_yaml["hosts"][host]["ip"]
        args["with_copy"] = " --with_copy" if self.with_copy else ""
        args["folder"] = str(folder)
        return args
    if program == "start_client":
        # with_copy / segment_size / num_mbufs depend on the copy mode:
        # "as one" collapses the scattered segments into a single buffer
        if self.with_copy:
            args["with_copy"] = " --with_copy"
            if self.as_one:
                args["as_one"] = "as_one"
                args["segment_size"] = self.segment_size * self.num_mbufs
                args["num_mbufs"] = 1
            else:
                args["segment_size"] = self.segment_size
                args["num_mbufs"] = self.num_mbufs
        else:
            args["with_copy"] = ""
            args["segment_size"] = self.segment_size
            args["num_mbufs"] = self.num_mbufs
        # rate is keyed off this host's slot among the iteration's clients
        candidates = self.get_iteration_clients(
            programs_metadata[program]["hosts"])
        server_host = programs_metadata["start_server"]["hosts"][0]
        args.update({
            "cornflakes_dir": config_yaml["cornflakes_dir"],
            "server_ip": config_yaml["hosts"][server_host]["ip"],
            "host_ip": config_yaml["hosts"][host]["ip"],
            "server_mac": config_yaml["hosts"][server_host]["mac"],
            "rate": self.find_rate(candidates, host),
            "time": exp_time,
            "latency_log": "{}.latency.log".format(host),
            "host": host,
            "folder": str(folder),
        })
        return args
    utils.error("Unknown program name: {}".format(program))
    exit(1)
    return args
def get_program_args(self, folder, program, host, config_yaml, programs_metadata, exp_time):
    """Build the template-substitution dict for `program` on `host`.

    All programs share the serialization / message / size settings; the
    server's receive-mode flags come from self.recv_mode, while the
    client always uses zero-copy receive and additionally gets its
    assigned rate, the server IP, and the experiment time.
    """
    args = {
        "cornflakes_dir": config_yaml["cornflakes_dir"],
        "config_file": config_yaml["config_file"],
        "library": self.serialization,
        "folder": str(folder),
        "server_message": self.server_message,
        "message": self.message_type,
        "server_size": self.server_size,
        "size": self.size,
    }
    if program == "start_server":
        # exactly one of the two flags is non-empty, chosen by recv_mode
        # NOTE(review): "--copy_to_dma_memory" has no leading space unlike
        # " -z" — presumably the command template supplies it; verify.
        args["zero_copy_recv"] = (
            " -z" if self.recv_mode == "zero_copy_recv" else "")
        args["copy_to_dma_memory"] = (
            "--copy_to_dma_memory"
            if self.recv_mode == "copy_to_dma_memory" else "")
    elif program == "start_client":
        args["zero_copy_recv"] = " -z"  # always have zero_copy_recv on
        args["copy_to_dma_memory"] = ""
        # rate is keyed off this host's slot among the iteration's clients
        candidates = self.get_iteration_clients(
            programs_metadata[program]["hosts"])
        args["rate"] = self.find_rate(candidates, host)
        server_host = programs_metadata["start_server"]["hosts"][0]
        args["server_ip"] = config_yaml["hosts"][server_host]["ip"]
        args["time"] = exp_time
        args["host"] = host
    else:
        utils.error("Unknown program name: {}".format(program))
        exit(1)
    return args
def get_trial_string(self):
    """Return the folder-name component for this iteration's trial number.

    Exits with an error if set_trial was never called for this iteration.
    """
    if self.trial is None:  # `is None` rather than `== None` (PEP 8)
        utils.error("TRIAL IS NOT SET FOR ITERATION.")
        exit(1)
    return "trial_{}".format(self.trial)
def get_iterations(self, total_args):
    """Return the list of DsQueryIterations to run for this invocation.

    "individual" mode runs one configuration (re-cloned per finished trial
    when only analyzing/graphing); otherwise a full sweep over message
    types, sizes, serialization libraries, receive modes and client rates
    is generated.
    """
    if total_args.exp_type == "individual":
        # max_clients may be yaml string data, hence the int() cast
        if total_args.num_clients > int(self.config_yaml["max_clients"]):
            utils.error(
                "Cannot have {} clients, greater than max {}".format(
                    total_args.num_clients,
                    self.config_yaml["max_clients"]))
            exit(1)
        client_rates = [(total_args.rate, total_args.num_clients)]
        it = DsQueryIteration(client_rates,
                              total_args.server_size,
                              total_args.size,
                              total_args.serialization,
                              total_args.server_message_type,
                              total_args.message_type,
                              total_args.recv_mode)
        # count trials already recorded on disk so numbering resumes
        num_trials_finished = utils.parse_number_trials_done(
            it.get_parent_folder(total_args.folder))
        if total_args.analysis_only or total_args.graph_only:
            # not running anything: emit one clone per completed trial
            # so analysis/graphing can find every existing result folder
            ret = []
            for i in range(0, num_trials_finished):
                it_clone = copy.deepcopy(it)
                it_clone.set_trial(i)
                ret.append(it_clone)
            return ret
        it.set_trial(num_trials_finished)
        return [it]
    else:
        # loop over the options
        ret = []
        for trial in range(3):
            for server_message_type in MESSAGE_TYPES:
                for client_message_type in ["single"]:
                    for server_size in SIZES_TO_LOOP:
                        for client_size in CLIENT_SIZES_TO_LOOP:
                            for serialization in SERIALIZATION_LIBRARIES:
                                # clients never request more than is stored
                                if client_size > server_size:
                                    continue
                                # skip known-unsupported combination
                                # (presumably exceeds what these libraries
                                # can serialize at this size — confirm)
                                if server_size == 8192\
                                        and server_message_type == "tree-5"\
                                        and (serialization == "cornflakes-dynamic"
                                             or serialization == "cornflakes-1cdynamic"):
                                    continue
                                recv_modes = ["zero_copy_recv"]
                                for recv_mode in recv_modes:
                                    # for client rates:
                                    # do some testing to determine optimal rates
                                    client_rates = [[(24000, 1)],
                                                    [(48000, 1)],
                                                    [(72000, 1)],
                                                    [(96000, 1)]]
                                    # higher offered loads spread over
                                    # progressively more client machines
                                    for i in range(
                                            2,
                                            int(self.config_yaml[
                                                "max_clients"])):
                                        client_rates.append([(100000, i)])
                                    for i in range(
                                            1,
                                            int(self.config_yaml[
                                                "max_clients"])):
                                        client_rates.append([(120000, i)])
                                    # TODO: how do we get "mid range"
                                    # e.g.: points that exactly determine
                                    # where the knee is
                                    for rate in client_rates:
                                        it = DsQueryIteration(
                                            rate, server_size, client_size,
                                            serialization,
                                            server_message_type,
                                            client_message_type,
                                            recv_mode, trial=trial)
                                        ret.append(it)
        return ret
def run(self, folder, exp_config, machine_config, pprint, program_version_info, use_perf=False):
    """
    Runs the actual program.
    Arguments:
    * folder - Path that all logfiles from this iteration should go.
    * exp_config - Experiment yaml that contains command lines. Assumes
    this contains a set of programs to run, each with a list of
    corresponding hosts that can run that command line.
    * machine_config - Machine level config yaml.
    * pprint - Instead of running, just print out command lines.
    * program_version_info - Metadata about the commit version of the
    repo at time of experiment.
    * use_perf - Whether to use perf or not when running the server.

    Returns True when every immediately-joined program succeeded,
    False otherwise.
    """
    # programs joined as soon as they finish vs. programs that need an
    # explicit remote kill command at the end of the experiment
    programs_to_join_immediately = {}
    programs_to_kill = {}
    # map from start time (in seconds) to list
    # of programs with that start time
    programs_by_start_time = defaultdict(list)
    # assumes program processes to be executed are in order in the yaml
    commands = exp_config["commands"]
    programs = exp_config["programs"]
    exp_time = exp_config["time"]
    # map from record-file path -> metadata yaml to dump after the run
    record_paths = {}
    # map from a program id to the actual process
    program_counter = 0
    proc_map = {}
    # NOTE(review): this plain dict is immediately shadowed by the
    # manager dict below — dead assignment
    status_dict = {}
    manager = mp.Manager()
    # shared across worker processes: program id -> success bool
    status_dict = manager.dict()
    # spawn the commands
    for command in commands:
        program_name = command["program"]
        program = programs[program_name]
        # NOTE(review): program_hosts is never used below
        program_hosts = program["hosts"]
        kill_cmd = None
        if "stop" in program:
            kill_cmd = program["stop"]
        for host in self.get_relevant_hosts(program, program_name):
            program_cmd = program["start"]
            # append shell redirections for configured log files
            if "log" in program:
                if "out" in program["log"]:
                    stdout = program["log"]["out"]
                    program_cmd += " > {}".format(stdout)
                if "err" in program["log"]:
                    stderr = program["log"]["err"]
                    program_cmd += " 2> {}".format(stderr)
                if "record" in program["log"]:
                    record_path = program["log"]["record"]
            # substitute per-host/per-iteration values into the template
            program_args = self.get_program_args(folder,
                                                 program_name,
                                                 host,
                                                 machine_config,
                                                 programs,
                                                 exp_time)
            program_cmd = program_cmd.format(**program_args)
            if use_perf and "perf" in program:
                # NOTE(review): args passed without .format() — presumably
                # utils.debug does lazy formatting; confirm
                utils.debug("current program args: {}", program_args)
                perf_cmd = program["perf"].format(**program_args)
                program_cmd = "{} {}".format(perf_cmd, program_cmd)
            # NOTE(review): record_path may be unbound here if a program
            # has no "log"/"record" entry — assumes every program defines
            # one (or reuses the previous host's); verify against configs
            record_path = record_path.format(**program_args)
            # programs with a stop command are expected to be killed, so
            # their (failed) remote command is not treated as an error
            fail_ok = False
            if kill_cmd is not None:
                kill_cmd = kill_cmd.format(**program_args)
                fail_ok = True
            yaml_record = {"host": host,
                           "args": program_args,
                           "command": program_cmd,
                           "stop_command": kill_cmd,
                           "version_info": program_version_info}
            if pprint:
                # dry-run: just show what would be executed
                utils.debug(
                    "Host {}: \n\t - Running Cmd: {}\n\t - Stopped by: {}".
                    format(host, program_cmd, kill_cmd))
            else:
                record_paths[record_path] = yaml_record
                # each remote command runs in its own local process
                proc = mp.Process(target=self.run_cmd_sudo,
                                  args=(program_cmd,
                                        host,
                                        machine_config,
                                        fail_ok,
                                        status_dict,
                                        program_counter))
                start_time = int(command["begin"])
                proc_map[program_counter] = proc
                programs_by_start_time[start_time].append(
                    (kill_cmd, program_counter, program_name, host,
                     program_args))
                program_counter += 1
    # now start each start program
    cur_time = 0
    program_start_times = sorted(programs_by_start_time.keys())
    for start_time in program_start_times:
        # sleep until this batch's relative start offset
        if start_time != cur_time:
            time.sleep(start_time - cur_time)
            cur_time = start_time
        progs = programs_by_start_time[start_time]
        for info in progs:
            kill_cmd = info[0]
            program_counter = info[1]
            program_name = info[2]
            host = info[3]
            proc = proc_map[program_counter]
            program_args = info[4]
            utils.debug("Starting program {} on host {}, args: {}".format(
                program_name, host, program_args))
            proc.start()
            # programs without a stop command terminate on their own
            if kill_cmd == None:
                programs_to_join_immediately[host] = program_counter
            else:
                programs_to_kill[host] = (program_counter, kill_cmd)
    any_failed = False
    # now join all of the joining programs
    for host in programs_to_join_immediately:
        prog_counter = programs_to_join_immediately[host]
        proc = proc_map[prog_counter]
        # NOTE(review): Process.join() always returns None; `res` is dead
        res = proc.join()
        status = status_dict[prog_counter]
        if not status:
            any_failed = True
        utils.debug("Host {} done; status: {}".format(host, status))
    # now kill the rest of the programs
    for host in programs_to_kill:
        (program_counter, kill_cmd) = programs_to_kill[host]
        try:
            # trailing sleep gives the remote process time to die before
            # we join the local wrapper process
            kill_cmd_with_sleep = kill_cmd + "; /bin/sleep 3"
            utils.debug("Trying to run kill command: {} on host {}".format(
                kill_cmd, host))
            self.kill_remote_process(kill_cmd_with_sleep, host,
                                     machine_config)
        except:
            utils.warn("Failed to run kill command:",
                       "{}".format(kill_cmd_with_sleep))
            exit(1)
        try:
            proc_map[program_counter].join()
        except:
            utils.warn(
                "Failed to run join command: {}".format(program_counter))
    # now, experiment is over, so record experiment metadata
    for record_path in record_paths:
        yaml_record = record_paths[record_path]
        with open(record_path, 'w') as file:
            yaml.dump(yaml_record, file)
            # NOTE(review): redundant — the `with` block already closes
            file.close()
    if any_failed:
        utils.error("One of the programs failed.")
        return False
    return True
def get_iterations(self, total_args):
    """Return the list of EchoBenchIterations for this invocation.

    "individual" mode runs one configuration (or re-emits one clone per
    already-finished trial when only analyzing/graphing); otherwise the
    full sweep over message types, sizes, serialization libraries,
    receive modes and client rates is generated.
    """
    if total_args.exp_type == "individual":
        limit = int(self.config_yaml["max_clients"])
        if total_args.num_clients > limit:
            utils.error(
                "Cannot have {} clients, greater than max {}".format(
                    total_args.num_clients,
                    self.config_yaml["max_clients"]))
            exit(1)
        base_it = EchoBenchIteration(
            [(total_args.rate, total_args.num_clients)],
            total_args.size,
            total_args.serialization,
            total_args.message_type,
            total_args.recv_mode)
        done = utils.parse_number_trials_done(
            base_it.get_parent_folder(total_args.folder))
        if total_args.analysis_only or total_args.graph_only:
            # not running anything: one clone per completed trial so
            # analysis can locate every existing result folder
            clones = []
            for trial_idx in range(done):
                clone = copy.deepcopy(base_it)
                clone.set_trial(trial_idx)
                clones.append(clone)
            return clones
        base_it.set_trial(done)
        return [base_it]
    # sweep mode: enumerate every combination of the options
    ret = []
    skip_libs = ("cornflakes-dynamic", "cornflakes-1cdynamic")
    for trial in range(3):
        for message_type in MESSAGE_TYPES:
            for size in SIZES_TO_LOOP:
                for serialization in SERIALIZATION_LIBRARIES:
                    # known-unsupported combination
                    if (size == 8192 and message_type == "tree-5"
                            and serialization in skip_libs):
                        continue
                    for recv_mode in RECV_TYPES:
                        max_clients = int(self.config_yaml["max_clients"])
                        # base single-client rates, then higher offered
                        # loads spread over more client machines
                        rate_options = [[(24000, 1)], [(48000, 1)],
                                        [(72000, 1)], [(96000, 1)]]
                        rate_options += [[(100000, i)]
                                         for i in range(2, max_clients)]
                        rate_options += [[(120000, i)]
                                         for i in range(1, max_clients)]
                        for rates in rate_options:
                            ret.append(EchoBenchIteration(
                                rates, size, serialization,
                                message_type, recv_mode, trial=trial))
    return ret