def graph_results(self, folder, logfile):
    cornflakes_repo = self.config_yaml["cornflakes_dir"]
    plotting_script = Path(cornflakes_repo) / \
        "experiments" / "plotting_scripts" / "sg_bench.R"
    plot_path = Path(folder) / "plots"
    plot_path.mkdir(exist_ok=True)
    full_log = Path(folder) / logfile
    # plot one graph per segment size
    for size in SEGMENT_SIZES_TO_LOOP:
        output_file = plot_path / "segsize_{}.pdf".format(size)
        args = [
            str(plotting_script),
            str(full_log),
            str(size),
            str(output_file)
        ]
        try:
            sh.run(args)
        except Exception:
            utils.warn("Failed to run plot command: {}".format(args))
    # plot heatmap across all segment sizes
    heatmap_script = Path(cornflakes_repo) / "experiments" / \
        "plotting_scripts" / "sg_bench_map.R"
    heatmap_file = plot_path / "heatmap.pdf"
    args = [str(heatmap_script), str(full_log), str(heatmap_file)]
    try:
        sh.run(args)
    except Exception:
        utils.warn("Failed to run heatmap plot command: {}".format(args))
Example #2
def run_iterations(self, total_args, iterations):
    """
    Loops over the iterations and runs each experiment.
    """
    program_version_info = self.get_program_version_info()
    folder_path = Path(total_args.folder)
    ct = 0
    total = len(iterations)
    start = time.time()
    # initial estimate: roughly 25 seconds per iteration, reported in hours
    expected_time = 25 * total / 3600.0
    for iteration in iterations:
        ct += 1
        utils.info(
            "Running iteration #{} of {} ({:.1f}% done); "
            "expected time to finish: {:.2f} hours".format(
                ct, total, float(ct - 1) / float(total) * 100.0,
                expected_time))
        if iteration.get_folder_name(folder_path).exists():
            utils.info("Iteration already exists, skipping: "
                       "{}".format(iteration))
            continue
        if not total_args.pprint:
            iteration.create_folder(folder_path)
        utils.debug("Running iteration: ", iteration)
        status = iteration.run(iteration.get_folder_name(folder_path),
                               self.get_exp_config(),
                               self.get_machine_config(),
                               total_args.pprint, program_version_info,
                               total_args.use_perf)
        if not status:
            time.sleep(5)
            for i in range(utils.NUM_RETRIES):
                utils.debug(
                    "Retrying failed iteration (attempt {} of {}).".format(
                        i + 1, utils.NUM_RETRIES))
                status = iteration.run(
                    iteration.get_folder_name(folder_path),
                    self.get_exp_config(), self.get_machine_config(),
                    total_args.pprint, program_version_info,
                    total_args.use_perf)
                if status:
                    break
        if not status:
            utils.warn(
                "Failed to execute program after {} retries.".format(
                    utils.NUM_RETRIES))
            exit(1)
        # sleep before the next trial
        time.sleep(5)
        # refine the ETA from the average time per completed iteration
        now = time.time()
        total_so_far = now - start
        expected_time = (float(total_so_far) / ct * total) / 3600.0
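The ETA logic above starts from a flat guess of 25 seconds per iteration and then re-estimates the finish time from the observed average. The same arithmetic in isolation (a sketch with made-up numbers):

# ETA arithmetic from run_iterations, standalone; numbers are illustrative.
total = 1440                            # iterations planned
expected_time = 25 * total / 3600.0     # initial guess: 25 s each -> 10.0 hours

elapsed = 1800.0                        # seconds spent on the first ct iterations
ct = 50                                 # iterations finished so far
# refined estimate: observed seconds per iteration, scaled to the full run
expected_time = (elapsed / ct * total) / 3600.0   # 36 s each -> 14.4 hours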
Example #3
def parse_log_info(log):
    if not os.path.exists(log):
        utils.warn("Path {} does not exist.".format(log))
        return {}
    ret = {}
    with open(log, 'r') as f:
        lines = [line.strip() for line in f.readlines()]
        for line in lines:
            # summary line printed by the client at the end of a run
            if "High level sending stats" in line:
                (sent, recved, retries,
                 total_time) = parse_client_time_and_pkts(line)
                ret["sent"] = sent
                ret["recved"] = recved
                ret["retries"] = retries
                ret["total_time"] = total_time
    return ret
def parse_log_info(log):
    if not os.path.exists(log):
        utils.warn("Path {} does not exist.".format(log))
        return {}
    ret = {}
    with open(log) as f:
        lines = [line.strip() for line in f.readlines()]
        for line in lines:
            if line.startswith("Ran for"):
                (total_time, pkts_sent) = parse_client_time_and_pkts(line)
                ret["totaltime"] = float(total_time)
                ret["pkts_sent"] = int(pkts_sent)
            elif line.startswith("Num fully received"):
                pkts_received = parse_client_pkts_received(line)
                ret["pkts_received"] = int(pkts_received)
    return ret
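Both versions of parse_log_info delegate to a parse_client_time_and_pkts helper whose implementation is not shown. A hypothetical sketch for the second version, assuming the summary line looks like `Ran for 10.00 seconds, sent 100000 packets` (the real log format may differ):

import re

# Hypothetical helper; the real parse_client_time_and_pkts lives in the repo
# and the exact wording of the log line is an assumption.
def parse_client_time_and_pkts(line):
    match = re.search(r"Ran for ([0-9.]+) seconds, sent ([0-9]+) packets", line)
    if match is None:
        return ("0", "0")
    # returned as strings; the caller casts to float/int
    return (match.group(1), match.group(2))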
    def run_analysis_individual_trial(self,
                                      higher_level_folder,
                                      program_metadata,
                                      iteration,
                                      print_stats=False):
        exp_folder = iteration.get_folder_name(higher_level_folder)
        # parse each client log
        total_achieved_load = 0
        total_offered_rate = 0
        total_offered_load = 0
        total_achieved_rate = 0
        client_latency_lists = []
        clients = iteration.get_iteration_clients(
            program_metadata["start_client"]["hosts"])

        for host in clients:
            args = {"folder": str(exp_folder), "host": host}

            stdout_log = program_metadata["start_client"]["log"]["out"].format(
                **args)
            stdout_info = parse_log_info(stdout_log)
            if stdout_info == {}:
                utils.warn("Error parsing stdout log {}".format(stdout_log))
                return ""

            run_metadata_log = program_metadata["start_client"]["log"][
                "record"].format(**args)
            run_info = utils.parse_command_line_args(run_metadata_log)
            if run_info == {}:
                utils.warn("Error parsing yaml run info for {}".format(
                    run_metadata_log))
                return ""

            latency_log = "{folder}/{host}.latency.log".format(**args)
            latencies = utils.parse_latency_log(latency_log, STRIP_THRESHOLD)
            if latencies == []:
                utils.warn("Error parsing latency log {}".format(latency_log))
                return ""
            client_latency_lists.append(latencies)

            host_offered_rate = float(run_info["args"]["rate"])
            total_offered_rate += host_offered_rate
            host_offered_load = float(
                utils.get_tput_gbps(host_offered_rate,
                                    iteration.get_total_size()))
            total_offered_load += host_offered_load

            host_pkts_sent = stdout_info["pkts_sent"]
            host_total_time = stdout_info["totaltime"]
            host_achieved_rate = float(host_pkts_sent) / float(host_total_time)
            total_achieved_rate += host_achieved_rate
            host_achieved_load = utils.get_tput_gbps(
                host_achieved_rate, iteration.get_total_size())
            total_achieved_load += host_achieved_load

            # convert latencies from nanoseconds to microseconds
            host_p99 = utils.p99_func(latencies) / 1000.0
            host_p999 = utils.p999_func(latencies) / 1000.0
            host_median = utils.median_func(latencies) / 1000.0
            host_avg = utils.mean_func(latencies) / 1000.0

            if print_stats:
                utils.info(
                    "Client {}: "
                    "offered load: {:.2f} req/s | {:.2f} Gbps, "
                    "achieved load: {:.2f} req/s | {:.2f} Gbps, "
                    "percentage achieved rate: {:.3f}, "
                    "avg latency: {:.2f} us, p99: {:.2f} us, "
                    "p999: {:.2f} us, median: {:.2f} us".format(
                        host, host_offered_rate, host_offered_load,
                        host_achieved_rate, host_achieved_load,
                        float(host_achieved_rate / host_offered_rate),
                        host_avg, host_p99, host_p999, host_median))
        # compute total stats across clients; heapq.merge assumes each
        # per-client latency list is already sorted
        sorted_latencies = list(heapq.merge(*client_latency_lists))
        median = utils.median_func(sorted_latencies) / float(1000)
        p99 = utils.p99_func(sorted_latencies) / float(1000)
        p999 = utils.p999_func(sorted_latencies) / float(1000)
        avg = utils.mean_func(sorted_latencies) / float(1000)

        if print_stats:
            total_stats = "offered load: {:.2f} req/s | {:.2f} Gbps, "  \
                "achieved load: {:.2f} req/s | {:.2f} Gbps, " \
                "percentage achieved rate: {:.3f}, " \
                "avg latency: {:.2f} us, p99: {:.2f} us, " \
                "p999: {:.2f} us, median: {:.2f} us".format(
                    total_offered_rate, total_offered_load,
                    total_achieved_rate, total_achieved_load,
                    float(total_achieved_rate / total_offered_rate),
                    avg, p99, p999, median)
            utils.info("Total Stats: ", total_stats)
        percent_achieved_rate = float(total_achieved_rate / total_offered_rate)
        # latencies go back from microseconds to nanoseconds for the csv line
        csv_line = "{},{},{},{},{},{},{},{},{},{},{},{},{}".format(
            iteration.get_segment_size(), iteration.get_num_mbufs(),
            iteration.get_with_copy(), iteration.get_as_one(),
            total_offered_rate, total_offered_load, total_achieved_rate,
            total_achieved_load, percent_achieved_rate, avg * 1000,
            median * 1000, p99 * 1000, p999 * 1000)
        return csv_line
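The returned csv_line has thirteen positional fields and no header; any header row would be written by the surrounding harness. A hypothetical header constant matching the field order above:

# Hypothetical header for the 13-field csv_line above (assumed names).
CSV_HEADER = ",".join([
    "segment_size", "num_mbufs", "with_copy", "as_one",
    "offered_rate_pps", "offered_load_gbps",
    "achieved_rate_pps", "achieved_load_gbps", "percent_achieved_rate",
    "avg_ns", "median_ns", "p99_ns", "p999_ns",
])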
Example #6
def graph_results(self, folder, logfile):
    utils.warn("Graphing not implemented yet")
Example #7
    def run_analysis_individual_trial(self,
                                      higher_level_folder,
                                      program_metadata,
                                      iteration,
                                      print_stats=False):
        exp_folder = iteration.get_folder_name(higher_level_folder)

        # parse stdout logs
        total_offered_load_pps = 0
        total_offered_load_gbps = 0
        total_achieved_load_gbps = 0
        total_achieved_load_pps = 0
        total_retries = 0
        client_latency_lists = []
        clients = iteration.get_iteration_clients(
            program_metadata["start_client"]["hosts"])

        for host in clients:
            args = {"folder": str(exp_folder), "host": host}
            stdout_log = program_metadata["start_client"]["log"]["out"].format(
                **args)
            stdout_info = parse_log_info(stdout_log)
            if stdout_info == {}:
                utils.warn("Error parsing stdout log {}".format(stdout_log))
                return ""
            run_metadata_log = program_metadata["start_client"]["log"][
                "record"].format(**args)
            run_info = utils.parse_command_line_args(run_metadata_log)
            if run_info == {}:
                utils.warn("Error parsing yaml run info for {}".format(
                    run_metadata_log))
                return ""

            latency_log = "{folder}/{host}.latency.log".format(**args)
            latencies = utils.parse_latency_log(latency_log, STRIP_THRESHOLD)
            if latencies == []:
                utils.warn("Error parsing latency log {}".format(latency_log))
                return ""
            client_latency_lists.append(latencies)

            host_offered_load = float(run_info["args"]["rate"])
            total_offered_load_pps += host_offered_load
            host_offered_load_gbps = float(
                utils.get_tput_gbps(host_offered_load,
                                    iteration.get_server_size()))
            total_offered_load_gbps += host_offered_load_gbps
            host_pkts_recved = stdout_info["recved"]
            host_total_time = stdout_info["total_time"]
            host_achieved_load = float(host_pkts_recved) / float(
                host_total_time)
            total_achieved_load_pps += host_achieved_load
            host_achieved_load_gbps = float(
                utils.get_tput_gbps(host_achieved_load,
                                    iteration.get_server_size()))
            total_achieved_load_gbps += host_achieved_load_gbps

            # convert to microseconds
            host_p99 = utils.p99_func(latencies) / 1000.0
            host_p999 = utils.p999_func(latencies) / 1000.0
            host_median = utils.median_func(latencies) / 1000.0
            host_avg = utils.mean_func(latencies) / 1000.0

            # add retries
            retries = int(stdout_info["retries"])
            total_retries += retries

            if print_stats:
                utils.info(
                    "Client {}: "
                    "offered load: {:.4f} req/s | {:.4f} Gbps, "
                    "achieved load: {:.4f} req/s | {:.4f} Gbps, "
                    "percentage achieved rate: {:.4f}, "
                    "retries: {}, "
                    "avg latency: {:.4f} \u03BCs, p99: {:.4f} \u03BCs, "
                    "p999: {:.4f} \u03BCs, median: {:.4f} \u03BCs".format(
                        host, host_offered_load, host_offered_load_gbps,
                        host_achieved_load, host_achieved_load_gbps,
                        float(host_achieved_load / host_offered_load), retries,
                        host_avg, host_p99, host_p999, host_median))

        sorted_latencies = list(heapq.merge(*client_latency_lists))
        median = utils.median_func(sorted_latencies) / float(1000)
        p99 = utils.p99_func(sorted_latencies) / float(1000)
        p999 = utils.p999_func(sorted_latencies) / float(1000)
        avg = utils.mean_func(sorted_latencies) / float(1000)

        if print_stats:
            total_stats = "offered load: {:.4f} req/s | {:.4f} Gbps, "  \
                "achieved load: {:.4f} req/s | {:.4f} Gbps, " \
                "percentage achieved rate: {:.4f}, " \
                "retries: {}, " \
                "avg latency: {:.4f} \u03BCs, p99: {:.4f} \u03BCs, " \
                "p999: {:.4f} \u03BCs, median: {:.4f} \u03BCs".format(
                    total_offered_load_pps, total_offered_load_gbps,
                    total_achieved_load_pps, total_achieved_load_gbps,
                    float(total_achieved_load_pps / total_offered_load_pps),
                    total_retries,
                    avg, p99, p999, median)
            utils.info("Total Stats: ", total_stats)
        percent_achieved_load = float(total_achieved_load_pps /
                                      total_offered_load_pps)
        csv_line = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}".format(
            iteration.get_serialization(), iteration.get_server_message_type(),
            iteration.get_message_type(), iteration.get_server_size(),
            iteration.get_size(), iteration.get_recv_string(),
            total_offered_load_pps, total_offered_load_gbps,
            total_achieved_load_pps, total_achieved_load_gbps,
            percent_achieved_load, total_retries, avg, median, p99, p999)
        return csv_line
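heapq.merge only yields a globally sorted sequence when each input is itself sorted (presumably what parse_latency_log guarantees); merging k sorted lists is O(n log k) instead of the O(n log n) of re-sorting the concatenation. A self-contained illustration:

import heapq

# Each per-client list must already be sorted for heapq.merge to be correct.
client_a = [10, 40, 90]
client_b = [20, 30, 100]
merged = list(heapq.merge(client_a, client_b))
assert merged == [10, 20, 30, 40, 90, 100]

# A percentile is then an index lookup on the merged list; the exact method
# inside utils.p99_func is not shown, this is one plausible definition.
p99 = merged[min(int(len(merged) * 0.99), len(merged) - 1)]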
Example #8
    def run(self,
            folder,
            exp_config,
            machine_config,
            pprint,
            program_version_info,
            use_perf=False):
        """
        Runs the actual program.
        Arguments:
            * folder - Path that all logfiles from this iteration should go.
            * exp_config - Experiment yaml that contains command lines. Assumes
            this contains a set of programs to run, each with a list of
            corresponding hosts that can run that command line.
            * machine_config - Machine level config yaml.
            * pprint - Instead of running, just print out command lines.
            * program_version_info - Metadata about the commit version of the
            repo at time of experiment.
            * use_perf - Whether to use perf or not when running the server.
        """
        programs_to_join_immediately = {}
        programs_to_kill = {}
        # map from start time (in seconds) to list
        # of programs with that start time
        programs_by_start_time = defaultdict(list)

        # assumes the program processes to execute are listed in order in the yaml
        commands = exp_config["commands"]
        programs = exp_config["programs"]
        exp_time = exp_config["time"]

        record_paths = {}

        # map from a program id to the actual process
        program_counter = 0
        proc_map = {}
        # shared dict so child processes can report success/failure
        manager = mp.Manager()
        status_dict = manager.dict()
        # spawn the commands
        for command in commands:
            program_name = command["program"]
            program = programs[program_name]
            program_hosts = program["hosts"]
            kill_cmd = None
            if "stop" in program:
                kill_cmd = program["stop"]
            for host in self.get_relevant_hosts(program, program_name):
                program_cmd = program["start"]
                # record_path is per-program; reset it so a program without a
                # "record" log does not reuse the previous loop's value
                record_path = None
                if "log" in program:
                    if "out" in program["log"]:
                        stdout = program["log"]["out"]
                        program_cmd += " > {}".format(stdout)
                    if "err" in program["log"]:
                        stderr = program["log"]["err"]
                        program_cmd += " 2> {}".format(stderr)
                    if "record" in program["log"]:
                        record_path = program["log"]["record"]

                program_args = self.get_program_args(folder, program_name,
                                                     host, machine_config,
                                                     programs, exp_time)
                program_cmd = program_cmd.format(**program_args)
                if use_perf and "perf" in program:
                    utils.debug(
                        "current program args: {}".format(program_args))
                    perf_cmd = program["perf"].format(**program_args)
                    program_cmd = "{} {}".format(perf_cmd, program_cmd)
                if record_path is not None:
                    record_path = record_path.format(**program_args)
                fail_ok = False
                if kill_cmd is not None:
                    kill_cmd = kill_cmd.format(**program_args)
                    fail_ok = True

                yaml_record = {
                    "host": host,
                    "args": program_args,
                    "command": program_cmd,
                    "stop_command": kill_cmd,
                    "version_info": program_version_info
                }

                if pprint:
                    utils.debug(
                        "Host {}: \n\t - Running Cmd: {}\n\t - Stopped by: {}".
                        format(host, program_cmd, kill_cmd))
                else:
                    if record_path is not None:
                        record_paths[record_path] = yaml_record
                    proc = mp.Process(target=self.run_cmd_sudo,
                                      args=(program_cmd, host, machine_config,
                                            fail_ok, status_dict,
                                            program_counter))

                    start_time = int(command["begin"])
                    proc_map[program_counter] = proc
                    programs_by_start_time[start_time].append(
                        (kill_cmd, program_counter, program_name, host,
                         program_args))
                    program_counter += 1
        # now start each start program
        cur_time = 0
        program_start_times = sorted(programs_by_start_time.keys())
        for start_time in program_start_times:
            if start_time != cur_time:
                time.sleep(start_time - cur_time)
            cur_time = start_time
            progs = programs_by_start_time[start_time]
            for info in progs:
                (kill_cmd, program_counter, program_name, host,
                 program_args) = info
                proc = proc_map[program_counter]
                utils.debug("Starting program {} on host {}, args: {}".format(
                    program_name, host, program_args))
                proc.start()
                if kill_cmd is None:
                    programs_to_join_immediately[host] = program_counter
                else:
                    programs_to_kill[host] = (program_counter, kill_cmd)

        any_failed = False
        # now join all of the joining programs
        for host in programs_to_join_immediately:
            prog_counter = programs_to_join_immediately[host]
            proc = proc_map[prog_counter]
            proc.join()
            status = status_dict[prog_counter]
            if not status:
                any_failed = True
            utils.debug("Host {} done; status: {}".format(host, status))

        # now kill the rest of the programs
        for host in programs_to_kill:
            (program_counter, kill_cmd) = programs_to_kill[host]
            try:
                kill_cmd_with_sleep = kill_cmd + "; /bin/sleep 3"
                utils.debug("Trying to run kill command: {} on host {}".format(
                    kill_cmd, host))
                self.kill_remote_process(kill_cmd_with_sleep, host,
                                         machine_config)
            except Exception:
                utils.warn("Failed to run kill command: "
                           "{}".format(kill_cmd_with_sleep))
                exit(1)
            try:
                proc_map[program_counter].join()
            except Exception:
                utils.warn(
                    "Failed to join process {}".format(program_counter))

        # now, experiment is over, so record experiment metadata
        for record_path in record_paths:
            yaml_record = record_paths[record_path]
            with open(record_path, 'w') as file:
                yaml.dump(yaml_record, file)

        if any_failed:
            utils.error("One of the programs failed.")
            return False
        return True
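run() consumes an experiment yaml through the keys commands, programs, and time. A hypothetical config reconstructed from the accesses above, written as the Python dict the yaml would parse into (key names come from the code; hosts, command lines, and paths are invented for illustration):

# Hypothetical exp_config; only the key structure is taken from run() above.
exp_config = {
    "time": 30,  # experiment length, passed around as exp_time
    "commands": [
        {"program": "start_server", "begin": 0},
        {"program": "start_client", "begin": 5},
    ],
    "programs": {
        "start_server": {
            "hosts": ["server1"],
            "start": "./server --config {config_path}",
            "stop": "sudo pkill server",                  # optional
            "perf": "perf record -o {folder}/perf.data",  # optional
            "log": {
                "out": "{folder}/{host}.out.log",
                "err": "{folder}/{host}.err.log",
                "record": "{folder}/{host}.record.yaml",
            },
        },
        "start_client": {
            "hosts": ["client1", "client2"],
            "start": "./client --rate {rate}",
            "log": {"out": "{folder}/{host}.out.log"},
        },
    },
}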