def load(config_file=None, bucket=None, app_name=None, trial=None) -> (pd.DataFrame, dict):
    if config_file is not None:
        with open(path_join(_root_dir, config_file), "r") as f:
            spec = yaml.safe_load(f)
        bucket = spec["metadata"]["resultBucket"] if bucket is None else bucket
        app_name = spec["appName"] if app_name is None else app_name
        trial = spec["metadata"]["name"] if trial is None else trial
    else:
        assert not (bucket is None or trial is None or app_name is None)
        spec = {
            "metadata": {
                "name": trial,
                "appName": app_name,
            },
        }

    prefix = s3.path_join(app_name, trial)
    objs = s3.list_objects(bucket, prefix)
    for o in objs:
        if o == s3.path_join(app_name, trial, "df.pickle"):
            return s3.download_object_as(
                bucket, o,
                lambda x: pd.read_pickle(x).infer_objects()), spec
    print(f"load: unable to find trial: {app_name} {trial}")
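# Usage sketch (an illustration, not part of the original module): load() returns a
# (DataFrame, spec) tuple when the trial's df.pickle exists on S3, otherwise None.
# The config file name and identifiers below are placeholder assumptions.
def _example_load():
    loaded = load(config_file="trial.yaml")  # resolve bucket/app/trial from the spec
    if loaded is None:
        return
    df, spec = loaded
    print(spec["metadata"]["name"], df.shape)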
def run(run_config: dict, wrks: dict) -> dict:
    """ Run wrk2 to benchmark nginx. """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        nginx_servers = wrks[sit][:ns]
        nginx_clients = wrks[cit][ns: ns + nc]
    else:
        nginx_servers = wrks[sit]
        nginx_clients = wrks[cit]

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    web_dir = "/var/www/html"
    nginx_dir = "/etc/nginx"
    test_file_name = "test.txt"
    test_file = f"{web_dir}/{test_file_name}"
    config_file = f"{nginx_dir}/perfd-nginx.conf"

    # Step 0: clear up
    print(rmt.clean_default_apps(nginx_servers + nginx_clients,
                                 extra_app=["nginx", "wrk"]))
    print(rmt.cmd_remote(nginx_servers + nginx_clients,
                         cmd_=f"sudo chmod 777 {nginx_dir}; "
                              f"sudo chmod 777 {web_dir}; "
                              f"sudo rm -rf {test_file} || true >/dev/null 2>&1; "
                              f"sudo rm -rf {config_file} || true >/dev/null 2>&1",
                         out=True))

    # Step 1: prepare files according to given input scale
    file_src = run_config.get("fileSource", "/dev/zero")
    rmt.cmd_remote(nginx_servers,
                   cmd_=f"sudo head -c {run_config['fileSize']}KB {file_src} > {test_file}")
    print(f"run: nginx servers at public IPs {nginx_servers}")

    # Step 2: update the nginx config; start the nginx servers
    config_str = _nginx_config.replace("{numWorkerProc}",
                                       str(run_config["numWorkerProc"]))
    rmt.cmd_remote(nginx_servers,
                   cmd_=f"cat > {config_file} << EOF {config_str}")
    if len(nginx_servers) > 1:
        # TODO: multiple servers (may not be needed as we just need numWorkerProc scaling)
        raise Exception("run: unimplemented multiple server")
    else:
        svc_ip = ex_ip_to_in_ip[nginx_servers[0]]
    rmt.cmd_remote(nginx_servers, cmd_=f"sudo nginx -c {config_file}")

    # Step 3: start the wrk2
    _cmd = f"wrk -t{run_config['numClientThread']} " \
           f"-c{run_config['numConn']} " \
           f"-d{run_config['duration']}s " \
           f"-R{run_config['reqRate']} http://{svc_ip}/{test_file_name} --latency"
    print("run:", _cmd)

    start = time.time()
    raws = list(map(lambda x: x.decode("utf-8"),
                    rmt.cmd_remote(nginx_clients, cmd_=_cmd, out=True)))
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raws[0])

    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        def parse_time(_t):
            # normalize wrk latencies to milliseconds
            if "us" in _t:
                return float(_t.replace("us", "")) / 1000
            elif "ms" in _t:
                return float(_t.replace("ms", ""))
            elif "s" in _t:
                return float(_t.replace("s", "")) * 1000
            else:
                return float(_t.replace("s", ""))

        results = dict()
        state = "start"
        for l in raw_.split("\n"):
            # state transition
            if "HdrHistogram" in l:
                state = "cdf"
            if "#[Mean" in l:
                state = "stat"
            # line parsing
            if state == "cdf":
                if "50.000%" in l:
                    results["lat_50pc"] = parse_time(l.split()[-1])
                elif "75.000%" in l:
                    results["lat_75pc"] = parse_time(l.split()[-1])
                elif "99.000%" in l:
                    results["lat_99pc"] = parse_time(l.split()[-1])
            elif state == "stat":
                if "Requests/sec" in l:
                    results["rps"] = float(l.split()[-1])
                if "Transfer/sec" in l:
                    tput = l.split()[-1]
                    if "MB" in tput:
                        tput = float(tput.replace("MB", ""))
                        results["throughput"] = tput * _mb
                    elif "KB" in tput:
                        tput = float(tput.replace("KB", ""))
                        results["throughput"] = tput * _kb
                    elif "GB" in tput:
                        tput = float(tput.replace("GB", ""))
                        results["throughput"] = tput * _gb
                if "#[Mean" in l:
                    results["lat_mean"] = parse_time(l.split()[2].rstrip(","))
                    results["lat_std"] = parse_time(l.split()[5].rstrip("]"))
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        for k, v in ag.items():
            # sum
            if k in {
                "rps",
                "throughput",
            }:
                ag[k] = sum(ag[k])
            # default to avg
            else:
                ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(_raw) for _raw in raws]))
    print("run: results", r)

    # pair wise throughput info
    if run_config.get("iperfProfile", False):
        iperf_server = nginx_servers[0]
        iperf_client = nginx_clients[0]
        rmt.cmd_remote([iperf_server, iperf_client],
                       cmd_=f"sudo apt install iperf3 -y")
        iperf.start_server(iperf_server)
        out = iperf.start_client(iperf_client, iperf_server, out=True)
        tput = out["end"]["sum_received"]["bits_per_second"] / (1024 * 1024)
        r.update({
            "avg_client_server_tput": tput,
        })

    # pair wise latency info
    lat = mean(ping.bipartite_lats(nginx_servers,
                                   [ex_ip_to_in_ip[i] for i in nginx_clients]))
    # lat and timestamp
    r.update({
        "avg_client_server_lat": lat,
        "machinetime": time.time(),
        "datetime": datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
    })
    r.update(run_config)
    return r
def make_trial_config(config_file: str):
    def expand(matrix):
        # validate matrix
        max_col_len = 1
        for v in matrix.values():
            if type(v) == list and len(v) > max_col_len:
                max_col_len = len(v)
        for v in matrix.values():
            if type(v) == list and len(v) != 1 and len(v) != max_col_len:
                exit_str(
                    msg="expand: (i) all rows in trial config must be either of "
                        "length 1 or the same length as the longest list; "
                        "or check if (ii) randomConfigSampling is set")
        ex = dict()
        for k, v in matrix.items():
            if type(v) != list:
                ex[k] = [v] * max_col_len
            elif len(v) == 1:
                ex[k] = v * max_col_len
            else:
                # TODO: mark this as varying feature
                # TODO: randomize the order
                ex[k] = v
        return ex, max_col_len

    def explode(matrix, prefix=""):
        col_sizes = {len(v) for k, v in matrix.items()}
        assert len(col_sizes) == 1, "explode: columns have different sizes"
        num_row = list(col_sizes)[0]
        _rows, ctr = list(), 0
        while ctr < num_row:
            _rows.append({prefix + k: v[ctr] for k, v in matrix.items()})
            ctr += 1
        return _rows

    def sample_combination(pool: dict, num_sample=200):
        import itertools
        import random

        kls = list()
        for k, v in pool.items():
            kl = list()
            if type(v) is list:
                for i in v:
                    kl.append((k, i))
            else:
                kl.append((k, v))
            kls.append(kl)
        flattened_pool = list(itertools.product(*kls))
        _rows = list()
        for _f in random.sample(flattened_pool, num_sample):
            _rows.append({_k: v for _k, v in _f})
        return _rows

    class TrialConfig:
        # TODO: a robust schema for TrialConfig
        def __init__(self):
            self.trial_id = None
            self.meta_data = None
            self.meta_config = None
            self.resource_config = None
            self.app_config = None
            self.all_config = None
            self.num_run = 0
            self.num_trial = 0
            self._runs = None
            self._igs = None

        @property
        def runs(self):
            # turns a trial config to a list of run configs.
            def get_ins_type_num(c):
                itn, aux_num, aux_type = defaultdict(int), dict(), dict()
                available_ins = {ig["instanceType"] for ig in self.ins_groups}
                for k, v in c.items():
                    if k not in self.resource_config:
                        continue
                    if k.startswith("num"):
                        name = k.replace("num", "")
                        aux_num[name.lower()] = v
                    elif k.endswith("Type"):
                        name = k.replace("Type", "")
                        assert v in available_ins, "trial: config error, " \
                                                   "missing instance type"
                        aux_type[name.lower()] = v
                for n, num in aux_num.items():
                    assert n in aux_type, "trial: config error, " \
                                          "inconsistent resourceConfig"
                    itn[aux_type[n]] += num
                return itn

            if self._runs is None:
                runs = list()
                num_sample = self.meta_data.get("randomConfigSampling", None)
                if num_sample is None:
                    rcs = explode(self.all_config)
                else:
                    rcs = sample_combination(self.all_config, num_sample)
                for rc in rcs:
                    igs = get_ins_type_num(rc)
                    feature_digest = hashlib.md5(str(rc).encode()).hexdigest()
                    for i in range(int(rc["numRun"])):
                        _rc = dict(rc)
                        # extra run configs to be added below
                        _rc["feature_digest"] = feature_digest
                        _rc["logBucket"] = self.meta_data["logBucket"]
                        _rc["debug"] = self.meta_data.get("debug", False)
                        # TODO: fix run id, identify the varying feature
                        _rc["run_id"] = s3.path_join(self.trial_id, str(i))
                        _rc["ins_type_num"] = igs
                        runs.append(_rc)
                self._runs = runs
            return self._runs

        @property
        def ins_groups(self):
            if self._igs is None:
                image, tenancy, igs = "", "", list()
                for ig in self.meta_data["instanceGroup"]:
                    image = ig.get("image", image)
                    tenancy = ig.get("tenancy", tenancy)
                    assert image != "" and tenancy != "", \
                        "trial: missing tenancy or image in config"
                    ig.update({
                        "image": image,
                        "tenancy": tenancy,
                    })
                    igs.append(ig)
                self._igs = igs
                self.meta_data["tenancy"] = tenancy
            return self._igs

        def validate(self):
            # required fields
            for k in {"tenancy", "logBucket", "resultBucket"}:
                assert k in self.meta_data
            for k in {"numRun"}:
                assert k in self.all_config
            print("trial: config validated.")
            # TODO: add more validation

    tc = TrialConfig()

    # parse configs
    with open(path_join(_dir, config_file), "r") as f:
        raw = yaml.safe_load(f)

    tc.meta_data = raw["metadata"]
    tc.meta_data["instanceGroup"] = raw["spec"]["instanceGroup"]

    if tc.meta_data.get("randomConfigSampling", None):
        preproc = lambda x: (x, -1)
    else:
        preproc = expand

    # TODO: add shuffle option when preproc is expand
    tc.meta_config, _ = preproc(raw["spec"]["metaConfig"])
    tc.resource_config, _ = preproc(raw["spec"]["resourceConfig"])
    tc.app_config, _ = preproc(raw["spec"]["appConfig"])
    tc.all_config, tc.num_trial = preproc({
        **raw["spec"]["metaConfig"],
        **raw["spec"]["resourceConfig"],
        **raw["spec"]["appConfig"],
    })

    tc.trial_id = s3.path_join(tc.meta_data["appName"],
                               tc.meta_data["name"],
                               s3.timestamp())
    _ = tc.runs
    _ = tc.ins_groups
    tc.num_run = len(tc.runs)
    tc.validate()
    return tc
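# Shape of the parsed trial spec that make_trial_config() expects (a hedged
# reconstruction from the parsing code above; values and the appConfig entry are
# placeholder assumptions, and optional metadata flags are only noted in comments).
_example_raw_spec = {
    "metadata": {
        "name": "trial-0",
        "appName": "nginx",
        "logBucket": "my-log-bucket",
        "resultBucket": "my-result-bucket",
        # optional: "randomConfigSampling", "debug", "verbosity", "autoscale", "clusterdown"
    },
    "spec": {
        "instanceGroup": [
            {"instanceType": "c5.xlarge", "image": "ami-placeholder", "tenancy": "default"},
        ],
        "metaConfig": {"numRun": 3},
        "resourceConfig": {
            "numServerInstance": [1, 2],
            "serverInstanceType": "c5.xlarge",
            "numClientInstance": 1,
            "clientInstanceType": "c5.xlarge",
        },
        "appConfig": {"fileSize": [1, 10]},
    },
}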
def trial(config_file):
    # TODO: Trial warm starting (e.g., if fail at run)
    # TODO: Mark on S3 for trial completion
    # TODO: add clusterdown option in place of autoscale option
    # TODO: infer best instance numbers to provision
    # TODO: allow multiple k8s clusters on the same
    #  access machine, e.g., via kube context or pass in
    #  different k8s configuration files
    # TODO: checkpoint incremental run results
    # TODO: make placement group part of the configuration
    print("trial: start with config file", config_file)

    # load trial specs
    tc = make_trial_config(config_file)

    _app_name = tc.meta_data["appName"]
    if _app_name not in support_app_runner:
        runner = None
        exit_str("unsupported application %s, missing runner" % _app_name)
    else:
        runner = support_app_runner[_app_name]

    global _verbosity
    _verbosity = tc.meta_data.get("verbosity", 0)
    if _verbosity > 0:
        print("trial: %d trials and %d runs" % (tc.num_trial, tc.num_run))
        if _verbosity > 1:
            print("trial: overview of runs and resources")
            pp.pprint(tc.runs)
            pp.pprint(tc.ins_groups)

    ray.init(log_to_driver=True)

    # prepare cluster
    if kops.is_up():
        print("trial: k8s running; check if instance groups match..")
        if tc.meta_data.get("autoscale", True):
            kops_scaler.autoscale(tc.ins_groups)
        print("trial: k8s clearing up..")
        k8s.delete_all_pods()
        k8s.clean_all_vc()
    else:
        print("trial: k8s not ready; starting one..")
        kops.create_and_start(tc.ins_groups)

    def create_if_not_exist(bucket, empty=False):
        if not s3.bucket_exist(bucket):
            s3.create_bucket(bucket)
        elif empty:
            print(f"trial: empty bucket {bucket}")
            s3.empty_bucket(bucket)

    # TODO: fix the logs (currently of no use except for spark)
    create_if_not_exist(tc.meta_data["logBucket"], empty=True)
    create_if_not_exist(tc.meta_data["resultBucket"])

    # start runs
    start = time.time()
    results = sched(tc.runs, runner)
    print(f"trial: runs complete in {time.time() - start}s.")

    # post proc
    exp_path = s3.path_join(tc.meta_data["appName"], tc.meta_data["name"])
    df = postproc(results, tc)
    if tc.meta_data.get("debug", False):
        with pd.option_context('display.max_rows', None,
                               'display.max_columns', None):
            print("trial debug:\n", df)

    # TODO: handle spark experiments result parsing
    # TODO: replace the ad-hoc solution below
    if tc.meta_data["appName"] == "spark":
        if tc.meta_data.get("sparkBench", "spark-sql-perf") == "spark-sql-perf":
            from thirdparty.microps.oracle.feature.featuredb import remove_training_data
            from thirdparty.microps.examples.spark_sql_perf.dataset_gen import gen
            from thirdparty.microps.examples.spark_sql_perf.cmd import load_app_dfs

            app_name = "sparkperf-" + tc.app_config["appName"][0]
            bucket_name = tc.meta_data["logBucket"]
            print("debug:", bucket_name)

            remove_training_data(db_name=bucket_name, app_id=app_name)
            gen(bucket_name)
            app_df, _ = load_app_dfs(bucket_name)
            df = app_df[app_name]
            print(df)

    # upload results
    s3.dump_and_upload_file(df,
                            bucket=tc.meta_data["resultBucket"],
                            key=s3.path_join(exp_path, "df.pickle"))
    s3.dump_and_upload_file("",
                            bucket=tc.meta_data["logBucket"],
                            key=s3.path_join(tc.trial_id, "_SUCCEED"))
    print("trial: results uploaded.")

    if tc.meta_data.get("clusterdown", False):
        kops.delete_cluster()
        print("trial: cluster down.")
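# Entry-point sketch (an assumption; the original CLI wrapper, if any, is not shown
# here). "nginx-trial.yaml" is a placeholder path relative to the config directory.
def _example_trial_entry():
    trial("nginx-trial.yaml")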
def run(run_config: dict, wrks: dict) -> dict:
    """ Run wrk2 to benchmark go-fasthttp, light-4j, akka. """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        msvc_servers = wrks[sit][:ns]
        msvc_clients = wrks[cit][ns: ns + nc]
    else:
        msvc_servers = wrks[sit]
        msvc_clients = wrks[cit]

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    nginx_dir = "/etc/nginx"
    config_file = f"{nginx_dir}/perfd-microsvc.conf"
    web_dir = "/var/www/html"
    test_file_name = "test.txt"
    test_file = f"{web_dir}/{test_file_name}"

    fm = run_config["frameworkName"]
    assert fm in fm_cmds, f"run: unsupported framework {fm}"
    fm_dir = f"/home/admin/microservice/{fm}"
    fm_cmd = fm_cmds[fm]

    # Step 0: clear up
    print(rmt.clean_default_apps(msvc_servers + msvc_clients,
                                 extra_app=[fm_proc[run_config["frameworkName"]], "wrk"]))
    print(rmt.cmd_remote(msvc_clients,
                         cmd_=f"sudo chmod 777 {nginx_dir}; "
                              f"sudo rm -rf {test_file} || true >/dev/null 2>&1; "
                              f"sudo rm -rf {config_file} || true >/dev/null 2>&1",
                         out=True))

    # Step 1: run msvc servers
    file_src = run_config.get("fileSource", "/dev/zero")
    file_size = run_config.get("fileSize", 1)
    rmt.cmd_remote(msvc_servers,
                   cmd_=f"sudo head -c {file_size}KB {file_src} > {test_file}")
    print(rmt.cmd_remote(msvc_servers,
                         cmd_=f"cd {fm_dir}; "
                              f"{fm_cmd} > /dev/null 2>&1 &",
                         out=True))
    print(f"run: {fm_cmd}")
    print("run: waiting for the server to be ready..")
    time.sleep(5)
    print(f"run: msvc servers at public IPs {msvc_servers}")

    # Step 2: start the nginx for client side load balancing
    config_str = _nginx_config.replace(
        "{SERVER_ENTRY}",
        "\n".join([f"server {ex_ip_to_in_ip[s]}:8080;" for s in msvc_servers]))
    rmt.cmd_remote(msvc_clients,
                   cmd_=f"cat > {config_file} << EOF {config_str}")
    print(rmt.cmd_remote(msvc_clients,
                         cmd_=f"sudo nginx -c {config_file}", out=True))
    time.sleep(1)

    # Step 3: start the wrk2
    _cmd = f"wrk -t{run_config['numClientThread']} " \
           f"-c{run_config['numConn']} " \
           f"-d{run_config['duration']}s " \
           f"-R{run_config['reqRate']} http://localhost:80 --latency"
    print("run:", _cmd, "at", msvc_clients)

    start = time.time()
    raws = list(map(lambda x: x.decode("utf-8"),
                    rmt.cmd_remote(msvc_clients, cmd_=_cmd, out=True)))
    print(f"run: finished in {time.time() - start}s")
    print("run: sample result:\n", raws[0])

    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        def parse_time(_t):
            # normalize wrk latencies to milliseconds
            if "us" in _t:
                return float(_t.replace("us", "")) / 1000
            elif "ms" in _t:
                return float(_t.replace("ms", ""))
            elif "s" in _t:
                return float(_t.replace("s", "")) * 1000
            else:
                return float(_t.replace("s", ""))

        results = dict()
        state = "start"
        for l in raw_.split("\n"):
            # state transition
            if "HdrHistogram" in l:
                state = "cdf"
            if "#[Mean" in l:
                state = "stat"
            # line parsing
            if state == "cdf":
                if "50.000%" in l:
                    results["lat_50pc"] = parse_time(l.split()[-1])
                elif "75.000%" in l:
                    results["lat_75pc"] = parse_time(l.split()[-1])
                elif "99.000%" in l:
                    results["lat_99pc"] = parse_time(l.split()[-1])
            elif state == "stat":
                if "Requests/sec" in l:
                    results["rps"] = float(l.split()[-1])
                if "Transfer/sec" in l:
                    tput = l.split()[-1]
                    if "MB" in tput:
                        tput = float(tput.replace("MB", ""))
                        results["throughput"] = tput * _mb
                    elif "KB" in tput:
                        tput = float(tput.replace("KB", ""))
                        results["throughput"] = tput * _kb
                    elif "GB" in tput:
                        tput = float(tput.replace("GB", ""))
                        results["throughput"] = tput * _gb
                if "#[Mean" in l:
                    results["lat_mean"] = parse_time(l.split()[2].rstrip(","))
                    results["lat_std"] = parse_time(l.split()[5].rstrip("]"))
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        for k, v in ag.items():
            # sum
            if k in {
                "rps",
                "throughput",
            }:
                ag[k] = sum(ag[k])
            # default to avg
            else:
                ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(_raw) for _raw in raws]))
    print("run: results", r)
    r.update(run_config)

    # pair wise latency info
    lat = mean(ping.bipartite_lats(msvc_servers,
                                   [ex_ip_to_in_ip[i] for i in msvc_clients]))
    r.update({
        "avg_client_server_lat": lat,
    })
    return r
def run(run_config: dict, wrks: dict) -> dict:
    """
    Run memcached benchmark with fixed configurations.

    Returns a dict of k-v results for this run.
    """
    def validate():
        for k in {
            "keySize", "valueSize", "serverThread",
            "clientThread", "runTime", "waitTime", "warmupTime"
        }:
            assert k in run_config, f"run: missing config entry '{k}', abort"

    validate()
    rid = run_config["run_id"]

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        mcd_servers = wrks[sit][:ns]
        mut_clients = wrks[cit][ns:ns + nc]
    else:
        mcd_servers = wrks[sit]
        mut_clients = wrks[cit]

    # install deps and clean up
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")
    print(
        rmt.clean_default_apps(mcd_servers + mut_clients,
                               extra_app=["memcached", "mutilate"],
                               docker_cont=["memcached"]))

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    # Step 1: start the memcached servers
    # get memcached server IPs (internal VPC IP); we are
    # not using a load balancer here, mutilate does client-side
    # load balancing already
    port = 11211
    server_ex_ips = mcd_servers
    server_in_ips = [ex_ip_to_in_ip[e] for e in server_ex_ips]
    client_ex_ips = mut_clients
    client_in_ips = [ex_ip_to_in_ip[i] for i in client_ex_ips]

    num_server_thread = run_config.get("serverThread", -1)
    if num_server_thread < 0:
        num_server_thread = aws_resource_map[
            run_config["serverInstanceType"]]["vCPUs"]
        run_config["serverThread"] = num_server_thread

    # demux server runner type, default run on bare metal
    runner_type = run_config.get("runner", "bare")
    if runner_type == "bare":
        cmd_ = f"memcached -t {num_server_thread} -c 32768 > /dev/null 2>&1 & "
        rmt.cmd_remote(mcd_servers, cmd_=cmd_)
    elif runner_type == "docker":
        # default tag: 1.4.33
        tag = run_config.get("tag", "1.4.33")
        # run the container
        cmd_ = f"sudo docker run --name memcached -d -p {port}:{port} memcached:{tag} " \
               f"memcached -t {num_server_thread} -c 32768 > /dev/null 2>&1 & "
        rmt.cmd_remote(mcd_servers, cmd_=cmd_)
        # wait a bit for the container to be ready
        time.sleep(5)
        print(f"run: docker image memcached:{tag}")
    else:
        raise Exception(f"run: unknown runner type {runner_type}")
    print(f"run: using {runner_type} runner type")
    print(
        f"run: memcached servers at internal IPs {server_in_ips}, "
        f"public IPs {server_ex_ips} with {cmd_}"
    )

    # Step 2: start the mutilate agents
    master = mut_clients[0]
    agents = mut_clients[1:]
    if len(agents) >= 1:
        _cmd_agent = f"mutilate -T {run_config['clientThread']} " \
                     f"-K {run_config['keySize']} " \
                     f"-V {run_config['valueSize']} " \
                     f"-c 4 " \
                     f"-A > /dev/null 2>&1 & "
        print("run: agents", agents, _cmd_agent)
        rmt.cmd_remote(agents, cmd_=_cmd_agent)

    # Step 3: start the mutilate master runner
    # TODO: add input distribution knob
    def make_master_cmd():
        server_str = " ".join([f"-s {si}:{port}" for si in server_in_ips])
        agent_str = " ".join([f"-a {ex_ip_to_in_ip[ax]}" for ax in agents])
        option_str = f"-T {run_config['clientThread']} " \
                     f"-K {run_config['keySize']} " \
                     f"-V {run_config['valueSize']} " \
                     f"-t {run_config['runTime']} " \
                     f"-w {run_config['warmupTime']} " \
                     f"-c 1 " \
                     f"-W {run_config['waitTime']} --noload"
        return f"mutilate {server_str} --loadonly", \
               f"mutilate {server_str} {agent_str} {option_str}"

    _cmd_load, _cmd_run = make_master_cmd()
    print("run: master", master, _cmd_run)

    start = time.time()
    rmt.cmd_remote([master], cmd_=_cmd_load)
    raw = rmt.cmd_remote([master], cmd_=_cmd_run, out=True)[0].decode("utf-8")
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raw)

    # Step 4: upload logs
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raw,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    # Step 5: parse and aggregate results
    def parse(_raw) -> dict:
        _raw = _raw.split("\n")
        results = dict()
        for l in _raw:
            vs = l.split()
            if len(vs) < 1:
                continue
            v_type, v = vs[0], None
            if v_type == "read":
                v = {
                    "avg_lat_read": vs[1],
                    "std_lat_read": vs[2],
                    "min_lat_read": vs[3],
                    "99th_lat_read": vs[8],
                }
            elif v_type.startswith("Total"):
                v = {"qps": vs[3]}
            elif v_type.startswith("RX"):
                v = {"rx_goodput": vs[-2]}
            elif v_type.startswith("TX"):
                v = {"tx_goodput": vs[-2]}
            if v is not None:
                results.update(v)
        return results

    r = dict()
    r.update(parse(raw))
    print("run: results", r)
    r.update(run_config)

    # pair wise latency info
    lat = mean(ping.bipartite_lats(mcd_servers, client_in_ips))
    r.update({
        "avg_client_server_lat": lat,
    })
    # debugging info
    r.update({
        "debug_num_server": len(mcd_servers),
        "debug_num_client": len(mut_clients),
        "debug_num_agent": len(agents),
        "debug_client_ex_IPs": mut_clients,
        "debug_server_ex_IPs": mcd_servers,
        "debug_client_in_IPs": client_in_ips,
        "debug_server_in_IPs": server_in_ips,
    })
    return r
def run(run_config: dict, wrks) -> dict:
    """ Run inch to benchmark influxdb. """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        influx_servers = wrks[sit][:ns]
        influx_clients = wrks[cit][ns:ns + nc]
    else:
        influx_servers = wrks[sit]
        influx_clients = wrks[cit]

    nginx_dir = "/etc/nginx"
    config_file = f"{nginx_dir}/perfd-influxdb.conf"
    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    # Step 0: clear up
    print(
        rmt.clean_default_apps(influx_servers + influx_clients,
                               extra_app=["influxd", "inch"],
                               docker_cont=["influxd"]))
    print(
        rmt.cmd_remote(influx_clients,
                       cmd_=f"sudo chmod 777 {nginx_dir}; "
                            f"sudo rm -rf {config_file} || true >/dev/null 2>&1",
                       out=True))

    # Step 1: run influxd servers
    # reference: docker run -p 8086:8086 \
    #   -v $PWD:/var/lib/influxdb \
    #   influxdb
    # demux run commands based on runner type
    runner_type = run_config.get("runner", "bare")
    if runner_type == "bare":
        cmd_ = f"sudo influxd > /dev/null 2>&1 &"
    elif runner_type == "docker":
        tag = run_config.get("tag", "1.7.10")
        cmd_ = f"sudo docker run --name influxd -d -p 8086:8086 " \
               f"-v $PWD:/var/lib/influxdb influxdb:{tag} > /dev/null 2>&1 &"
        print(f"run: use docker image influxdb:{tag}")
    else:
        raise Exception(f"run: unknown runner type {runner_type}")

    # start servers
    print(f"run: using {runner_type} runner type with command {cmd_}")
    print(rmt.cmd_remote(influx_servers, cmd_=cmd_, out=True))
    print(f"run: influxd servers at public IPs {influx_servers}")
    print("run: waiting for the server to be ready..")
    time.sleep(5)

    # Step 2: start the nginx for client side load balancing
    config_str = _nginx_config.replace(
        "{SERVER_ENTRY}",
        "\n".join(
            [f"server {ex_ip_to_in_ip[s]}:8086;" for s in influx_servers]))
    rmt.cmd_remote(influx_clients,
                   cmd_=f"cat > {config_file} << EOF {config_str}")
    print(
        rmt.cmd_remote(influx_clients,
                       cmd_=f"sudo nginx -c {config_file}",
                       out=True))

    # Step 3: start the inch
    _cmd = f"for r in {{1..{run_config['numReq']}}}; do inch " \
           f"-host http://localhost:80 " \
           f"-p {run_config['numPointPerSeries']} & \n done; wait"
    print("run:", _cmd, f"at public IPs {influx_clients}")

    start = time.time()
    raws = list(
        map(lambda x: x.decode("utf-8"),
            rmt.cmd_remote(influx_clients, cmd_=_cmd, out=True)))
    # print("debug:", raws)
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raws[0])

    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        results = dict()
        lats = list()
        for l in raw_.split("\n"):
            if "Total time:" in l:
                lats.append(float(l.split()[2]))
        for k, v in stats(lats).items():
            results[f"query_latency_{k}"] = v
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        for k, v in ag.items():
            # sum
            if k in {}:
                ag[k] = sum(ag[k])
            # default to avg
            else:
                ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(_raw) for _raw in raws]))
    print("run: results", r)
    r.update(run_config)

    # pair wise latency info
    lat = mean(
        ping.bipartite_lats(influx_servers,
                            [ex_ip_to_in_ip[i] for i in influx_clients]))
    r.update({
        "avg_client_server_lat": lat,
    })
    return r