Example #1
def load(config_file=None,
         bucket=None,
         app_name=None,
         trial=None) -> (pd.DataFrame, dict):
    if config_file is not None:
        with open(path_join(_root_dir, config_file), "r") as f:
            spec = yaml.safe_load(f)
            if bucket is None:
                bucket = spec["metadata"]["resultBucket"]
            if app_name is None:
                app_name = spec["appName"]
            if trial is None:
                trial = spec["metadata"]["name"]
    else:
        assert not (bucket is None or trial is None or app_name is None), \
            "load: bucket, app_name, and trial are all required without a config file"
        # mirror the on-disk spec layout: appName is top-level, name under metadata
        spec = {
            "appName": app_name,
            "metadata": {
                "name": trial,
            },
        }
    prefix = s3.path_join(app_name, trial)
    objs = s3.list_objects(bucket, prefix)
    for o in objs:
        if o == s3.path_join(app_name, trial, "df.pickle"):
            return s3.download_object_as(
                bucket, o, lambda x: pd.read_pickle(x).infer_objects()), spec
    print(f"load: unable to find trial: {app_name} {trial}")
Example #2
        def runs(self):
            # turns a trial config into a list of run configs.
            def get_ins_type_num(c):
                itn, aux_num, aux_type = defaultdict(int), dict(), dict()
                available_ins = {ig["instanceType"] for ig in self.ins_groups}

                for k, v in c.items():
                    if k not in self.resource_config:
                        continue
                    if k.startswith("num"):
                        name = k.replace("num", "")
                        aux_num[name.lower()] = v
                    elif k.endswith("Type"):
                        name = k.replace("Type", "")
                        assert v in available_ins, "trial: config error, " \
                                                   "missing instance type"
                        aux_type[name.lower()] = v
                for n, num in aux_num.items():
                    assert n in aux_type, "trial: config error, " \
                                          "inconsistent resourceConfig"
                    itn[aux_type[n]] += num
                return itn

            if self._runs is None:
                runs = list()
                num_sample = self.meta_data.get("randomConfigSampling", None)

                if num_sample is None:
                    rcs = explode(self.all_config)
                else:
                    rcs = sample_combination(self.all_config, num_sample)

                for rc in rcs:
                    igs = get_ins_type_num(rc)
                    feature_digest = hashlib.md5(str(rc).encode()).hexdigest()
                    for i in range(int(rc["numRun"])):
                        _rc = dict(rc)
                        # extra run configs to be added below
                        _rc["feature_digest"] = feature_digest
                        _rc["logBucket"] = self.meta_data["logBucket"]
                        _rc["debug"] = self.meta_data.get("debug", False)
                        # TODO: fix run id, identify the varying feature
                        _rc["run_id"] = s3.path_join(self.trial_id, str(i))
                        _rc["ins_type_num"] = igs
                        runs.append(_rc)
                self._runs = runs
            return self._runs
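
For intuition, a self-contained sketch of the pairing trick in get_ins_type_num above: "numXxx" and "xxxType" keys meet on the lowercased stem, so instance counts land on the right instance type. The config keys here are illustrative assumptions.

from collections import defaultdict

def ins_type_num_sketch(c, resource_keys, available_ins):
    itn, aux_num, aux_type = defaultdict(int), dict(), dict()
    for k, v in c.items():
        if k not in resource_keys:
            continue
        if k.startswith("num"):
            aux_num[k[len("num"):].lower()] = v      # numServerInstance -> serverinstance
        elif k.endswith("Type"):
            assert v in available_ins
            aux_type[k[:-len("Type")].lower()] = v   # serverInstanceType -> serverinstance
    for n, num in aux_num.items():
        itn[aux_type[n]] += num
    return dict(itn)

cfg = {"numServerInstance": 2, "serverInstanceType": "m5.large",
       "numClientInstance": 1, "clientInstanceType": "m5.xlarge"}
print(ins_type_num_sketch(cfg, set(cfg), {"m5.large", "m5.xlarge"}))
# -> {'m5.large': 2, 'm5.xlarge': 1}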
Example #3
def run(run_config: dict, wrks: dict) -> dict:
    """
    Run wrk2 to benchmark nginx.
    """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        nginx_servers = wrks[sit][:ns]
        nginx_clients = wrks[cit][ns: ns + nc]
    else:
        nginx_servers = wrks[sit]
        nginx_clients = wrks[cit]

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()
    web_dir = "/var/www/html"
    nginx_dir = "/etc/nginx"
    test_file_name = "test.txt"
    test_file = f"{web_dir}/{test_file_name}"
    config_file = f"{nginx_dir}/perfd-nginx.conf"

    # Step 0: clear up
    print(rmt.clean_default_apps(nginx_servers + nginx_clients,
                                 extra_app=["nginx", "wrk"]))
    print(rmt.cmd_remote(nginx_servers + nginx_clients,
                         cmd_=f"sudo chmod 777 {nginx_dir}; "
                              f"sudo chmod 777 {web_dir}; "
                              f"sudo rm -rf {test_file} >/dev/null 2>&1 || true; "
                              f"sudo rm -rf {config_file} >/dev/null 2>&1 || true",
                         out=True))

    # Step 1: prepare files according to given input scale
    file_src = run_config.get("fileSource", "/dev/zero")
    rmt.cmd_remote(nginx_servers,
                   cmd_=f"sudo head -c {run_config['fileSize']}KB {file_src} > {test_file}")

    print(f"run: nginx servers at public IPs {nginx_servers}")

    # Step 2: update the nginx config; start the nginx servers
    config_str = _nginx_config.replace("{numWorkerProc}",
                                       str(run_config["numWorkerProc"]))
    rmt.cmd_remote(nginx_servers, cmd_=f"cat > {config_file} << EOF {config_str}")

    if len(nginx_servers) > 1:
        # TODO: multiple servers (may not be needed as we just need numWorkerProc scaling)
        raise Exception("run: unimplemented multiple server")
    else:
        svc_ip = ex_ip_to_in_ip[nginx_servers[0]]
        rmt.cmd_remote(nginx_servers, cmd_=f"sudo nginx -c {config_file}")

    # Step 3: start the wrk2
    _cmd = f"wrk -t{run_config['numClientThread']} " \
           f"-c{run_config['numConn']} " \
           f"-d{run_config['duration']}s " \
           f"-R{run_config['reqRate']} http://{svc_ip}/{test_file_name} --latency"

    print("run:", _cmd)
    start = time.time()
    raws = list(map(lambda x: x.decode("utf-8"),
                    rmt.cmd_remote(nginx_clients, cmd_=_cmd, out=True)))
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raws[0])

    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        def parse_time(_t):
            # normalize latencies to milliseconds
            if "us" in _t:
                return float(_t.replace("us", "")) / 1000
            elif "ms" in _t:
                return float(_t.replace("ms", ""))
            elif "s" in _t:
                return float(_t.replace("s", "")) * 1000
            else:
                return float(_t)  # bare number: assumed already in ms

        results = dict()
        state = "start"
        for l in raw_.split("\n"):
            # state transition
            if "HdrHistogram" in l:
                state = "cdf"
            if "#[Mean" in l:
                state = "stat"
            # line parsing
            if state == "cdf":
                if "50.000%" in l:
                    results["lat_50pc"] = parse_time(l.split()[-1])
                elif "75.000%" in l:
                    results["lat_75pc"] = parse_time(l.split()[-1])
                elif "99.000%" in l:
                    results["lat_99pc"] = parse_time(l.split()[-1])
            elif state == "stat":
                if "Requests/sec" in l:
                    results["rps"] = float(l.split()[-1])
                if "Transfer/sec" in l:
                    tput = l.split()[-1]
                    if "MB" in tput:
                        tput = float(tput.replace("MB", ""))
                        results["throughput"] = tput * _mb
                    elif "KB" in tput:
                        tput = float(tput.replace("KB", ""))
                        results["throughput"] = tput * _kb
                    elif "GB" in tput:
                        tput = float(tput.replace("GB", ""))
                        results["throughput"] = tput * _gb
                if "#[Mean" in l:
                    results["lat_mean"] = parse_time(l.split()[2].rstrip(","))
                    results["lat_std"] = parse_time(l.split()[5].rstrip("]"))
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        # reduce: sum the rate-like metrics, average the rest
        for k, v in ag.items():
            if k in {
                "rps",
                "throughput",
            }:
                ag[k] = sum(ag[k])
            else:
                ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(raw_) for raw_ in raws]))
    print("run: results", r)

    # pair wise throughput info
    if run_config.get("iperfProfile", False):
        iperf_server = nginx_servers[0]
        iperf_client = nginx_clients[0]
        rmt.cmd_remote([iperf_server, iperf_client],
                       cmd_=f"sudo apt install iperf3 -y")

        iperf.start_server(iperf_server)
        out = iperf.start_client(iperf_client, iperf_server, out=True)
        tput = out["end"]["sum_received"]["bits_per_second"] / (1024 * 1024)
        r.update({
            "avg_client_server_tput": tput,
        })

    # pair wise latency info
    lat = mean(ping.bipartite_lats(nginx_servers, [ex_ip_to_in_ip[i] for i in nginx_clients]))

    # lat and timestamp
    r.update({
        "avg_client_server_lat": lat,
        "machinetime": time.time(),
        "datetime": datetime.now().strftime("%m/%d/%Y, %H:%M:%S"),
    })

    r.update(run_config)
    return r
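
A small, runnable check of the unit normalization parse_time performs above (everything converted to milliseconds); the sample strings are synthetic wrk2-style latencies.

def to_ms(t):  # mirrors parse_time above
    if "us" in t:
        return float(t.replace("us", "")) / 1000
    if "ms" in t:
        return float(t.replace("ms", ""))
    if "s" in t:
        return float(t.replace("s", "")) * 1000
    return float(t)

assert to_ms("250.00us") == 0.25
assert to_ms("1.20ms") == 1.2
assert to_ms("2.00s") == 2000.0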
Example #4
def make_trial_config(config_file: str):
    def expand(matrix):
        # validate matrix: every list-valued column must have length 1
        # or the same length as the longest column
        max_col_len = 1
        for v in matrix.values():
            if isinstance(v, list) and len(v) > max_col_len:
                max_col_len = len(v)

        for v in matrix.values():
            if isinstance(v, list) and len(v) != 1 and len(v) != max_col_len:
                exit_str(
                    msg="expand: (i) all rows in trial config must be either of "
                    "length 1 or the same length as the longest list; "
                    "or check if (ii) randomSampling is set")

        ex = dict()
        for k, v in matrix.items():
            if not isinstance(v, list):
                ex[k] = [v] * max_col_len
            elif len(v) == 1:
                ex[k] = v * max_col_len
            else:
                # TODO: mark this as varying feature
                # TODO: randomize the order
                ex[k] = v
        return ex, max_col_len

    def explode(matrix, prefix=""):
        col_sizes = {len(v) for k, v in matrix.items()}
        assert len(col_sizes) == 1, "explode: columns have different sizes"
        num_row = list(col_sizes)[0]

        _rows, ctr = list(), 0
        while ctr < num_row:
            _rows.append({prefix + k: v[ctr] for k, v in matrix.items()})
            ctr += 1
        return _rows

    def sample_combination(pool: dict, num_sample=200):
        import itertools
        import random

        kls = list()
        for k, v in pool.items():
            kl = list()
            if isinstance(v, list):
                for i in v:
                    kl.append((k, i))
            else:
                kl.append((k, v))
            kls.append(kl)
        flattened_pool = list(itertools.product(*kls))

        # random.sample raises ValueError if asked for more samples than exist
        num_sample = min(num_sample, len(flattened_pool))
        return [dict(_f) for _f in random.sample(flattened_pool, num_sample)]

    class TrialConfig:
        # TODO: a robust schema for TrialConfig
        def __init__(self):
            self.trial_id = None

            self.meta_data = None
            self.meta_config = None
            self.resource_config = None
            self.app_config = None
            self.all_config = None

            self.num_run = 0
            self.num_trial = 0

            self._runs = None
            self._igs = None

        @property
        def runs(self):
            # turns a trial config into a list of run configs.
            def get_ins_type_num(c):
                itn, aux_num, aux_type = defaultdict(int), dict(), dict()
                available_ins = {ig["instanceType"] for ig in self.ins_groups}

                for k, v in c.items():
                    if k not in self.resource_config:
                        continue
                    if k.startswith("num"):
                        name = k.replace("num", "")
                        aux_num[name.lower()] = v
                    elif k.endswith("Type"):
                        name = k.replace("Type", "")
                        assert v in available_ins, "trial: config error, " \
                                                   "missing instance type"
                        aux_type[name.lower()] = v
                for n, num in aux_num.items():
                    assert n in aux_type, "trial: config error, " \
                                          "inconsistent resourceConfig"
                    itn[aux_type[n]] += num
                return itn

            if self._runs is None:
                runs = list()
                num_sample = self.meta_data.get("randomConfigSampling", None)

                if num_sample is None:
                    rcs = explode(self.all_config)
                else:
                    rcs = sample_combination(self.all_config, num_sample)

                for rc in rcs:
                    igs = get_ins_type_num(rc)
                    feature_digest = hashlib.md5(str(rc).encode()).hexdigest()
                    for i in range(int(rc["numRun"])):
                        _rc = dict(rc)
                        # extra run configs to be added below
                        _rc["feature_digest"] = feature_digest
                        _rc["logBucket"] = self.meta_data["logBucket"]
                        _rc["debug"] = self.meta_data.get("debug", False)
                        # TODO: fix run id, identify the varying feature
                        _rc["run_id"] = s3.path_join(self.trial_id, str(i))
                        _rc["ins_type_num"] = igs
                        runs.append(_rc)
                self._runs = runs
            return self._runs

        @property
        def ins_groups(self):
            if self._igs is None:
                image, tenancy, igs = "", "", list()
                for ig in self.meta_data["instanceGroup"]:
                    image = ig.get("image", image)
                    tenancy = ig.get("tenancy", tenancy)
                    assert image != "" and tenancy != "", \
                        "trial: missing tenancy or image in config"

                    ig.update({
                        "image": image,
                        "tenancy": tenancy,
                    })
                    igs.append(ig)
                self._igs = igs
                self.meta_data["tenancy"] = tenancy
            return self._igs

        def validate(self):
            # required fields
            for k in {"tenancy", "logBucket", "resultBucket"}:
                assert k in self.meta_data
            for k in {"numRun"}:
                assert k in self.all_config
            print("trial: config validated.")
            # TODO: add more validation

    tc = TrialConfig()

    # parse configs
    with open(path_join(_dir, config_file), "r") as f:
        raw = yaml.safe_load(f)
        tc.meta_data = raw["metadata"]
        tc.meta_data["instanceGroup"] = raw["spec"]["instanceGroup"]

        if tc.meta_data.get("randomConfigSampling", None):
            preproc = lambda x: (x, -1)
        else:
            preproc = expand

        # TODO: add shuffle option when preproc is expand
        tc.meta_config, _ = preproc(raw["spec"]["metaConfig"])
        tc.resource_config, _ = preproc(raw["spec"]["resourceConfig"])
        tc.app_config, _ = preproc(raw["spec"]["appConfig"])
        tc.all_config, tc.num_trial = preproc({
            **raw["spec"]["metaConfig"],
            **raw["spec"]["resourceConfig"],
            **raw["spec"]["appConfig"],
        })
        tc.trial_id = s3.path_join(tc.meta_data["appName"],
                                   tc.meta_data["name"], s3.timestamp())
        _ = tc.runs
        _ = tc.ins_groups
        tc.num_run = len(tc.runs)
        tc.validate()
    return tc
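
A standalone sketch of the expand/explode pipeline above (validation omitted): scalars and length-1 lists broadcast to the longest column before being turned into row dicts. The key names are illustrative.

def expand_sketch(matrix):
    n = max((len(v) for v in matrix.values() if isinstance(v, list)), default=1)
    return {k: (v if isinstance(v, list) and len(v) == n
                else (v if isinstance(v, list) else [v]) * n)
            for k, v in matrix.items()}, n

def explode_sketch(matrix):
    n = len(next(iter(matrix.values())))
    return [{k: v[i] for k, v in matrix.items()} for i in range(n)]

m = {"numConn": [16, 32, 64], "duration": 30, "reqRate": [1000]}
ex, n = expand_sketch(m)      # all columns broadcast to length 3
rows = explode_sketch(ex)
print(rows[0])                # {'numConn': 16, 'duration': 30, 'reqRate': 1000}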
Example #5
def trial(config_file):
    # TODO: Trial warm starting (e.g., if fail at run)
    # TODO: Mark on S3 for trial completion
    # TODO: add clusterdown option in place of autoscale option
    # TODO: infer best instance numbers to provision
    # TODO: allow multiple k8s clusters on the same
    #  access machine, e.g., via kube context or pass in
    #  different k8s configuration files
    # TODO: checkpoint incremental run results
    # TODO: make placementGroup part of the configuration
    print("trial: start with config file", config_file)

    # load trial specs
    tc = make_trial_config(config_file)

    _app_name = tc.meta_data["appName"]
    if _app_name not in support_app_runner:
        exit_str("unsupported application %s, missing runner" % _app_name)
    runner = support_app_runner[_app_name]

    global _verbosity
    _verbosity = tc.meta_data.get("verbosity", 0)
    if _verbosity > 0:
        print("trial: %d trials and %d runs" % (tc.num_trial, tc.num_run))
    if _verbosity > 1:
        print("trial: overview of runs and resources")
        pp.pprint(tc.runs)
        pp.pprint(tc.ins_groups)

    ray.init(log_to_driver=True)

    # prepare cluster
    if kops.is_up():
        print("trial: k8s running; check if instance groups match..")
        if tc.meta_data.get("autoscale", True):
            kops_scaler.autoscale(tc.ins_groups)

        print("trial: k8s clearing up..")
        k8s.delete_all_pods()
        k8s.clean_all_vc()
    else:
        print("trial: k8s not ready; starting one..")
        kops.create_and_start(tc.ins_groups)

    def create_if_not_exist(bucket, empty=False):
        if not s3.bucket_exist(bucket):
            s3.create_bucket(bucket)
        elif empty:
            print(f"trial: empty bucket {bucket}")
            s3.empty_bucket(bucket)

    # TODO: fix the logs (currently of no use except for spark)
    create_if_not_exist(tc.meta_data["logBucket"], empty=True)
    create_if_not_exist(tc.meta_data["resultBucket"])

    # start runs
    start = time.time()
    results = sched(tc.runs, runner)
    print(f"trial: runs complete in {time.time() - start}s.")

    # post proc
    exp_path = s3.path_join(tc.meta_data["appName"], tc.meta_data["name"])
    df = postproc(results, tc)

    if tc.meta_data.get("debug", False):
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            print("trial debug:\n", df)

    # TODO: handle spark experiments result parsing
    # TODO: replace the ad-hoc solution below
    if tc.meta_data["appName"] == "spark":
        if tc.meta_data.get("sparkBench", "spark-sql-perf"):
            from thirdparty.microps.oracle.feature.featuredb import remove_training_data
            from thirdparty.microps.examples.spark_sql_perf.dataset_gen import gen
            from thirdparty.microps.examples.spark_sql_perf.cmd import load_app_dfs

            app_name = "sparkperf-" + tc.app_config["appName"][0]
            bucket_name = tc.meta_data["logBucket"]

            print("debug:", bucket_name)
            remove_training_data(db_name=bucket_name, app_id=app_name)
            gen(bucket_name)
            app_df, _ = load_app_dfs(bucket_name)

            df = app_df[app_name]
            print(df)

    # upload results
    s3.dump_and_upload_file(df,
                            bucket=tc.meta_data["resultBucket"],
                            key=s3.path_join(exp_path, "df.pickle"))
    s3.dump_and_upload_file("",
                            bucket=tc.meta_data["logBucket"],
                            key=s3.path_join(tc.trial_id, "_SUCCEED"))
    print("trial: results uploaded.")

    if tc.meta_data.get("clusterdown", False):
        kops.delete_cluster()
        print("trial: cluster down.")
Example #6
def run(run_config: dict, wrks: dict) -> dict:
    """
    Run wrk2 to benchmark go-fasthttp, light-4j, akka.
    """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        msvc_servers = wrks[sit][:ns]
        msvc_clients = wrks[cit][ns: ns + nc]
    else:
        msvc_servers = wrks[sit]
        msvc_clients = wrks[cit]

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()
    nginx_dir = "/etc/nginx"
    config_file = f"{nginx_dir}/perfd-microsvc.conf"
    web_dir = "/var/www/html"
    test_file_name = "test.txt"
    test_file = f"{web_dir}/{test_file_name}"

    fm = run_config["frameworkName"]
    assert fm in fm_cmds, f"run: unsupported framework {fm}"
    fm_dir = f"/home/admin/microservice/{fm}"
    fm_cmd = fm_cmds[fm]

    # Step 0: clear up
    print(rmt.clean_default_apps(msvc_servers + msvc_clients,
                                 extra_app=[fm_proc[fm], "wrk"]))
    print(rmt.cmd_remote(msvc_clients,
                         cmd_=f"sudo chmod 777 {nginx_dir}; "
                              f"sudo rm -rf {test_file} >/dev/null 2>&1 || true; "
                              f"sudo rm -rf {config_file} >/dev/null 2>&1 || true",
                         out=True))

    # Step 1: run msvc servers
    file_src = run_config.get("fileSource", "/dev/zero")
    file_size = run_config.get("fileSize", 1)
    rmt.cmd_remote(msvc_servers,
                   cmd_=f"sudo head -c {file_size}KB {file_src} > {test_file}")

    print(rmt.cmd_remote(msvc_servers,
                         cmd_=f"cd {fm_dir}; "
                              f"{fm_cmd} > /dev/null 2>&1 &", out=True))
    print(f"run: {fm_cmd}")
    print("run: waiting for the server to be ready..")
    time.sleep(5)
    print(f"run: msvc servers at public IPs {msvc_servers}")

    # Step 2: start the nginx for client side load balancing
    config_str = _nginx_config.replace("{SERVER_ENTRY}",
                                       "\n".join([f"server {ex_ip_to_in_ip[s]}:8080;"
                                                  for s in msvc_servers]))
    rmt.cmd_remote(msvc_clients, cmd_=f"cat > {config_file} << EOF {config_str}")
    print(rmt.cmd_remote(msvc_clients, cmd_=f"sudo nginx -c {config_file}", out=True))
    time.sleep(1)

    # Step 3: start the wrk2
    _cmd = f"wrk -t{run_config['numClientThread']} " \
           f"-c{run_config['numConn']} " \
           f"-d{run_config['duration']}s " \
           f"-R{run_config['reqRate']} http://localhost:80 --latency"
    print("run:", _cmd, "at", msvc_clients)

    start = time.time()
    raws = list(map(lambda x: x.decode("utf-8"),
                    rmt.cmd_remote(msvc_clients, cmd_=_cmd, out=True)))
    print(f"run: finished in {time.time() - start}s")
    print("run: sample result:\n", raws[0])

    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        def parse_time(_t):
            # normalize latencies to milliseconds
            if "us" in _t:
                return float(_t.replace("us", "")) / 1000
            elif "ms" in _t:
                return float(_t.replace("ms", ""))
            elif "s" in _t:
                return float(_t.replace("s", "")) * 1000
            else:
                return float(_t)  # bare number: assumed already in ms

        results = dict()
        state = "start"
        for l in raw_.split("\n"):
            # state transition
            if "HdrHistogram" in l:
                state = "cdf"
            if "#[Mean" in l:
                state = "stat"
            # line parsing
            if state == "cdf":
                if "50.000%" in l:
                    results["lat_50pc"] = parse_time(l.split()[-1])
                elif "75.000%" in l:
                    results["lat_75pc"] = parse_time(l.split()[-1])
                elif "99.000%" in l:
                    results["lat_99pc"] = parse_time(l.split()[-1])
            elif state == "stat":
                if "Requests/sec" in l:
                    results["rps"] = float(l.split()[-1])
                if "Transfer/sec" in l:
                    tput = l.split()[-1]
                    if "MB" in tput:
                        tput = float(tput.replace("MB", ""))
                        results["throughput"] = tput * _mb
                    elif "KB" in tput:
                        tput = float(tput.replace("KB", ""))
                        results["throughput"] = tput * _kb
                    elif "GB" in tput:
                        tput = float(tput.replace("GB", ""))
                        results["throughput"] = tput * _gb
                if "#[Mean" in l:
                    results["lat_mean"] = parse_time(l.split()[2].rstrip(","))
                    results["lat_std"] = parse_time(l.split()[5].rstrip("]"))
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        # reduce: sum the rate-like metrics, average the rest
        for k, v in ag.items():
            if k in {
                "rps",
                "throughput",
            }:
                ag[k] = sum(ag[k])
            else:
                ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(raw_) for raw_ in raws]))
    print("run: results", r)

    r.update(run_config)

    # pair wise latency info
    lat = mean(ping.bipartite_lats(msvc_servers,
                                   [ex_ip_to_in_ip[i] for i in msvc_clients]))
    r.update({
        "avg_client_server_lat": lat,
    })

    return r
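
Since _nginx_config is defined elsewhere in the module, here is a hedged, runnable sketch of the {SERVER_ENTRY} substitution used above for client-side load balancing; the upstream block shape is an assumption, not the repo's actual template.

_nginx_config_sketch = """events {}
http {
  upstream backend {
    {SERVER_ENTRY}
  }
  server { listen 80; location / { proxy_pass http://backend; } }
}"""

in_ips = ["10.0.0.4", "10.0.0.5"]  # illustrative internal IPs
print(_nginx_config_sketch.replace(
    "{SERVER_ENTRY}",
    "\n    ".join(f"server {ip}:8080;" for ip in in_ips)))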
Example #7
def run(run_config: dict, wrks: dict) -> dict:
    """
    Run memcached benchmark with fixed configurations.

    Returns a dict of results parsed and aggregated from the run,
    as a map of k-v pairs.
    """
    def validate():
        for k in {
                "keySize", "valueSize", "serverThread", "clientThread",
                "runTime", "waitTime", "warmupTime"
        }:
            assert k in run_config, f"run: missing config entry '{k}', abort"

    validate()

    rid = run_config["run_id"]

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        mcd_servers = wrks[sit][:ns]
        mut_clients = wrks[cit][ns:ns + nc]
    else:
        mcd_servers = wrks[sit]
        mut_clients = wrks[cit]

    # install deps and clean up
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")
    print(
        rmt.clean_default_apps(mcd_servers + mut_clients,
                               extra_app=["memcached", "mutilate"],
                               docker_cont=["memcached"]))

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    # Step 1: start the memcached servers
    # get memcached server IPs (internal VPC IP); we are
    # not using a load balancer here, mutilate does client-side
    # load balancing already
    port = 11211
    server_ex_ips = mcd_servers
    server_in_ips = [ex_ip_to_in_ip[e] for e in server_ex_ips]
    client_ex_ips = mut_clients
    client_in_ips = [ex_ip_to_in_ip[i] for i in client_ex_ips]

    num_server_thread = run_config.get("serverThread", -1)
    if num_server_thread < 0:
        num_server_thread = aws_resource_map[
            run_config["serverInstanceType"]]["vCPUs"]
        run_config["serverThread"] = num_server_thread

    # demux server runner type, default run on bare metal
    runner_type = run_config.get("runner", "bare")
    if runner_type == "bare":
        cmd_ = f"memcached -t {num_server_thread} -c 32768 > /dev/null 2>&1 & "
        rmt.cmd_remote(mcd_servers, cmd_=cmd_)
    elif runner_type == "docker":
        # default tag: 1.4.33
        tag = run_config.get("tag", "1.4.33")

        # run the container
        cmd_ = f"sudo docker run --name memcached -d -p {port}:{port} memcached:{tag} " \
               f"memcached -t {num_server_thread} -c 32768 > /dev/null 2>&1 & "
        rmt.cmd_remote(mcd_servers, cmd_=cmd_)

        # wait a bit for the container to be ready
        time.sleep(5)
        print(f"run: docker image memcached:{tag}")
    else:
        raise Exception(f"run: unknown runner type {runner_type}")
    print(f"run: using {runner_type} runner type")
    print(
        f"run: memcached servers at internal IPs {server_in_ips}, public IPs {server_ex_ips} with {cmd_}"
    )

    # Step 2: start the mutilate agents
    master = mut_clients[0]
    agents = mut_clients[1:]

    if len(agents) >= 1:
        _cmd_agent = f"mutilate -T {run_config['clientThread']} " \
                     f"-K {run_config['keySize']} " \
                     f"-V {run_config['valueSize']} " \
                     f"-c 4 " \
                     f"-A > /dev/null 2>&1 & "
        print("run: agents", agents, _cmd_agent)
        rmt.cmd_remote(agents, cmd_=_cmd_agent)

    # Step 3: start the mutilate master runner
    # TODO: add input distribution knob
    def make_master_cmd():
        server_str = " ".join([f"-s {si}:{port}" for si in server_in_ips])
        agent_str = " ".join([f"-a {ex_ip_to_in_ip[ax]}" for ax in agents])
        option_str = f"-T {run_config['clientThread']} " \
                     f"-K {run_config['keySize']} " \
                     f"-V {run_config['valueSize']} " \
                     f"-t {run_config['runTime']} " \
                     f"-w {run_config['warmupTime']} " \
                     f"-c 1 " \
                     f"-W {run_config['waitTime']} --noload"
        return f"mutilate {server_str} --loadonly", \
               f"mutilate {server_str} {agent_str} {option_str}"

    _cmd_load, _cmd_run = make_master_cmd()

    print("run: master", master, _cmd_run)
    start = time.time()
    rmt.cmd_remote([master], cmd_=_cmd_load)

    raw = rmt.cmd_remote([master], cmd_=_cmd_run, out=True)[0].decode("utf-8")
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raw)

    # Step 4: upload logs
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raw,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    # Step 5: parse and aggregate results
    def parse(_raw) -> dict:
        _raw = _raw.split("\n")
        results = dict()
        for l in _raw:
            vs = l.split()
            if len(vs) < 1:
                continue
            v_type, v = vs[0], None
            if v_type == "read":
                v = {
                    "avg_lat_read": vs[1],
                    "std_lat_read": vs[2],
                    "min_lat_read": vs[3],
                    "99th_lat_read": vs[8],
                }
            elif v_type.startswith("Total"):
                v = {"qps": vs[3]}
            elif v_type.startswith("RX"):
                v = {"rx_goodput": vs[-2]}
            elif v_type.startswith("TX"):
                v = {"tx_goodput": vs[-2]}
            if v is not None:
                results.update(v)
        return results

    r = dict()
    r.update(parse(raw))
    print("run: results", r)

    r.update(run_config)

    # pair wise latency info
    lat = mean(ping.bipartite_lats(mcd_servers, client_in_ips))
    r.update({
        "avg_client_server_lat": lat,
    })

    # debugging info
    r.update({
        "debug_num_server": len(mcd_servers),
        "debug_num_client": len(mut_clients),
        "debug_num_agent": len(agents),
        "debug_client_ex_IPs": mut_clients,
        "debug_server_ex_IPs": mcd_servers,
        "debug_client_in_IPs": client_in_ips,
        "debug_server_in_IPs": server_in_ips,
    })
    return r
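
A runnable illustration of the mutilate output lines the parser above keys on; the sample text is synthetic, with the column order (avg, std, min, 5th, 10th, 90th, 95th, 99th) inferred from the index arithmetic in parse().

_sample = """#type       avg     std     min     5th    10th    90th    95th    99th
read       56.8    12.3    21.0    40.1    43.2    70.9    78.8    95.4
Total QPS = 104999.4 (3150285 / 30.0s)
RX   97017004 bytes :    3.1 MB/s
TX   22501552 bytes :    0.7 MB/s"""

def mini_parse(raw):  # condensed version of parse() above
    out = {}
    for line in raw.split("\n"):
        vs = line.split()
        if not vs:
            continue
        if vs[0] == "read":
            out.update({"avg_lat_read": vs[1], "std_lat_read": vs[2],
                        "min_lat_read": vs[3], "99th_lat_read": vs[8]})
        elif vs[0].startswith("Total"):
            out["qps"] = vs[3]
        elif vs[0].startswith("RX"):
            out["rx_goodput"] = vs[-2]
        elif vs[0].startswith("TX"):
            out["tx_goodput"] = vs[-2]
    return out

print(mini_parse(_sample))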
Example #8
def run(run_config: dict, wrks) -> dict:
    """
    Run inch to benchmark influxdb
    """
    # get workers
    rid = run_config["run_id"]
    print("run: assume the remote VM image contains all deps; "
          "nothing to install;")

    # get servers and clients
    sit = run_config["serverInstanceType"]
    cit = run_config["clientInstanceType"]
    if sit == cit:
        ns = run_config["numServerInstance"]
        nc = run_config["numClientInstance"]
        influx_servers = wrks[sit][:ns]
        influx_clients = wrks[cit][ns:ns + nc]
    else:
        influx_servers = wrks[sit]
        influx_clients = wrks[cit]

    nginx_dir = "/etc/nginx"
    config_file = f"{nginx_dir}/perfd-influxdb.conf"

    ex_ip_to_in_ip = k8s.get_worker_external_internal_ip_map()

    # Step 0: clear up
    print(
        rmt.clean_default_apps(influx_servers + influx_clients,
                               extra_app=["influxd", "inch"],
                               docker_cont=["influxd"]))
    print(
        rmt.cmd_remote(influx_clients,
                       cmd_=f"sudo chmod 777 {nginx_dir}; "
                       f"sudo rm -rf {config_file} >/dev/null 2>&1 || true",
                       out=True))

    # Step 1: run influxd servers, e.g., via
    #   docker run -p 8086:8086 -v $PWD:/var/lib/influxdb influxdb

    # demux run commands based on runner type
    runner_type = run_config.get("runner", "bare")
    if runner_type == "bare":
        cmd_ = f"sudo influxd > /dev/null 2>&1 &"
    elif runner_type == "docker":
        tag = run_config.get("tag", "1.7.10")
        cmd_ = f"sudo docker run --name influxd -d -p 8086:8086 " \
               f"-v $PWD:/var/lib/influxdb influxdb:{tag} > /dev/null 2>&1 &"
        print(f"run: use docker image influxdb:{tag}")
    else:
        raise Exception(f"run: unknown runner type {runner_type}")

    # start servers
    print(f"run: using {runner_type} runner type with command {cmd_}")
    print(rmt.cmd_remote(influx_servers, cmd_=cmd_, out=True))
    print(f"run: influxd servers at public IPs {influx_servers}")
    print("run: waiting for the server to be ready..")
    time.sleep(5)

    # Step 2: start the nginx for client side load balancing
    config_str = _nginx_config.replace(
        "{SERVER_ENTRY}", "\n".join(
            [f"server {ex_ip_to_in_ip[s]}:8086;" for s in influx_servers]))

    # write the config via a quoted heredoc so the body is taken verbatim
    rmt.cmd_remote(influx_clients,
                   cmd_=f"cat > {config_file} << 'EOF'\n{config_str}\nEOF")
    print(
        rmt.cmd_remote(influx_clients,
                       cmd_=f"sudo nginx -c {config_file}",
                       out=True))

    # Step 3: start the inch
    _cmd = f"for r in {{1..{run_config['numReq']}}}; do inch " \
           f"-host http://localhost:80 " \
           f"-p {run_config['numPointPerSeries']} & \n done; wait"

    print("run:", _cmd, f"at public IPs {influx_clients}")

    start = time.time()
    raws = list(
        map(lambda x: x.decode("utf-8"),
            rmt.cmd_remote(influx_clients, cmd_=_cmd, out=True)))
    # print("debug:", raws)
    print(f"run: finished in {time.time() - start}s")
    print("run results, sample:\n", raws[0])

    # print("debug:", raws)
    # Step 4: upload logs and post-proc
    s3.dump_and_upload_file(run_config,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "config"))
    s3.dump_and_upload_file(raws,
                            bucket=run_config["logBucket"],
                            key=s3.path_join(rid, "log"))

    def parse(raw_) -> dict:
        results = dict()
        lats = list()
        for l in raw_.split("\n"):
            if "Total time:" in l:
                lats.append(float(l.split()[2]))

        for k, v in stats(lats).items():
            results[f"query_latency_{k}"] = v
        return results

    def agg(rs: list) -> dict:
        ag = defaultdict(list)
        for _r in rs:
            for k, v in _r.items():
                # all values default to float
                ag[k].append(float(v))
        # no rate-like metrics to sum here; average every metric
        for k, v in ag.items():
            ag[k] = mean(ag[k])
        return ag

    r = dict()
    r.update(agg([parse(raw_) for raw_ in raws]))
    print("run: results", r)
    r.update(run_config)

    # pair wise latency info
    lat = mean(
        ping.bipartite_lats(influx_servers,
                            [ex_ip_to_in_ip[i] for i in influx_clients]))
    r.update({
        "avg_client_server_lat": lat,
    })

    return r
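
stats() above is defined elsewhere in the repo; as a hedged approximation, assuming it yields basic summary statistics, this sketch shows the shape of what parse() produces from the collected "Total time:" latencies.

from statistics import mean, median, pstdev

def stats_sketch(xs):  # assumed shape of the repo's stats() helper
    return {"min": min(xs), "max": max(xs), "mean": mean(xs),
            "median": median(xs), "std": pstdev(xs)}

lats = [12.5, 13.1, 11.9]  # seconds, as parsed from "Total time:" lines
print({f"query_latency_{k}": v for k, v in stats_sketch(lats).items()})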