Example 1
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self._updateArgs(self.args)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert (
                self.args.server_addr is not None
            ), "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == "/":
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.url_printer = PrintResultURL(self.args)
        self.file_handler = FileHandler(self.args)
        self.devices = Devices(self.args.devices_config)
        # Hard code scuba table
        self.scuba_dataset = "caffe2_benchmarking"
        self.info = None
        self.tempdir = ""
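Note: every constructor in these examples starts with argparse's parse_known_args, which splits argv into recognized options and a list of leftovers instead of raising on unknown flags. A minimal, self-contained sketch of that pattern (the parser and flags below are illustrative, not the ones aibench defines):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--logger_level", default="info")  # illustrative flag

# Unknown flags are collected in `unknowns` rather than causing an error,
# which lets a wrapper forward them to nested tools.
args, unknowns = parser.parse_known_args(
    ["--logger_level", "debug", "--extra", "1"])
print(args.logger_level)  # "debug"
print(unknowns)           # ["--extra", "1"]
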
Example 2
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        os.environ["CLAIMER"] = self.args.claimer_id
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert (
                self.args.server_addr is not None
            ), "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == "/":
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()

        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = Pool(max_workers=numProcesses, initializer=hookSignals)
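Aside: the while loop that strips trailing slashes from server_addr before appending "/benchmark/" can be expressed with str.rstrip; a one-line sketch with an illustrative address:

server_addr = "http://example.com/bench///"
# Same effect as the while loop: drop every trailing "/" first.
benchmark_db_entry = server_addr.rstrip("/") + "/benchmark/"
print(benchmark_db_entry)  # http://example.com/bench/benchmark/
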
Example 3
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        devices = self._getDevices()
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)

        self.devices = {}
        for k in devices:
            kind = k["kind"]
            hash = k["hash"]
            entry = {
                "kind": kind,
                "hash": hash,
                "available": True,
                "live": True,
                "start_time": None,
                "done_time": None,
                "output_dir": None,
                "job": None,
                "adb": ADB(hash, self.args.android_dir),
                "reboot_time":
                    datetime.datetime.now() - datetime.timedelta(hours=8),
            }
            if kind not in self.devices:
                self.devices[kind] = {}
            assert hash not in self.devices[kind], \
                "Device {} ({}) is attached twice.".format(kind, hash)
            self.devices[kind][hash] = entry

        dvs = [
            self.devices[k][h] for k in self.devices for h in self.devices[k]
        ]
        self.db.updateDevices(self.args.claimer_id, getDevicesString(dvs),
                              True)
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = multiprocessing.Pool(processes=numProcesses)
Example 4
class RunRemote(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.url_printer = PrintResultURL(self.args)
        self.file_handler = FileHandler(self.args)
        self.devices = Devices(self.args.devices_config)
        # Hard code scuba table
        self.scuba_dataset = "caffe2_benchmarking"
        self.info = None
        self.tempdir = ''

    def run(self):
        if self.args.list_devices:
            self._listDevices()
            return
        if self.args.list_job_queues:
            self._printJobQueues()
            return
        if self.args.fetch_status or self.args.fetch_result:
            result = self._fetchResult()
            return result
        if self.args.query_num_devices:
            return self._queryNumDevices(self.args.query_num_devices)

        assert self.args.benchmark_file, \
            "--benchmark_file (-b) must be specified"
        assert self.args.devices, "--devices must be specified"
        assert self.args.framework, "--framework must be specified"
        assert self.args.platform, "--platform must be specified"
        assert self.args.repo_dir, "--repo_dir must be specified"
        assert ((self.args.info is not None) and
            (self.args.custom_binary is None) and
            (self.args.pre_built_binary is None)) or (self.args.info is None), \
            "--info cannot co-exist with --custom_binary and --pre_built_binary"

        list_job_queues = self._listJobQueues()
        if not self.args.force_submit:
            self._checkDevices(self.args.devices, self.args.hashes)
            assert self.args.job_queue != "*" and \
                self.args.job_queue in list_job_queues, \
                "--job_queue must be choosen from " + " ".join(list_job_queues)

        self.tempdir = tempfile.mkdtemp()
        program_filenames = {}
        if self.args.info:
            self.info = json.loads(self.args.info)
        else:
            self.info = {"treatment": {"programs": {}}}
            if self.args.string_map:
                self.info["treatment"]["string_map"] = str(
                    self.args.string_map)

        assert (("treatment" in self.info) and
                ("programs" in self.info["treatment"])), \
            'In --info, the field "treatment" must exist, and ' \
            'info["treatment"] must contain a "programs" field (may be None)'

        binary = self.info["treatment"]["programs"]["program"]["location"] \
            if ("programs" in self.info["treatment"] and
                "program" in self.info["treatment"]["programs"]) \
            else self.args.custom_binary if self.args.custom_binary \
            else self.args.pre_built_binary
        t = BuildProgram(self.args, self.file_handler, self.tempdir,
                         program_filenames, binary)
        t.start()

        benchmarks = getBenchmarks(self.args.benchmark_file,
                                   self.args.framework)
        for benchmark in benchmarks:
            self._uploadOneBenchmark(benchmark)
            if self.args.debug:
                for test in benchmark["content"]["tests"]:
                    test["log_output"] = True
            if self.args.env:
                env = {}
                env_vars = self.args.env.split()
                for env_var in env_vars:
                    k, v = parse_kwarg(env_var)
                    env[k] = v
                for test in benchmark["content"]["tests"]:
                    cmd_env = {}
                    cmd_env.update(env)
                    if "env" in test:
                        cmd_env.update(test["env"])
                    test["env"] = cmd_env
        t.join()

        assert "program" in program_filenames, \
            "program does not exist. Build may be failed."

        for fn in program_filenames:
            self.info["treatment"]["programs"][fn] = {
                "location": program_filenames[fn]
            }

        # Pass meta file from build to benchmark
        meta = getMeta(self.args, self.args.platform)
        if meta:
            assert "meta" not in self.info, \
                "info field already has a meta field"
            self.info["meta"] = meta

        new_devices = self.devices.getFullNames(self.args.devices)
        user_identifier = int(self.args.user_identifier) \
            if self.args.user_identifier else randint(1, 1000000000000000)
        user = self.args.user_string if self.args.user_string else getuser()
        hashes = self.args.hashes
        for benchmark in benchmarks:
            data = {
                "benchmark": benchmark,
                "info": self.info,
            }
            self.db.submitBenchmarks(data, new_devices, user_identifier, user,
                                     hashes)
        if self.args.async_submit:
            return

        self.url_printer.printURL(self.scuba_dataset, user_identifier,
                                  benchmarks)

        if not self.args.debug:
            shutil.rmtree(self.tempdir, True)
        if self.args.screen_reporter:
            self._screenReporter(user_identifier)

    def _uploadOneBenchmark(self, benchmark):
        filename = benchmark["filename"]
        one_benchmark = benchmark["content"]
        # TODO refactor the code to collect all files to upload
        del_paths = []
        if "model" in one_benchmark:
            if "files" in one_benchmark["model"]:
                for field in one_benchmark["model"]["files"]:
                    value = one_benchmark["model"]["files"][field]
                    assert "location" in value, \
                        "location field is missing in benchmark " \
                        "{}".format(filename)
                    ref_path = ["files", field]
                    if self._uploadFile(value, filename, benchmark, ref_path):
                        del_paths.append(ref_path)
            if "libraries" in one_benchmark["model"]:
                for value in one_benchmark["model"]["libraries"]:
                    assert "location" in value, \
                        "location field is missing in benchmark " \
                        "{}".format(filename)
                    self._uploadFile(value, filename, benchmark)

        for del_path in del_paths:
            self._del_from_benchmark(benchmark["content"]["model"], del_path)

        # upload test file
        assert "tests" in one_benchmark, \
            "tests field is missing in benchmark {}".format(filename)
        tests = one_benchmark["tests"]
        for test in tests:
            if "input_files" in test:
                self._uploadTestFiles(test["input_files"], filename)
            # ignore the outputs for non accuracy metrics
            if "output_files" in test and test["metric"] == "error":
                self._uploadTestFiles(test["output_files"], filename)

    def _uploadTestFiles(self, files, basefilename):
        if isinstance(files, list):
            for f in files:
                self._uploadFile(f, basefilename)
        elif isinstance(files, dict):
            for value in files.values():
                if isinstance(value, list):
                    for v in value:
                        self._uploadFile(v, basefilename)
                else:
                    self._uploadFile(value, basefilename)

    def _uploadFile(self,
                    f,
                    basefilename,
                    benchmark=None,
                    ref_path=None,
                    cache_file=True):
        if "location" not in f:
            return
        location = f["location"]
        if "md5" not in f:
            raise Exception("No md5sum provided for {}".format(f["filename"]))
        md5 = f["md5"]
        """
        For the file from repo, there is special handling
        we need to fetch both control and treatment
        , and also move the file from benchmark to info
        Note: Support the file in model first
        """
        if location.startswith("//repo"):
            assert ref_path is not None, \
                "repo is not yet supported for {}".format(location)
            for side in self.info:
                if side == "extra":
                    continue
                value = self.info[side]
                commit_hash = "master"
                if "commit" in value:
                    commit_hash = value["commit"] or "master"
                tgt_file = self._downloadRepoFile(location, self.tempdir,
                                                  commit_hash)
                f["location"], f["md5"] = self.file_handler.uploadFile(
                    tgt_file, md5, basefilename, cache_file)
                # add to info
                assert len(ref_path), "ref_path must be a path to target file"
                value["programs"][".".join(ref_path)] = {
                    "location": f["location"]
                }
                # remove from benchmark
                assert benchmark is not None, \
                    "benchmark must be passed into _uploadFile"
            return True
        else:
            f["location"], f["md5"] = self.file_handler.uploadFile(
                location, md5, basefilename, cache_file)
            return False

    def _downloadRepoFile(self, location, tgt_dir, commit_hash):
        """
        location: //repo/fbsource/fbcode/aibench/...../a.py
        """
        raw_scm_query = pkg_resources.resource_string(
            "aibench", "benchmarking/bin/scm_query.par")
        query_exe = os.path.join(tgt_dir, "scm_query.par")
        with open(query_exe, "wb") as f:
            f.write(raw_scm_query)
        cmd = ['chmod', '+x', os.path.join(tgt_dir, "scm_query.par")]
        subprocess.check_output(cmd)
        dirs = location[2:].split("/")
        tgt_file = os.path.join(tgt_dir, dirs[-1])
        cmd = [
            query_exe, '--repo', dirs[1], '--file_path', '/'.join(dirs[2:]),
            '--target_file', tgt_file, '--commit_hash', commit_hash
        ]
        getLogger().info("Downloading {}".format(location))
        subprocess.check_output(cmd)
        os.remove(query_exe)
        return tgt_file

    def _del_from_benchmark(self, benchmark, ref_path):
        tgt = benchmark
        for item in ref_path[:-1]:
            tgt = tgt[item]
        tgt.pop(ref_path[-1])

    def _listDevices(self, flag=True):
        devices = self.db.listDevices(self.args.job_queue)
        headers = ["Device", "Status", "Abbrs", "Hash"]
        rows = []
        for device in devices:
            abbrs = self.devices.getAbbrs(device["device"])
            abbrs = ",".join(abbrs) if abbrs else ""
            hash = device["hash"]
            row = [device["device"], device["status"], abbrs, hash]
            rows.append(row)
        rows.sort()
        if flag:
            table = tabulate(rows, headers=headers, tablefmt='orgtbl')
            print("\n{}\n".format(table))
        return rows

    def _checkDevices(self, specified_devices, hashes=None):
        rows = self._listDevices(flag=False)
        specifiedDevices = set(specified_devices.split(","))
        specifiedHashes = None
        if hashes:
            hashes = hashes.split(",")
            devices = specified_devices.split(",")
            if len(hashes) != len(devices):
                raise Exception(
                    "You need to provide same number of hashes and devices")
            specifiedHashes = {}
            for i, hash in enumerate(hashes):
                specifiedHashes[hash] = devices[i]
        devices = {}
        devicesIn = True
        for row in rows:
            abbrs = row[-2].split(",") if row[-2] else []
            if row[-1] not in devices:
                devices[row[-1]] = {row[0]}.union(set(abbrs))
            else:
                # set.union returns a new set; update() mutates in place
                devices[row[-1]].update({row[0]}.union(set(abbrs)))
        if specifiedHashes:
            for specifiedHash in specifiedHashes:
                if specifiedHash not in devices or \
                        specifiedHashes[specifiedHash] not in devices[specifiedHash]:
                    devicesIn = False
        else:
            allDevices = set()
            for v in devices.values():
                allDevices = allDevices.union(v)
            devicesIn = not specifiedDevices.difference(allDevices)
        if not devicesIn:
            errMessages = " ".join([
                "Devices", specified_devices,
                "are not available in the job_queue", self.args.job_queue
            ])
            if hashes:
                errMessages = " ".join([
                    "Devices", specified_devices, "with hashes",
                    ",".join(hashes), "are not available in the job_queue",
                    self.args.job_queue
                ])
            raise Exception(errMessages)

    def _queryNumDevices(self, device_name):
        deviceCounter = defaultdict(int)
        for device in self.db.listDevices(self.args.job_queue):
            abbrs = self.devices.getAbbrs(device["device"])
            if device["device"] == device_name or device_name in (abbrs or []):
                deviceCounter[device["status"]] += 1

        return deviceCounter

    def _listJobQueues(self):
        devices = self.db.listDevices(job_queue="*")
        list_job_queues = sorted({device['job_queue'] for device in devices})
        return list_job_queues

    def _printJobQueues(self):
        list_job_queues = self._listJobQueues()
        for jobQueue in list_job_queues:
            print(jobQueue)

    def _screenReporter(self, user_identifier):
        reporter = ScreenReporter(self.db, self.devices, self.args.debug)
        reporter.run(user_identifier, self.args.urlPrefix)

    def _fetchResult(self):
        user_identifier = self.args.user_identifier
        assert user_identifier, "User identifier must be specified for " \
            "fetching the status and/or result of the previously run benchmarks"
        statuses = self.db.statusBenchmarks(user_identifier)
        result = None
        if self.args.fetch_status:
            result = json.dumps(statuses)
        elif self.args.fetch_result:
            ids = ",".join([str(status["id"]) for status in statuses])
            output = self.db.getBenchmarks(ids)
            self._mobilelabResult(output)
            result = json.dumps(output)
        print(result)
        return result

    def _mobilelabResult(self, output):
        # always get the last result
        for item in output:
            raw_result = item["result"]
            if raw_result is None:
                continue
            result = json.loads(raw_result)
            mobilelab_result = {"treatment": {}, "control": {}}
            for k in result:
                # k is identifier
                v = result[k]
                for kk in v:
                    vv = v[kk]
                    # update values if only summary exists
                    if "values" not in vv or len(vv["values"]) == 0:
                        if "summary" in vv:
                            if "mean" in vv["summary"]:
                                vv["values"] = [vv["summary"]["mean"]]
                            elif "p50" in vv["summary"]:
                                vv["values"] = [vv["summary"]["p50"]]
                        if "control_summary" in vv:
                            if "mean" in vv["control_summary"]:
                                vv["control_values"] = \
                                    [vv["control_summary"]["mean"]]
                            elif "p50" in vv["control_summary"]:
                                vv["control_values"] = \
                                    [vv["control_summary"]["p50"]]
                    # check values again
                    if "values" not in vv or len(vv["values"]) == 0:
                        continue
                    assert vv["type"], "type is missing in {}".format(kk)
                    assert vv["metric"], "metric is missing in {}".format(kk)
                    if vv["metric"] == "flops":
                        continue
                    unit = vv["unit"] if "unit" in vv else "null"
                    self._mobilelabAddField(mobilelab_result["treatment"], k,
                                            vv["type"], vv["metric"],
                                            vv["values"], unit)
                    if "control_values" in vv:
                        self._mobilelabAddField(mobilelab_result["control"], k,
                                                vv["type"], vv["metric"],
                                                vv["control_values"], unit)

            item["mobilelab_result"] = mobilelab_result

    def _mobilelabAddField(self, output, identifier, type, metric, values,
                           unit):
        key = "{}__{}__{}".format(identifier, type, metric)
        key = re.sub(r'\W+', '_', key)
        assert key not in output, \
            "duplicate key {}".format(key)
        output[key] = {
            "values": values,
            "metric": metric,
            "type": type,
            "unit": unit,
        }
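_uploadFile above raises if a file entry lacks an "md5" field. A caller can precompute the digest with the standard library; a minimal sketch (the path and entry below are illustrative):

import hashlib

def compute_md5(path, chunk_size=8192):
    # Stream the file so large models need not fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Illustrative file entry in the shape _uploadFile expects.
entry = {"location": "/tmp/model.pb", "md5": compute_md5("/tmp/model.pb")}
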
Example 5
class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        devices = self._getDevices()
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.devices = {}
        for k in devices:
            kind = k["kind"]
            hash = k["hash"]
            entry = {
                "kind": kind,
                "hash": hash,
                "available": True,
                "live": True,
                "start_time": None,
                "done_time": None,
                "output_dir": None,
                "job": None,
                "adb": ADB(hash, self.args.android_dir),
                "reboot_time":
                    datetime.datetime.now() - datetime.timedelta(hours=8),
            }
            if kind not in self.devices:
                self.devices[kind] = {}
            assert hash not in self.devices[kind], \
                "Device {} ({}) is attached twice.".format(kind, hash)
            self.devices[kind][hash] = entry

        dvs = [
            self.devices[k][h] for k in self.devices for h in self.devices[k]
        ]
        self.db.updateDevices(self.args.claimer_id, getDevicesString(dvs),
                              True)
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = multiprocessing.Pool(processes=numProcesses)

    def run(self):
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.db.updateDevices(self.args.claimer_id, "", True)

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get available devices
        devices = ",".join([
            k for k in self.devices
            if any(self.devices[k][hash]["available"] is True
                   for hash in self.devices[k])
        ])
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error("Retrieved job for device "
                                  "{} ".format(device_kind) +
                                  "cannot be run on server "
                                  "{}".format(self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # releasing unmatched jobs
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        # run the jobs in job queue
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)
        self._downloadFiles(jobs_queue)

        # run the benchmarks
        for job in jobs_queue:
            tempdir = tempfile.mkdtemp()
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][
                job["hash"]]["start_time"] = time.ctime()
            app = runAsync(self.args, self.devices, self.db, job, tempdir)
            """
            Python's multiprocessing need to pickle things to sling them
            in different processes. However, bounded methods are not pickable,
            so the way it's doing it here doesn't work.
            Thus, I added __call__ method in runAsync class and call the
            class here, since class object is pickable.
            Ref: https://stackoverflow.com/a/6975654
            """
            self.pool.apply_async(app, args=[raw_args], callback=app.callback)

    def _saveBenchmarks(self, jobs_queue):
        benchmark_files = []
        # save benchmarks to files
        for job in jobs_queue:
            benchmarks = job["benchmarks"]
            benchmark = benchmarks["benchmark"]
            content = benchmark["content"]
            benchmark_str = json.dumps(content)
            outfd, path = tempfile.mkstemp()
            with os.fdopen(outfd, "w") as f:
                f.write(benchmark_str)
            job["benchmarks"]["benchmark"]["content"] = path
            if content["tests"][0]["metric"] == "generic":
                job["framework"] = "generic"
            elif "model" in content and "framework" in content["model"]:
                job["framework"] = content["model"]["framework"]
            else:
                getLogger().error("Framework is not specified, "
                                  "use Caffe2 as default")
                job["framework"] = "caffe2"
            benchmark_files.append(path)
        return benchmark_files

    def _downloadBinaries(self, info_dict):
        programs = info_dict["programs"]
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)
            if program_location.startswith("//"):
                program_location = self.args.root_model_dir + \
                    program_location[1:]
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": '-',
                    "\\": '-',
                    ":": '/',
                }
                program_location = os.path.join(
                    self.args.root_model_dir,
                    getFilename(program_location,
                                replace_pattern=replace_pattern))
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location
            if self.args.platform.startswith("ios") and \
                    bin_name == "program" and \
                    not program_location.endswith(".ipa"):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location
            os.chmod(program_location,
                     stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR)
            programs[bin_name]["location"] = program_location

    def _downloadFiles(self, jobs_queue):
        benchmark_files = self._saveBenchmarks(jobs_queue)
        # download the models
        for bf in benchmark_files:
            self.benchmark_downloader.run(bf)
        # download the programs
        for job in jobs_queue:
            if "info" not in job["benchmarks"]:
                continue
            try:
                if "treatment" not in job["benchmarks"]["info"]:
                    getLogger().error("Field treatment "
                                      "must exist in job[\"benchmarks\"]")
                elif "programs" not in job["benchmarks"]["info"]["treatment"]:
                    getLogger().error(
                        "Field \"program\" must exist in "
                        "job[\"benchmarks\"][\"info\"][\"treatment\"]")
                else:
                    treatment_info = job["benchmarks"]["info"]["treatment"]
                    self._downloadBinaries(treatment_info)

                if "control" in job["benchmarks"]["info"]:
                    if "programs" not in job["benchmarks"]["info"]["control"]:
                        getLogger().error(
                            "Field \"programs\" must exist in "
                            "job[\"benchmarks\"][\"info\"][\"control\"]")
                    else:
                        control_info = job["benchmarks"]["info"]["control"]
                        self._downloadBinaries(control_info)
            except Exception:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error("File download failure")
        return benchmark_files

    def _getDevices(self):
        raw_args = []
        raw_args.extend(["--platform", self.args.platform])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.devices:
            raw_args.append("--devices")
            raw_args.append(self.args.devices)
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it.
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        app = GetConnectedDevices(raw_args=raw_args)
        devices_json = app.run()
        assert devices_json, "Devices cannot be empty"
        devices = json.loads(devices_json.strip())
        return devices

    def _getRawArgs(self, job, tempdir):
        if "info" in job["benchmarks"]:
            info = job["benchmarks"]["info"]
        elif "program" in job["benchmarks"]:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": job["benchmarks"]["program"],
                }
            }
        # pass the device hash as well as type
        device = {"kind": job["device"], "hash": job["hash"]}
        device_str = json.dumps(device)
        raw_args = []
        raw_args.extend([
            "--benchmark_file",
            job["benchmarks"]["benchmark"]["content"],
            "--cooldown",
            str(self.args.cooldown),
            "--device",
            device_str,
            "--framework",
            job["framework"],
            "--info",
            json.dumps(info),
            "--model_cache",
            self.args.model_cache,
            "--platform",
            self.args.platform,
            "--remote_access_token",
            self.args.remote_access_token,
            "--root_model_dir",
            self.args.root_model_dir,
            "--simple_local_reporter",
            tempdir,
            "--user_identifier",
            str(job["identifier"]),
        ])
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"])
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it.
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)

        return raw_args
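The comment in _runBenchmarks explains why the job runner is a class with a __call__ method: bound methods cannot be pickled, but instances of a module-level class can. A self-contained sketch of that workaround (RunAsyncDemo is a stand-in, not the real runAsync):

import multiprocessing

class RunAsyncDemo(object):
    """Stand-in for runAsync: instances pickle cleanly, bound methods do not."""

    def __init__(self, job_id):
        self.job_id = job_id

    def __call__(self, raw_args):
        # Executes in the worker process.
        return "job {} ran with {}".format(self.job_id, raw_args)

    def callback(self, result):
        # Executes in the parent process when the job finishes.
        print(result)

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)
    app = RunAsyncDemo(42)
    pool.apply_async(app, args=[["--demo"]], callback=app.callback)
    pool.close()
    pool.join()
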
Example 6
class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        os.environ["CLAIMER"] = self.args.claimer_id
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert (
                self.args.server_addr is not None
            ), "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == "/":
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()

        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = Pool(max_workers=numProcesses, initializer=hookSignals)

    def run(self):
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get available devices with their hashes
        devices = []
        hashes = []
        for k in self.devices:
            for hash in self.devices[k]:
                if self.devices[k][hash]["available"]:
                    devices.append(k)
                    hashes.append(hash)
        hashes = ",".join(hashes)
        devices = ",".join(devices)
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices, hashes)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error("Retrieved job for device "
                                  "{} ".format(device_kind) +
                                  "cannot be run on server "
                                  "{}".format(self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # releasing unmatched jobs
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        """Given a queue of jobs, update run statuses and device statuses in db, and spawn job processes."""
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)

        # run the benchmarks
        for job in jobs_queue:
            getLogger().info(
                f"Running job with identifier {job['identifier']} and id {job['id']}"
            )
            device = self.devices[job["device"]][job["hash"]]
            device["start_time"] = time.ctime()
            async_runner = runAsync(
                self.args,
                device,
                self.db,
                job,
                self.benchmark_downloader,
                self.device_manager.usb_controller,
            )

            # Watchdog will be used to kill currently running jobs
            # based on user requests
            app = WatchDog(async_runner, async_runner.didUserRequestJobKill,
                           async_runner.killJob)

            global RUNNING_JOBS
            RUNNING_JOBS += 1
            """
            Python's multiprocessing need to pickle things to sling them
            in different processes. However, bounded methods are not pickable,
            so the way it's doing it here doesn't work.
            Thus, I added __call__ method to the class we are passing into the
            apply_async method.
            Ref: https://stackoverflow.com/a/6975654
            """
            future = self.pool.submit(app)
            future.add_done_callback(self.callback)

    def callback(self, future_result_dict):
        """Decrement running jobs count, output job log, and start device cooldown."""
        global RUNNING_JOBS
        RUNNING_JOBS -= 1
        result = future_result_dict.result()
        job = result["job"]
        device = result["device"]
        device = self.devices[device["kind"]][device["hash"]]

        # output benchmark log in main thread.
        getLogger().info(
            "\n{}\n\nBenchmark:\t\t{}\nJob:\t\t\t{}\nDevice Kind:\t\t{}\nDevice Hash:\t\t{}\n{}\n\n{}"
            .format(
                "#" * 80,
                job["identifier"],
                job["id"],
                device["kind"],
                device["hash"],
                job["log"],
                "#" * 80,
            ))

        with LOCK:
            self._coolDown(device, force_reboot=job["status"] != "DONE")

    def _coolDown(self, device, force_reboot=False):
        t = CoolDownDevice(device, self.args, self.db, force_reboot, LOCK)
        t.start()
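This variant replaces multiprocessing.Pool with a concurrent.futures-style pool: submit returns a Future, and add_done_callback runs in the parent once the job completes. A minimal sketch of the same flow (assuming Pool here is concurrent.futures.ProcessPoolExecutor; run_job is illustrative):

from concurrent.futures import ProcessPoolExecutor

def run_job(job_id):
    # Executes in the worker process.
    return {"job": job_id, "status": "DONE"}

def callback(future):
    # Executes in the parent; result() re-raises any worker exception.
    print(future.result())

if __name__ == "__main__":
    pool = ProcessPoolExecutor(max_workers=2)
    future = pool.submit(run_job, 7)
    future.add_done_callback(callback)
    pool.shutdown(wait=True)
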
Example 7
class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()

        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = multiprocessing.Pool(processes=numProcesses)

    def run(self):
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.pool.close()
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get available devices with their hashes
        devices = []
        hashes = []
        for k in self.devices:
            for hash in self.devices[k]:
                if self.devices[k][hash]["available"]:
                    devices.append(k)
                    hashes.append(hash)
        hashes = ",".join(hashes)
        devices = ",".join(devices)
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices, hashes)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error("Retrieved job for device "
                                  "{} ".format(device_kind) +
                                  "cannot be run on server "
                                  "{}".format(self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # releasing unmatched jobs
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        # run the jobs in job queue
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)
        getLogger().info("Downloading files")
        self._downloadFiles(jobs_queue)

        # run the benchmarks
        for job in jobs_queue:
            identifier = job["identifier"]
            getLogger().info(
                "Running job with identifier {}".format(identifier))
            tempdir = tempfile.mkdtemp(
                prefix="_".join(["aibench", str(identifier), ""]))
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][
                job["hash"]]["start_time"] = time.ctime()
            async_runner = runAsync(self.args, self.devices, self.db, job,
                                    tempdir)

            # Watchdog will be used to kill currently running jobs
            # based on user requests
            app = WatchDog(async_runner, async_runner.didUserRequestJobKill,
                           async_runner.killJob)

            global RUNNING_JOBS
            RUNNING_JOBS += 1
            """
            Python's multiprocessing need to pickle things to sling them
            in different processes. However, bounded methods are not pickable,
            so the way it's doing it here doesn't work.
            Thus, I added __call__ method to the class we are passing into the
            apply_async method.
            Ref: https://stackoverflow.com/a/6975654
            """

            self.pool.apply_async(app,
                                  args=[raw_args],
                                  callback=app.main.callback)

    def _saveBenchmarks(self, job):
        # save benchmarks to files
        benchmarks = job["benchmarks"]
        benchmark = benchmarks["benchmark"]
        content = benchmark["content"]
        benchmark_str = json.dumps(content)
        outfd, path = tempfile.mkstemp(prefix="aibench")
        getLogger().info("Temp directory: {}".format(path))
        with os.fdopen(outfd, "w") as f:
            f.write(benchmark_str)
        job["benchmarks"]["benchmark"]["content"] = path
        if content["tests"][0]["metric"] == "generic":
            job["framework"] = "generic"
        elif "model" in content and "framework" in content["model"]:
            job["framework"] = content["model"]["framework"]
        else:
            getLogger().error("Framework is not specified, "
                              "use Caffe2 as default")
            job["framework"] = "caffe2"
        return path

    def _downloadBinaries(self, info_dict):
        programs = info_dict["programs"]
        program_locations = []
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)
            if program_location.startswith("//"):
                program_location = self.args.root_model_dir + \
                    program_location[1:]
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": '-',
                    "\\": '-',
                    ":": '/',
                }
                program_location = os.path.join(
                    self.args.root_model_dir,
                    getFilename(program_location,
                                replace_pattern=replace_pattern))
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location
            if self.args.platform.startswith("ios") and \
                    bin_name == "program" and \
                    not program_location.endswith(".ipa"):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location
            os.chmod(program_location,
                     stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR)
            programs[bin_name]["location"] = program_location
            program_locations.append(program_location)
        return program_locations

    def _downloadFiles(self, jobs_queue):
        for job in jobs_queue:
            job["models_location"] = []
            # added log capture for reporting
            log_capture_string = StringIO()
            ch = logging.StreamHandler(log_capture_string)
            ch.setLevel(logging.DEBUG)
            getLogger().addHandler(ch)
            # download the models
            try:
                getLogger().info("Downloading models")
                path = self._saveBenchmarks(job)
                location = self.benchmark_downloader.run(path)
                job["models_location"].extend(location)
            except Exception as e:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error(
                    "Error downloading models. Job id: {}".format(job["id"]))
                getLogger().error(e)
                job["download_error_log"] = log_capture_string.getvalue()
            getLogger().info("Downloading programs")
            # download the programs
            if "info" not in job["benchmarks"]:
                continue
            try:
                if "treatment" not in job["benchmarks"]["info"]:
                    getLogger().error("Field treatment "
                                      "must exist in job[\"benchmarks\"]")
                elif "programs" not in job["benchmarks"]["info"]["treatment"]:
                    getLogger().error(
                        "Field \"program\" must exist in "
                        "job[\"benchmarks\"][\"info\"][\"treatment\"]")
                else:
                    treatment_info = job["benchmarks"]["info"]["treatment"]
                    getLogger().info("Downloading treatment binary")
                    treatment_locations = self._downloadBinaries(
                        treatment_info)
                    job["programs_location"] = treatment_locations

                if "control" in job["benchmarks"]["info"]:
                    if "programs" not in job["benchmarks"]["info"]["control"]:
                        getLogger().error(
                            "Field \"programs\" must exist in "
                            "job[\"benchmarks\"][\"info\"][\"control\"]")
                    else:
                        control_info = job["benchmarks"]["info"]["control"]
                        getLogger().info("Downloading control binary")
                        control_locations = self._downloadBinaries(
                            control_info)
                        job["programs_location"].extend(control_locations)

            except Exception as e:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error(
                    "Error downloading programs. Job id: {}".format(job["id"]))
                getLogger().error(e)
                job["download_error_log"] = log_capture_string.getvalue()
            log_capture_string.close()
            getLogger().removeHandler(ch)
            gc.collect()

    def _getRawArgs(self, job, tempdir):
        if "info" in job["benchmarks"]:
            info = job["benchmarks"]["info"]
        elif "program" in job["benchmarks"]:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": job["benchmarks"]["program"],
                }
            }
        # pass the device hash as well as type
        device = {"kind": job["device"], "hash": job["hash"]}
        device_str = json.dumps(device)
        raw_args = []
        raw_args.extend([
            "--benchmark_file",
            job["benchmarks"]["benchmark"]["content"],
            "--cooldown",
            str(self.args.cooldown),
            "--device",
            device_str,
            "--framework",
            job["framework"],
            "--info",
            json.dumps(info),
            "--model_cache",
            self.args.model_cache,
            "--platform",
            self.args.platform,
            "--remote_access_token",
            self.args.remote_access_token,
            "--root_model_dir",
            self.args.root_model_dir,
            "--simple_local_reporter",
            tempdir,
            "--user_identifier",
            str(job["identifier"]),
            "--user_string",
            job.get("user"),
        ])
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"])
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it.
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        if self.args.device_name_mapping:
            raw_args.append("--device_name_mapping")
            raw_args.append(self.args.device_name_mapping)

        return raw_args
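_downloadFiles above captures download logs by attaching a StreamHandler backed by a StringIO, then saves the buffer on the job record on failure. A standalone sketch of that capture pattern:

import logging
from io import StringIO

logger = logging.getLogger("demo")
logger.setLevel(logging.DEBUG)

log_capture_string = StringIO()
ch = logging.StreamHandler(log_capture_string)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

logger.info("Downloading models")
logger.error("Error downloading models. Job id: 7")

captured = log_capture_string.getvalue()  # both records, newline-separated
logger.removeHandler(ch)  # detach the capture handler before closing
log_capture_string.close()
print(captured)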