def __init__(self, raw_args=None):
    """Parse the arguments and build the helpers used for remote submission.

    Args:
        raw_args: optional list of argument strings forwarded to
            ``parser.parse_known_args``.

    Raises:
        AssertionError: if ``benchmark_db_entry`` is not given and
            ``server_addr`` is missing or contains nothing but slashes.
    """
    self.args, self.unknowns = parser.parse_known_args(raw_args)
    self._updateArgs(self.args)
    setLoggerLevel(self.args.logger_level)
    if not self.args.benchmark_db_entry:
        # rstrip() replaces the former ``while addr[-1] == "/"`` loop, which
        # raised IndexError once the address was (or became) empty.
        server_addr = (self.args.server_addr or "").rstrip("/")
        assert (
            server_addr
        ), "Either server_addr or benchmark_db_entry must be specified"
        self.args.server_addr = server_addr
        self.args.benchmark_db_entry = server_addr + "/benchmark/"
    self.db = DBDriver(
        self.args.benchmark_db,
        self.args.app_id,
        self.args.token,
        self.args.benchmark_table,
        self.args.job_queue,
        self.args.test,
        self.args.benchmark_db_entry,
    )
    self.url_printer = PrintResultURL(self.args)
    self.file_handler = FileHandler(self.args)
    self.devices = Devices(self.args.devices_config)
    # Hard code scuba table
    self.scuba_dataset = "caffe2_benchmarking"
    self.info = None
    # The runner reads and writes ``self.tempdir``; initialise it here.
    self.tempdir = ""
    # Misspelled legacy attribute, kept so existing readers do not break.
    self.temprdir = ""
def __init__(self, raw_args=None):
    """Parse the arguments and set up the downloader, DB driver, devices and pool.

    Args:
        raw_args: optional list of argument strings forwarded to
            ``parser.parse_known_args``.

    Raises:
        AssertionError: if ``benchmark_db_entry`` is not given and
            ``server_addr`` is missing or contains nothing but slashes.
    """
    self.args, self.unknowns = parser.parse_known_args(raw_args)
    os.environ["CLAIMER"] = self.args.claimer_id
    self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
    self.adb = ADB(None, self.args.android_dir)
    setLoggerLevel(self.args.logger_level)
    if not self.args.benchmark_db_entry:
        # rstrip() replaces the former ``while addr[-1] == "/"`` loop, which
        # raised IndexError once the address was (or became) empty.
        server_addr = (self.args.server_addr or "").rstrip("/")
        assert (
            server_addr
        ), "Either server_addr or benchmark_db_entry must be specified"
        self.args.server_addr = server_addr
        self.args.benchmark_db_entry = server_addr + "/benchmark/"
    self.db = DBDriver(
        self.args.benchmark_db,
        self.args.app_id,
        self.args.token,
        self.args.benchmark_table,
        self.args.job_queue,
        self.args.test,
        self.args.benchmark_db_entry,
    )
    self.device_manager = DeviceManager(self.args, self.db)
    self.devices = self.device_manager.getLabDevices()
    if self.args.platform.startswith("host"):
        numProcesses = 2
    else:
        # Leave one core free, but never ask for zero workers: on a
        # single-core machine cpu_count() - 1 is 0, which the pool rejects.
        numProcesses = max(1, multiprocessing.cpu_count() - 1)
    # NOTE(review): Pool is presumably a concurrent.futures executor given the
    # max_workers/initializer keywords - confirm against the file's imports.
    self.pool = Pool(max_workers=numProcesses, initializer=hookSignals)
def __init__(self, raw_args=None):
    """Parse the arguments, register the attached devices and create the pool.

    Args:
        raw_args: optional list of argument strings forwarded to
            ``parser.parse_known_args``.

    Raises:
        AssertionError: if ``benchmark_db_entry`` is not given and
            ``server_addr`` is missing or contains nothing but slashes, or
            if the same device hash is reported twice for one kind.
    """
    self.args, self.unknowns = parser.parse_known_args(raw_args)
    self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
    self.adb = ADB(None, self.args.android_dir)
    devices = self._getDevices()
    setLoggerLevel(self.args.logger_level)
    if not self.args.benchmark_db_entry:
        # rstrip() replaces the former ``while addr[-1] == '/'`` loop, which
        # raised IndexError once the address was (or became) empty.
        server_addr = (self.args.server_addr or "").rstrip("/")
        assert server_addr, \
            "Either server_addr or benchmark_db_entry must be specified"
        self.args.server_addr = server_addr
        self.args.benchmark_db_entry = server_addr + "/benchmark/"
    self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                       self.args.token, self.args.benchmark_table,
                       self.args.job_queue, self.args.test,
                       self.args.benchmark_db_entry)
    # self.devices maps device kind -> device hash -> bookkeeping entry.
    self.devices = {}
    for k in devices:
        kind = k["kind"]
        device_hash = k["hash"]
        entry = {
            "kind": kind,
            "hash": device_hash,
            "available": True,
            "live": True,
            "start_time": None,
            "done_time": None,
            "output_dir": None,
            "job": None,
            "adb": ADB(device_hash, self.args.android_dir),
            # Back-dated by eight hours relative to start-up.
            "reboot_time": datetime.datetime.now() -
            datetime.timedelta(hours=8)
        }
        if kind not in self.devices:
            self.devices[kind] = {}
        assert device_hash not in self.devices[kind], \
            "Device {} ({}) is attached twice.".format(kind, device_hash)
        self.devices[kind][device_hash] = entry

    dvs = [
        self.devices[k][h] for k in self.devices for h in self.devices[k]
    ]
    self.db.updateDevices(self.args.claimer_id, getDevicesString(dvs), True)
    if self.args.platform.startswith("host"):
        numProcesses = 2
    else:
        # Leave one core free, but never ask for zero workers: on a
        # single-core machine cpu_count() - 1 is 0 and Pool raises ValueError.
        numProcesses = max(1, multiprocessing.cpu_count() - 1)
    self.pool = multiprocessing.Pool(processes=numProcesses)
class RunRemote(object):
    """Submit benchmarks to the remote lab and query devices, queues and results."""

    def __init__(self, raw_args=None):
        """Parse the arguments and build the helpers used for remote submission.

        Args:
            raw_args: optional list of argument strings forwarded to
                ``parser.parse_known_args``.

        Raises:
            AssertionError: if ``benchmark_db_entry`` is not given and
                ``server_addr`` is missing or contains nothing but slashes.
        """
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            # rstrip() replaces the former ``while addr[-1] == '/'`` loop,
            # which raised IndexError once the address was (or became) empty.
            server_addr = (self.args.server_addr or "").rstrip("/")
            assert server_addr, \
                "Either server_addr or benchmark_db_entry must be specified"
            self.args.server_addr = server_addr
            self.args.benchmark_db_entry = server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.url_printer = PrintResultURL(self.args)
        self.file_handler = FileHandler(self.args)
        self.devices = Devices(self.args.devices_config)
        # Hard code scuba table
        self.scuba_dataset = "caffe2_benchmarking"
        self.info = None
        # run() and _uploadFile() use ``self.tempdir``; initialise it here.
        self.tempdir = ''
        # Misspelled legacy attribute, kept so existing readers do not break.
        self.temprdir = ''

    def run(self):
        """Dispatch on the command-line flags; by default build and submit benchmarks.

        Returns:
            The value of ``_fetchResult`` or ``_queryNumDevices`` for the
            corresponding query flags, otherwise ``None``.

        Raises:
            AssertionError: if a required submission argument is missing or
                inconsistent, or if the build produced no ``program``.
        """
        if self.args.list_devices:
            self._listDevices()
            return
        if self.args.list_job_queues:
            self._printJobQueues()
            return
        if self.args.fetch_status or self.args.fetch_result:
            result = self._fetchResult()
            return result
        if self.args.query_num_devices:
            return self._queryNumDevices(self.args.query_num_devices)

        assert self.args.benchmark_file, \
            "--benchmark_file (-b) must be specified"
        assert self.args.devices, "--devices must be specified"
        assert self.args.framework, "--framework must be specified"
        assert self.args.platform, "--platform must be specified"
        assert self.args.repo_dir, "--repo_dir must be specified"
        assert ((self.args.info is not None) and
                (self.args.custom_binary is None) and
                (self.args.pre_built_binary is None)) or \
            (self.args.info is None), \
            "--info cannot co-exist with --custom_binary and --pre_built_binary"

        list_job_queues = self._listJobQueues()
        if not self.args.force_submit:
            self._checkDevices(self.args.devices, self.args.hashes)
            assert self.args.job_queue != "*" and \
                self.args.job_queue in list_job_queues, \
                "--job_queue must be choosen from " + " ".join(list_job_queues)

        self.tempdir = tempfile.mkdtemp()
        program_filenames = {}
        if self.args.info:
            self.info = json.loads(self.args.info)
        else:
            self.info = {"treatment": {"programs": {}}}
            if self.args.string_map:
                self.info["treatment"]["string_map"] = str(
                    self.args.string_map)

        assert (("treatment" in self.info) and
                ("programs" in self.info["treatment"])), \
            'In --info, field treatment must exist. In info["treatment"] ' \
            "program field must exist (may be None)"

        # Precedence: location from --info, then --custom_binary, then
        # --pre_built_binary.
        binary = self.info["treatment"]["programs"]["program"]["location"] \
            if ("programs" in self.info["treatment"] and
                "program" in self.info["treatment"]["programs"]) \
            else self.args.custom_binary if self.args.custom_binary \
            else self.args.pre_built_binary
        # BuildProgram fills program_filenames; it is started here and
        # joined after the benchmark files have been uploaded.
        t = BuildProgram(self.args, self.file_handler, self.tempdir,
                         program_filenames, binary)
        t.start()

        benchmarks = getBenchmarks(self.args.benchmark_file,
                                   self.args.framework)
        for benchmark in benchmarks:
            self._uploadOneBenchmark(benchmark)
            if self.args.debug:
                for test in benchmark["content"]["tests"]:
                    test["log_output"] = True
            if self.args.env:
                env = {}
                env_vars = self.args.env.split()
                for env_var in env_vars:
                    k, v = parse_kwarg(env_var)
                    env[k] = v
                for test in benchmark["content"]["tests"]:
                    # Command-line values first, so per-test values win.
                    cmd_env = {}
                    cmd_env.update(env)
                    if "env" in test:
                        cmd_env.update(test["env"])
                    test["env"] = cmd_env
        t.join()

        assert "program" in program_filenames, \
            "program does not exist. Build may be failed."

        for fn in program_filenames:
            self.info["treatment"]["programs"][fn] = {
                "location": program_filenames[fn]
            }

        # Pass meta file from build to benchmark
        meta = getMeta(self.args, self.args.platform)
        if meta:
            assert "meta" not in self.info, \
                "info field already has a meta field"
            self.info["meta"] = meta

        new_devices = self.devices.getFullNames(self.args.devices)
        user_identifier = int(self.args.user_identifier) \
            if self.args.user_identifier else randint(1, 1000000000000000)
        user = getuser() if not self.args.user_string \
            else self.args.user_string
        hashes = self.args.hashes
        for benchmark in benchmarks:
            data = {
                "benchmark": benchmark,
                "info": self.info,
            }
            self.db.submitBenchmarks(data, new_devices, user_identifier, user,
                                     hashes)
        if self.args.async_submit:
            return

        self.url_printer.printURL(self.scuba_dataset, user_identifier,
                                  benchmarks)

        if not self.args.debug:
            shutil.rmtree(self.tempdir, True)
        if self.args.screen_reporter:
            self._screenReporter(user_identifier)

    def _uploadOneBenchmark(self, benchmark):
        """Upload the model files, libraries and test files of one benchmark.

        Model files for which ``_uploadFile`` returns True are removed from
        the benchmark's ``model`` section afterwards.

        Raises:
            AssertionError: if a model entry lacks ``location`` or the
                benchmark has no ``tests`` field.
        """
        filename = benchmark["filename"]
        one_benchmark = benchmark["content"]

        # TODO refactor the code to collect all files to upload
        del_paths = []
        if "model" in one_benchmark:
            if "files" in one_benchmark["model"]:
                for field in one_benchmark["model"]["files"]:
                    value = one_benchmark["model"]["files"][field]
                    assert "location" in value, \
                        "location field is missing in benchmark " \
                        "{}".format(filename)
                    ref_path = ["files", field]
                    if self._uploadFile(value, filename, benchmark, ref_path):
                        del_paths.append(ref_path)
            if "libraries" in one_benchmark["model"]:
                for value in one_benchmark["model"]["libraries"]:
                    assert "location" in value, \
                        "location field is missing in benchmark " \
                        "{}".format(filename)
                    self._uploadFile(value, filename, benchmark)

        # Deleted after the loops so the dict is not mutated while iterated.
        for del_path in del_paths:
            self._del_from_benchmark(benchmark["content"]["model"], del_path)

        # upload test file
        assert "tests" in one_benchmark, \
            "tests field is missing in benchmark {}".format(filename)
        tests = one_benchmark["tests"]
        for test in tests:
            if "input_files" in test:
                self._uploadTestFiles(test["input_files"], filename)
            # ignore the outputs for non accuracy metrics
            if "output_files" in test and test["metric"] == "error":
                self._uploadTestFiles(test["output_files"], filename)

    def _uploadTestFiles(self, files, basefilename):
        """Upload every entry of ``files``.

        ``files`` may be a list of entries, or a dict whose values are
        entries or lists of entries. Other types are ignored.
        """
        if isinstance(files, list):
            for f in files:
                self._uploadFile(f, basefilename)
        elif isinstance(files, dict):
            for value in files.values():
                if isinstance(value, list):
                    for v in value:
                        self._uploadFile(v, basefilename)
                else:
                    self._uploadFile(value, basefilename)

    def _uploadFile(self, f, basefilename, benchmark=None, ref_path=None,
                    cache_file=True):
        """Upload one file entry and rewrite its ``location``/``md5`` in place.

        Args:
            f: file entry dict; entries without ``location`` are skipped.
            basefilename: benchmark file name, passed to the file handler.
            benchmark: owning benchmark; required for ``//repo`` locations.
            ref_path: key path of the entry inside the benchmark's model;
                required for ``//repo`` locations.
            cache_file: forwarded to ``file_handler.uploadFile``.

        Returns:
            ``None`` if the entry has no location, ``True`` if the file came
            from ``//repo`` and was recorded in ``self.info``, else ``False``.

        Raises:
            Exception: if the entry has no ``md5``.
            AssertionError: if a ``//repo`` entry lacks ``ref_path`` or
                ``benchmark``.
        """
        if "location" not in f:
            return
        location = f["location"]
        if "md5" not in f:
            # Fall back to the location so that an entry lacking "filename"
            # still reports the md5 problem instead of raising KeyError.
            raise Exception("No md5sum provided for {}".format(
                f.get("filename", location)))
        md5 = f["md5"]
        # For the file from repo, there is special handling: we need to fetch
        # both control and treatment, and also move the file from benchmark
        # to info.
        # Note: Support the file in model first
        if location.startswith("//repo"):
            assert ref_path is not None, \
                "repo is not yet supported for {}".format(location)
            for side in self.info:
                if side == "extra":
                    continue
                value = self.info[side]
                commit_hash = "master"
                if "commit" in value:
                    commit_hash = value["commit"] or "master"
                tgt_file = self._downloadRepoFile(location, self.tempdir,
                                                  commit_hash)
                f["location"], f["md5"] = self.file_handler.uploadFile(
                    tgt_file, md5, basefilename, cache_file)
                # add to info
                assert len(ref_path), "ref_path must be a path to target file"
                value["programs"][".".join(ref_path)] = {
                    "location": f["location"]
                }
            # remove from benchmark
            assert benchmark is not None, \
                "benchmark must be passed into _uploadFile"
            return True
        else:
            f["location"], f["md5"] = self.file_handler.uploadFile(
                location, md5, basefilename, cache_file)
            return False

    def _downloadRepoFile(self, location, tgt_dir, commit_hash):
        """Fetch a repository file at ``commit_hash`` into ``tgt_dir``.

        Args:
            location: //repo/fbsource/fbcode/aibench/...../a.py
            tgt_dir: directory receiving the helper binary and the file.
            commit_hash: revision passed to the query helper.

        Returns:
            Path of the downloaded file inside ``tgt_dir``.
        """
        raw_scm_query = pkg_resources.resource_string(
            "aibench", "benchmarking/bin/scm_query.par")
        query_exe = os.path.join(tgt_dir, "scm_query.par")
        with open(query_exe, "wb") as f:
            f.write(raw_scm_query)
        cmd = ['chmod', '+x', os.path.join(tgt_dir, "scm_query.par")]
        subprocess.check_output(cmd)
        # "//repo/<name>/<path...>" -> ["repo", "<name>", "<path>", ...]
        dirs = location[2:].split("/")
        tgt_file = os.path.join(tgt_dir, dirs[-1])
        cmd = [
            query_exe, '--repo', dirs[1], '--file_path', '/'.join(dirs[2:]),
            '--target_file', tgt_file, '--commit_hash', commit_hash
        ]
        getLogger().info("Downloading {}".format(location))
        subprocess.check_output(cmd)
        os.remove(query_exe)
        return tgt_file

    def _del_from_benchmark(self, benchmark, ref_path):
        """Remove the entry addressed by the key path ``ref_path`` from ``benchmark``."""
        tgt = benchmark
        for item in ref_path[:-1]:
            tgt = tgt[item]
        tgt.pop(ref_path[-1])

    def _listDevices(self, flag=True):
        """Return sorted ``[device, status, abbrs, hash]`` rows for the job queue.

        Args:
            flag: when true, also print the rows as a table.
        """
        devices = self.db.listDevices(self.args.job_queue)
        headers = ["Device", "Status", "Abbrs", "Hash"]
        rows = []
        for device in devices:
            abbrs = self.devices.getAbbrs(device["device"])
            abbrs = ",".join(abbrs) if abbrs else ""
            rows.append(
                [device["device"], device["status"], abbrs, device["hash"]])
        rows.sort()
        if flag:
            table = tabulate(rows, headers=headers, tablefmt='orgtbl')
            print("\n{}\n".format(table))
        return rows

    def _checkDevices(self, specified_devices, hashes=None):
        """Verify that the requested devices are listed in the job queue.

        Args:
            specified_devices: comma-separated device names or abbreviations.
            hashes: optional comma-separated hashes, one per device.

        Raises:
            Exception: if the number of hashes and devices differ, or a
                requested device (or hash/device pair) is not listed.
        """
        rows = self._listDevices(flag=False)
        specifiedDevices = set(specified_devices.split(","))
        specifiedHashes = None
        if hashes:
            hashes = hashes.split(",")
            devices = specified_devices.split(",")
            if len(hashes) != len(devices):
                raise Exception(
                    "You need to provide same number of hashes and devices")
            specifiedHashes = dict(zip(hashes, devices))

        # hash -> every name (full name and abbreviations) listed for it
        devices = {}
        devicesIn = True
        for row in rows:
            abbrs = row[-2].split(",") if row[-2] else []
            # Update in place: the previous code called set.union() and
            # discarded the result, so later rows sharing a hash were lost.
            names = devices.setdefault(row[-1], set())
            names.add(row[0])
            names.update(abbrs)

        if specifiedHashes:
            for specifiedHash in specifiedHashes:
                if specifiedHash not in devices or \
                        specifiedHashes[specifiedHash] not in \
                        devices[specifiedHash]:
                    devicesIn = False
        else:
            allDevices = set()
            for v in devices.values():
                allDevices = allDevices.union(v)
            devicesIn = not specifiedDevices.difference(allDevices)

        if not devicesIn:
            errMessages = " ".join([
                "Devices", specified_devices,
                "is not available in the job_queue", self.args.job_queue
            ])
            if hashes:
                errMessages = " ".join([
                    "Devices", specified_devices, "with hashes",
                    ",".join(hashes), "is not available in the job_queue",
                    self.args.job_queue
                ])
            raise Exception(errMessages)

    def _queryNumDevices(self, device_name):
        """Count, per status, the queue's devices matching a name or abbreviation."""
        deviceCounter = defaultdict(int)
        for device in self.db.listDevices(self.args.job_queue):
            abbrs = self.devices.getAbbrs(device["device"])
            if device["device"] == device_name or device_name in (abbrs or []):
                deviceCounter[device["status"]] += 1
        return deviceCounter

    def _listJobQueues(self):
        """Return the sorted, de-duplicated job queue names of all devices."""
        devices = self.db.listDevices(job_queue="*")
        list_job_queues = sorted({device['job_queue'] for device in devices})
        return list_job_queues

    def _printJobQueues(self):
        """Print every job queue name on its own line."""
        list_job_queues = self._listJobQueues()
        for jobQueue in list_job_queues:
            print(jobQueue)

    def _screenReporter(self, user_identifier):
        """Run a ScreenReporter for the given submission."""
        reporter = ScreenReporter(self.db, self.devices, self.args.debug)
        reporter.run(user_identifier, self.args.urlPrefix)

    def _fetchResult(self):
        """Print and return the status or result JSON of a previous submission.

        Returns:
            A JSON string, or ``None`` if neither fetch flag is set.

        Raises:
            AssertionError: if no user identifier was given.
        """
        user_identifier = self.args.user_identifier
        assert user_identifier, "User identifier must be specified for " \
            "fetching the status and/or result of the previously run benchmarks"
        statuses = self.db.statusBenchmarks(user_identifier)
        result = None
        if self.args.fetch_status:
            result = json.dumps(statuses)
        elif self.args.fetch_result:
            ids = ",".join([str(status["id"]) for status in statuses])
            output = self.db.getBenchmarks(ids)
            self._mobilelabResult(output)
            result = json.dumps(output)
        print(result)
        return result

    def _mobilelabResult(self, output):
        """Attach a flattened ``mobilelab_result`` dict to every item with a result.

        Items whose ``result`` is ``None`` are left untouched. Entries that
        have only a summary get ``values`` derived from its ``mean`` (or
        ``p50``); entries still without values, and ``flops`` metrics, are
        skipped.
        """
        # always get the last result
        for item in output:
            raw_result = item["result"]
            if raw_result is None:
                continue
            result = json.loads(raw_result)
            mobilelab_result = {"treatment": {}, "control": {}}
            for k in result:
                # k is identifier
                v = result[k]
                for kk in v:
                    vv = v[kk]
                    # update values if only summary exists
                    if "values" not in vv or len(vv["values"]) == 0:
                        if "summary" in vv:
                            if "mean" in vv["summary"]:
                                vv["values"] = [vv["summary"]["mean"]]
                            elif "p50" in vv["summary"]:
                                vv["values"] = [vv["summary"]["p50"]]
                        if "control_summary" in vv:
                            if "mean" in vv["control_summary"]:
                                vv["control_values"] = \
                                    [vv["control_summary"]["mean"]]
                            elif "p50" in vv["control_summary"]:
                                vv["control_values"] = \
                                    [vv["control_summary"]["p50"]]
                    # check values again
                    if "values" not in vv or len(vv["values"]) == 0:
                        continue
                    assert vv["type"], "type is missing in {}".format(kk)
                    assert vv["metric"], "metric is missing in {}".format(kk)
                    if vv["metric"] == "flops":
                        continue
                    unit = vv["unit"] if "unit" in vv else "null"
                    self._mobilelabAddField(mobilelab_result["treatment"], k,
                                            vv["type"], vv["metric"],
                                            vv["values"], unit)
                    if "control_values" in vv:
                        self._mobilelabAddField(mobilelab_result["control"],
                                                k, vv["type"], vv["metric"],
                                                vv["control_values"], unit)
            item["mobilelab_result"] = mobilelab_result

    def _mobilelabAddField(self, output, identifier, type, metric, values,
                           unit):
        """Store one metric in ``output`` under a sanitised key.

        The key is ``identifier__type__metric`` with each run of non-word
        characters replaced by a single underscore.

        Raises:
            AssertionError: if the key is already present.
        """
        key = "{}__{}__{}".format(identifier, type, metric)
        # Raw string: '\W' in a normal literal is an invalid escape sequence.
        key = re.sub(r'\W+', '_', key)
        assert key not in output, \
            "duplicate key {}".format(key)
        output[key] = {
            "values": values,
            "metric": metric,
            "type": type,
            "unit": unit,
        }
class RunLab(object):
    """Claim benchmark jobs from the database and run them on attached devices."""

    def __init__(self, raw_args=None):
        """Parse the arguments, register the attached devices and create the pool.

        Args:
            raw_args: optional list of argument strings forwarded to
                ``parser.parse_known_args``.

        Raises:
            AssertionError: if ``benchmark_db_entry`` is not given and
                ``server_addr`` is missing or contains nothing but slashes,
                or if the same device hash is reported twice for one kind.
        """
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        devices = self._getDevices()
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            # rstrip() replaces the former ``while addr[-1] == '/'`` loop,
            # which raised IndexError once the address was (or became) empty.
            server_addr = (self.args.server_addr or "").rstrip("/")
            assert server_addr, \
                "Either server_addr or benchmark_db_entry must be specified"
            self.args.server_addr = server_addr
            self.args.benchmark_db_entry = server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        # self.devices maps device kind -> device hash -> bookkeeping entry.
        self.devices = {}
        for k in devices:
            kind = k["kind"]
            device_hash = k["hash"]
            entry = {
                "kind": kind,
                "hash": device_hash,
                "available": True,
                "live": True,
                "start_time": None,
                "done_time": None,
                "output_dir": None,
                "job": None,
                "adb": ADB(device_hash, self.args.android_dir),
                # Back-dated by eight hours relative to start-up.
                "reboot_time": datetime.datetime.now() -
                datetime.timedelta(hours=8)
            }
            if kind not in self.devices:
                self.devices[kind] = {}
            assert device_hash not in self.devices[kind], \
                "Device {} ({}) is attached twice.".format(kind, device_hash)
            self.devices[kind][device_hash] = entry

        dvs = [
            self.devices[k][h] for k in self.devices for h in self.devices[k]
        ]
        self.db.updateDevices(self.args.claimer_id, getDevicesString(dvs),
                              True)
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            # Leave one core free, but never ask for zero workers: on a
            # single-core machine cpu_count() - 1 is 0 and Pool raises
            # ValueError.
            numProcesses = max(1, multiprocessing.cpu_count() - 1)
        self.pool = multiprocessing.Pool(processes=numProcesses)

    def run(self):
        """Poll for jobs once per second until ``stopRun`` says to stop."""
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        # Report an empty device list for this claimer on the way out.
        self.db.updateDevices(self.args.claimer_id, "", True)

    def _runOnce(self):
        """Claim jobs, release the ones that cannot run here, start the rest."""
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        """Claim jobs for every device kind that has an available device.

        Returns:
            The jobs returned by the database, or ``[]`` if no device kind
            is available.
        """
        claimer_id = self.args.claimer_id
        # get available devices
        devices = ",".join([
            k for k in self.devices
            if any(self.devices[k][h]["available"] is True
                   for h in self.devices[k])
        ])
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices)
        return jobs

    def _selectBenchmarks(self, jobs):
        """Pair each job with a free device of the requested kind.

        A matched job gets its ``hash`` set and the device is marked
        unavailable.

        Returns:
            ``(jobs_queue, remaining_jobs)``: the matched jobs, and the jobs
            whose device kind is unknown to this server.
        """
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error("Retrieved job for device "
                                  "{} ".format(device_kind) +
                                  "cannot be run on server "
                                  "{}".format(self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for device_hash in self.devices[device_kind]:
                    device = self.devices[device_kind][device_hash]
                    if device["available"] is True:
                        job["hash"] = device_hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        """Hand the unmatched jobs back to the database."""
        # releasing unmatched jobs
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        """Record the jobs and devices in the DB, download files, start the jobs."""
        # run the jobs in job queue
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)
        self._downloadFiles(jobs_queue)
        # run the benchmarks
        for job in jobs_queue:
            tempdir = tempfile.mkdtemp()
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][
                job["hash"]]["start_time"] = time.ctime()
            app = runAsync(self.args, self.devices, self.db, job, tempdir)
            # Python's multiprocessing needs to pickle things to sling them
            # into different processes. However, bound methods are not
            # picklable, so runAsync defines __call__ and the instance itself
            # is passed to the pool.
            # Ref: https://stackoverflow.com/a/6975654
            self.pool.apply_async(app, args=[raw_args], callback=app.callback)

    def _saveBenchmarks(self, jobs_queue):
        """Write each job's benchmark content to a temp file.

        The job's benchmark ``content`` is replaced by the file path and the
        job's ``framework`` is set from the content.

        Returns:
            List of the temp file paths, one per job.
        """
        benchmark_files = []
        # save benchmarks to files
        for job in jobs_queue:
            benchmarks = job["benchmarks"]
            benchmark = benchmarks["benchmark"]
            content = benchmark["content"]
            benchmark_str = json.dumps(content)
            outfd, path = tempfile.mkstemp()
            with os.fdopen(outfd, "w") as f:
                f.write(benchmark_str)
            job["benchmarks"]["benchmark"]["content"] = path
            if content["tests"][0]["metric"] == "generic":
                job["framework"] = "generic"
            elif "model" in content and "framework" in content["model"]:
                job["framework"] = content["model"]["framework"]
            else:
                getLogger().error("Framework is not specified, "
                                  "use Caffe2 as default")
                job["framework"] = "caffe2"
            benchmark_files.append(path)
        return benchmark_files

    def _downloadBinaries(self, info_dict):
        """Download every program of ``info_dict`` and point it at the local copy.

        Each program's ``location`` is rewritten to a path under
        ``root_model_dir`` and the file is made owner-read/write/executable.
        """
        programs = info_dict["programs"]
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)
            if program_location.startswith("//"):
                program_location = \
                    self.args.root_model_dir + program_location[1:]
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": '-',
                    "\\": '-',
                    ":": '/',
                }
                program_location = self.args.root_model_dir + '/' + \
                    getFilename(program_location,
                                replace_pattern=replace_pattern)
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location
            # On iOS platforms the main program is renamed to carry the
            # .ipa suffix if it lacks one.
            if self.args.platform.startswith("ios") and \
                    bin_name == "program" and \
                    not program_location.endswith(".ipa"):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location
            os.chmod(program_location,
                     stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR)
            programs[bin_name]["location"] = program_location

    def _downloadFiles(self, jobs_queue):
        """Save the benchmarks, then download their models and programs.

        Download failures are logged per job and do not abort the others.

        Returns:
            The benchmark file paths produced by ``_saveBenchmarks``.
        """
        benchmark_files = self._saveBenchmarks(jobs_queue)
        # download the models
        for bf in benchmark_files:
            self.benchmark_downloader.run(bf)
        # download the programs
        for job in jobs_queue:
            if "info" not in job["benchmarks"]:
                continue
            info = job["benchmarks"]["info"]
            try:
                if "treatment" not in info:
                    getLogger().error(
                        'Field treatment must exist in job["benchmarks"]')
                elif "programs" not in info["treatment"]:
                    getLogger().error(
                        'Field "program" must exist in '
                        'job["benchmarks"]["info"]["treatment"]')
                else:
                    self._downloadBinaries(info["treatment"])

                if "control" in info:
                    if "programs" not in info["control"]:
                        getLogger().error(
                            'Field "program" must exist in '
                            'job["benchmarks"]["info"]["control"]')
                    else:
                        self._downloadBinaries(info["control"])
            except Exception:
                # Best effort: log the failure and continue with the next job.
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error("File download failure")
        return benchmark_files

    def _getDevices(self):
        """Run GetConnectedDevices with this server's settings.

        Returns:
            The decoded JSON device list.

        Raises:
            AssertionError: if no devices are reported.
        """
        raw_args = []
        raw_args.extend(["--platform", self.args.platform])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.devices:
            raw_args.append("--devices")
            raw_args.append(self.args.devices)
        if self.args.hash_platform_mapping:
            # if the user provides filename, we will load it.
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        app = GetConnectedDevices(raw_args=raw_args)
        devices_json = app.run()
        assert devices_json, "Devices cannot be empty"
        devices = json.loads(devices_json.strip())
        return devices

    def _getRawArgs(self, job, tempdir):
        """Build the argument list for running one job.

        Args:
            job: claimed job; reads ``benchmarks``, ``device``, ``hash``,
                ``framework`` and ``identifier``.
            tempdir: directory passed as ``--simple_local_reporter``.

        Returns:
            List of command-line argument strings.

        Raises:
            ValueError: if the job carries neither ``info`` nor ``program``.
        """
        if "info" in job["benchmarks"]:
            info = job["benchmarks"]["info"]
        elif "program" in job["benchmarks"]:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": job["benchmarks"]["program"],
                }
            }
        else:
            # Previously ``info`` stayed unbound here and the job failed
            # later with an unrelated error; fail with a clear message.
            raise ValueError(
                'job["benchmarks"] must contain either "info" or "program"')

        # pass the device hash as well as type
        device = {"kind": job["device"], "hash": job["hash"]}
        device_str = json.dumps(device)
        raw_args = []
        raw_args.extend([
            "--benchmark_file", job["benchmarks"]["benchmark"]["content"],
            "--cooldown", str(self.args.cooldown),
            "--device", device_str,
            "--framework", job["framework"],
            "--info", json.dumps(info),
            "--model_cache", self.args.model_cache,
            "--platform", self.args.platform,
            "--remote_access_token", self.args.remote_access_token,
            "--root_model_dir", self.args.root_model_dir,
            "--simple_local_reporter", tempdir,
            "--user_identifier", str(job["identifier"]),
        ])
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"])
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides filename, we will load it.
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        return raw_args
class RunLab(object):
    """Claim benchmark jobs from the database and run them on lab devices."""

    def __init__(self, raw_args=None):
        """Parse the arguments and set up the downloader, DB driver, devices and pool.

        Args:
            raw_args: optional list of argument strings forwarded to
                ``parser.parse_known_args``.

        Raises:
            AssertionError: if ``benchmark_db_entry`` is not given and
                ``server_addr`` is missing or contains nothing but slashes.
        """
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        os.environ["CLAIMER"] = self.args.claimer_id
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            # rstrip() replaces the former ``while addr[-1] == "/"`` loop,
            # which raised IndexError once the address was (or became) empty.
            server_addr = (self.args.server_addr or "").rstrip("/")
            assert (
                server_addr
            ), "Either server_addr or benchmark_db_entry must be specified"
            self.args.server_addr = server_addr
            self.args.benchmark_db_entry = server_addr + "/benchmark/"
        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            # Leave one core free, but never ask for zero workers: on a
            # single-core machine cpu_count() - 1 is 0, which the pool
            # rejects.
            numProcesses = max(1, multiprocessing.cpu_count() - 1)
        # NOTE(review): Pool is presumably a concurrent.futures executor given
        # the max_workers/initializer keywords - confirm against the imports.
        self.pool = Pool(max_workers=numProcesses, initializer=hookSignals)

    def run(self):
        """Poll for jobs once per second until ``stopRun`` says to stop."""
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        # Report an empty device list for this claimer, then shut down.
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        """Claim jobs, release the ones that cannot run here, start the rest."""
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        """Claim jobs for every available device.

        The device kinds and hashes are sent as two parallel comma-separated
        lists, one entry per available device.

        Returns:
            The jobs returned by the database, or ``[]`` if no device is
            available.
        """
        claimer_id = self.args.claimer_id
        # get available devices with their hashes
        devices = []
        hashes = []
        for k in self.devices:
            for device_hash in self.devices[k]:
                if self.devices[k][device_hash]["available"]:
                    devices.append(k)
                    hashes.append(device_hash)
        hashes = ",".join(hashes)
        devices = ",".join(devices)
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices, hashes)
        return jobs

    def _selectBenchmarks(self, jobs):
        """Pair each job with a free device of the requested kind.

        A matched job gets its ``hash`` set and the device is marked
        unavailable.

        Returns:
            ``(jobs_queue, remaining_jobs)``: the matched jobs, and the jobs
            whose device kind is unknown to this server.
        """
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error("Retrieved job for device "
                                  "{} ".format(device_kind) +
                                  "cannot be run on server "
                                  "{}".format(self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for device_hash in self.devices[device_kind]:
                    device = self.devices[device_kind][device_hash]
                    if device["available"] is True:
                        job["hash"] = device_hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        """Hand the unmatched jobs back to the database."""
        # releasing unmatched jobs
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        """Given a queue of jobs, update run statuses and device statuses in db,
        and spawn job processes."""
        global RUNNING_JOBS
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)

        # run the benchmarks
        for job in jobs_queue:
            getLogger().info(
                f"Running job with identifier {job['identifier']} and id {job['id']}"
            )
            device = self.devices[job["device"]][job["hash"]]
            device["start_time"] = time.ctime()
            async_runner = runAsync(
                self.args,
                device,
                self.db,
                job,
                self.benchmark_downloader,
                self.device_manager.usb_controller,
            )

            # Watchdog will be used to kill currently running jobs
            # based on user requests
            app = WatchDog(async_runner, async_runner.didUserRequestJobKill,
                           async_runner.killJob)

            RUNNING_JOBS += 1

            # Python's multiprocessing needs to pickle things to sling them
            # into different processes. However, bound methods are not
            # picklable, so the class passed to the pool defines __call__
            # and the instance itself is submitted.
            # Ref: https://stackoverflow.com/a/6975654
            future = self.pool.submit(app)
            future.add_done_callback(self.callback)

    def callback(self, future_result_dict):
        """Decrement running jobs count, output job log, and start device cooldown.

        Args:
            future_result_dict: completed future whose result is a dict with
                ``job`` and ``device`` entries.
        """
        global RUNNING_JOBS
        RUNNING_JOBS -= 1
        result = future_result_dict.result()
        job = result["job"]
        device = result["device"]
        # Look up this server's own entry for the returned device.
        device = self.devices[device["kind"]][device["hash"]]

        # output benchmark log in main thread.
        getLogger().info(
            "\n{}\n\nBenchmark:\t\t{}\nJob:\t\t\t{}\nDevice Kind:\t\t{}\nDevice Hash:\t\t{}\n{}\n\n{}"
            .format(
                "#" * 80,
                job["identifier"],
                job["id"],
                device["kind"],
                device["hash"],
                job["log"],
                "#" * 80,
            ))

        with LOCK:
            # Any status other than DONE forces a reboot during cooldown.
            self._coolDown(device, force_reboot=job["status"] != "DONE")

    def _coolDown(self, device, force_reboot=False):
        """Start a CoolDownDevice for ``device``."""
        t = CoolDownDevice(device, self.args, self.db, force_reboot, LOCK)
        t.start()
class RunLab(object):
    """Claim benchmark jobs from the database and run them on lab devices.

    Each polling round claims jobs for the devices currently marked
    available, pairs every job with a free device, releases the jobs that
    could not be paired, downloads the files the paired jobs need and
    submits the jobs to a ``multiprocessing`` pool.
    """

    def __init__(self, raw_args=None):
        """Parse arguments and build the database, device and pool objects.

        Args:
            raw_args: argument list handed to ``parser.parse_known_args``
                unchanged (``None`` included).
        """
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)

        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            # rstrip drops every trailing slash and, unlike indexing [-1] in
            # a loop, cannot fail on an empty or all-slash address.
            self.args.server_addr = self.args.server_addr.rstrip("/")
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"

        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()

        if self.args.platform.startswith("host"):
            num_processes = 2
        else:
            # Pool(processes=0) raises ValueError, so never go below one
            # worker even on a single-core machine.
            num_processes = max(1, multiprocessing.cpu_count() - 1)
        self.pool = multiprocessing.Pool(processes=num_processes)

    def run(self):
        """Poll for jobs until ``stopRun`` says to stop, then shut down."""
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            # Sleep outside the lock so other holders of LOCK are not
            # blocked between polling rounds.
            time.sleep(1)
        self.pool.close()
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        """Run one round: claim, select, release the leftovers, run the rest."""
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if remaining_jobs:
            self._releaseBenchmarks(remaining_jobs)
        if not jobs_queue:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        """Claim jobs for every device currently marked available.

        Returns:
            Whatever ``self.db.claimBenchmarks`` returns, or an empty list
            when there is no available device to claim for.
        """
        # get available devices with their hashes
        kinds = []
        hashes = []
        for kind in self.devices:
            for device_hash in self.devices[kind]:
                if self.devices[kind][device_hash]["available"]:
                    kinds.append(kind)
                    hashes.append(device_hash)

        devices = ",".join(kinds)
        if not devices:
            return []
        return self.db.claimBenchmarks(
            self.args.claimer_id, devices, ",".join(hashes)
        )

    def _selectBenchmarks(self, jobs):
        """Pair each claimed job with a free device of the requested kind.

        A paired job gets ``job["hash"]`` set and its device is marked
        unavailable. A job goes to the remaining list when its device kind
        is unknown here, or when every device of that kind is already
        taken, so that it is released instead of being silently dropped.

        Args:
            jobs: iterable of job dicts carrying a ``"device"`` kind.

        Returns:
            ``(jobs_queue, remaining_jobs)``: the jobs to run now and the
            jobs that could not be paired.
        """
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error(
                    "Retrieved job for device {} cannot be run on server {}".format(
                        device_kind, self.args.claimer_id
                    )
                )
                remaining_jobs.append(job)
                continue

            for device_hash in self.devices[device_kind]:
                device = self.devices[device_kind][device_hash]
                if device["available"] is True:
                    job["hash"] = device_hash
                    jobs_queue.append(job)
                    device["available"] = False
                    break
            else:
                # No free device of this kind is left in this round.
                remaining_jobs.append(job)
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        """Hand the given jobs back through ``self.db.releaseBenchmarks``."""
        # releasing unmatched jobs
        releasing_ids = ",".join(str(job["id"]) for job in remaining_jobs)
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        """Mark the queued jobs as running, download their files, submit them.

        Args:
            jobs_queue: list of job dicts that already carry ``"hash"``.

        Side effects:
            Increments the module-level ``RUNNING_JOBS`` once per submitted
            job, creates one temp directory per job and stamps
            ``start_time`` on the matching device entry.
        """
        global RUNNING_JOBS

        # run the jobs in job queue
        run_ids = ",".join(str(job["id"]) for job in jobs_queue)
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(
            self.args.claimer_id, getDevicesString(run_devices), False
        )

        getLogger().info("Downloading files")
        self._downloadFiles(jobs_queue)

        # run the benchmarks
        for job in jobs_queue:
            identifier = job["identifier"]
            getLogger().info(
                "Running job with identifier {}".format(identifier)
            )
            tempdir = tempfile.mkdtemp(
                prefix="_".join(["aibench", str(identifier), ""])
            )
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][job["hash"]]["start_time"] = time.ctime()
            async_runner = runAsync(
                self.args, self.devices, self.db, job, tempdir
            )

            # Watchdog will be used to kill currently running jobs
            # based on user requests
            app = WatchDog(
                async_runner,
                async_runner.didUserRequestJobKill,
                async_runner.killJob,
            )

            RUNNING_JOBS += 1

            # Python's multiprocessing needs to pickle things to sling them
            # into different processes. However, bound methods are not
            # picklable, so passing one directly doesn't work. Thus a
            # __call__ method was added to the class we are passing into
            # the apply_async method.
            # Ref: https://stackoverflow.com/a/6975654
            self.pool.apply_async(
                app, args=[raw_args], callback=app.main.callback
            )

    def _saveBenchmarks(self, job):
        """Write the job's benchmark content to a temp file as JSON.

        The content stored in ``job`` is replaced by the file path, and
        ``job["framework"]`` is set from the content ("generic" when the
        first test's metric is "generic", else the model's framework, else
        "caffe2").

        Returns:
            Path of the file that was written.
        """
        # save benchmarks to files
        content = job["benchmarks"]["benchmark"]["content"]
        outfd, path = tempfile.mkstemp(prefix="aibench")
        getLogger().info("Temp directory: {}".format(path))
        with os.fdopen(outfd, "w") as f:
            f.write(json.dumps(content))
        job["benchmarks"]["benchmark"]["content"] = path

        if content["tests"][0]["metric"] == "generic":
            job["framework"] = "generic"
        elif "model" in content and "framework" in content["model"]:
            job["framework"] = content["model"]["framework"]
        else:
            getLogger().error("Framework is not specified, use Caffe2 as default")
            job["framework"] = "caffe2"
        return path

    def _downloadBinaries(self, info_dict):
        """Download every program in ``info_dict["programs"]``.

        Each program's ``"location"`` is rewritten in place to the local
        path computed below, and that path is made readable, writable and
        executable by the owner.

        Returns:
            List of the local program paths, in iteration order.
        """
        programs = info_dict["programs"]
        program_locations = []
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)

            if program_location.startswith("//"):
                program_location = (
                    self.args.root_model_dir + program_location[1:]
                )
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": "-",
                    "\\": "-",
                    ":": "/",
                }
                program_location = os.path.join(
                    self.args.root_model_dir,
                    getFilename(
                        program_location, replace_pattern=replace_pattern
                    ),
                )
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location

            # On ios platforms the main program is renamed to carry an
            # .ipa suffix when it does not have one already.
            if (
                self.args.platform.startswith("ios")
                and bin_name == "program"
                and not program_location.endswith(".ipa")
            ):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location

            os.chmod(
                program_location, stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR
            )
            programs[bin_name]["location"] = program_location
            program_locations.append(program_location)
        return program_locations

    def _downloadFiles(self, jobs_queue):
        """Download models and programs for every job in ``jobs_queue``.

        Log output produced while a job's files are downloaded is captured
        and, if a download step raises, stored in
        ``job["download_error_log"]``. The capture handler is always
        detached again, including for jobs that have no ``"info"`` entry.
        """
        for job in jobs_queue:
            job["models_location"] = []

            # added log capture for reporting
            log_capture_string = StringIO()
            ch = logging.StreamHandler(log_capture_string)
            ch.setLevel(logging.DEBUG)
            getLogger().addHandler(ch)
            try:
                self._downloadJobModels(job, log_capture_string)
                getLogger().info("Downloading programs")
                if "info" in job["benchmarks"]:
                    self._downloadJobPrograms(job, log_capture_string)
            finally:
                # Detach exactly the handler added above; popping the last
                # handler could remove one that belongs to someone else.
                getLogger().removeHandler(ch)
                log_capture_string.close()
                gc.collect()

    def _downloadJobModels(self, job, log_capture):
        """Save the job's benchmark file and download its models."""
        # download the models
        try:
            getLogger().info("Downloading models")
            path = self._saveBenchmarks(job)
            location = self.benchmark_downloader.run(path)
            job["models_location"].extend(location)
        except Exception as e:
            getLogger().error(
                "Unknown exception {}".format(sys.exc_info()[0])
            )
            getLogger().error(
                "Error downloading models. Job id: {}".format(job["id"])
            )
            getLogger().error(e)
            job["download_error_log"] = log_capture.getvalue()

    def _downloadJobPrograms(self, job, log_capture):
        """Download the treatment and, if present, control programs of a job."""
        # download the programs
        info = job["benchmarks"]["info"]
        try:
            if "treatment" not in info:
                getLogger().error(
                    "Field treatment must exist in job[\"benchmarks\"]"
                )
            elif "programs" not in info["treatment"]:
                getLogger().error(
                    "Field \"program\" must exist in "
                    "job[\"benchmarks\"][\"info\"][\"treatment\"]"
                )
            else:
                getLogger().info("Downloading treatment binary")
                job["programs_location"] = self._downloadBinaries(
                    info["treatment"]
                )

            if "control" in info:
                if "programs" not in info["control"]:
                    getLogger().error(
                        "Field \"program\" must exist in "
                        "job[\"benchmarks\"][\"info\"][\"control\"]"
                    )
                else:
                    getLogger().info("Downloading control binary")
                    control_locations = self._downloadBinaries(info["control"])
                    job["programs_location"].extend(control_locations)
        except Exception as e:
            getLogger().error(
                "Unknown exception {}".format(sys.exc_info()[0])
            )
            getLogger().error(
                "Error downloading programs. Job id: {}".format(job["id"])
            )
            getLogger().error(e)
            job["download_error_log"] = log_capture.getvalue()

    def _getRawArgs(self, job, tempdir):
        """Build the argument list for running one job.

        Args:
            job: job dict. Reads ``"benchmarks"``, ``"device"``, ``"hash"``,
                ``"framework"``, ``"identifier"`` and the optional
                ``"user"``.
            tempdir: directory passed as ``--simple_local_reporter``.

        Returns:
            Flat list of option names and values.

        Raises:
            ValueError: ``job["benchmarks"]`` has neither ``"info"`` nor
                the legacy ``"program"`` entry.
        """
        benchmarks = job["benchmarks"]
        if "info" in benchmarks:
            info = benchmarks["info"]
        elif "program" in benchmarks:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": benchmarks["program"],
                }
            }
        else:
            raise ValueError(
                "job[\"benchmarks\"] must contain \"info\" or \"program\""
            )

        # pass the device hash as well as type
        device_str = json.dumps({"kind": job["device"], "hash": job["hash"]})

        raw_args = [
            "--benchmark_file", benchmarks["benchmark"]["content"],
            "--cooldown", str(self.args.cooldown),
            "--device", device_str,
            "--framework", job["framework"],
            "--info", json.dumps(info),
            "--model_cache", self.args.model_cache,
            "--platform", self.args.platform,
            "--remote_access_token", self.args.remote_access_token,
            "--root_model_dir", self.args.root_model_dir,
            "--simple_local_reporter", tempdir,
            "--user_identifier", str(job["identifier"]),
            "--user_string", job.get("user"),
        ]
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"]
            )
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.extend(["--platform_sig", self.args.platform_sig])
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides filename, we will load it.
            raw_args.extend(
                ["--hash_platform_mapping", self.args.hash_platform_mapping]
            )
        if self.args.device_name_mapping:
            raw_args.extend(
                ["--device_name_mapping", self.args.device_name_mapping]
            )
        return raw_args