# Standard-library imports needed by the code below. Project-local names
# (parser, ADB, DBDriver, DeviceManager, DownloadBenchmarks, runAsync,
# WatchDog, CoolDownDevice, GetConnectedDevices, Pool, LOCK, RUNNING_JOBS,
# getLogger, setLoggerLevel, getDevicesString, getFilename, hookSignals,
# stopRun) are assumed to be imported from the surrounding package.
import datetime
import gc
import json
import logging
import multiprocessing
import os
import stat
import sys
import tempfile
import time
from io import StringIO


class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        os.environ["CLAIMER"] = self.args.claimer_id
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert (
                self.args.server_addr is not None
            ), "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == "/":
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(
            self.args.benchmark_db,
            self.args.app_id,
            self.args.token,
            self.args.benchmark_table,
            self.args.job_queue,
            self.args.test,
            self.args.benchmark_db_entry,
        )
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = Pool(max_workers=numProcesses, initializer=hookSignals)

    def run(self):
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get available devices with their hashes
        devices = []
        hashes = []
        for k in self.devices:
            for hash in self.devices[k]:
                if self.devices[k][hash]["available"]:
                    devices.append(k)
                    hashes.append(hash)
        hashes = ",".join(hashes)
        devices = ",".join(devices)
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices, hashes)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error(
                    "Retrieved job for device {} cannot be run "
                    "on server {}".format(device_kind, self.args.claimer_id)
                )
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # release the jobs that could not be matched to a device
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        """Given a queue of jobs, update run statuses and device statuses
        in the db, and spawn job processes."""
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(
            self.args.claimer_id, getDevicesString(run_devices), False
        )
        # run the benchmarks
        for job in jobs_queue:
            getLogger().info(
                f"Running job with identifier {job['identifier']} and id {job['id']}"
            )
            device = self.devices[job["device"]][job["hash"]]
            device["start_time"] = time.ctime()
            async_runner = runAsync(
                self.args,
                device,
                self.db,
                job,
                self.benchmark_downloader,
                self.device_manager.usb_controller,
            )
            # The watchdog kills currently running jobs on user request.
            app = WatchDog(
                async_runner,
                async_runner.didUserRequestJobKill,
                async_runner.killJob,
            )
            global RUNNING_JOBS
            RUNNING_JOBS += 1
            # Python's multiprocessing needs to pickle objects to send them
            # to other processes, and bound methods are not picklable. The
            # object submitted to the pool therefore defines __call__, so
            # the (picklable) instance itself can be submitted in place of
            # a bound method. Ref: https://stackoverflow.com/a/6975654
            future = self.pool.submit(app)
            future.add_done_callback(self.callback)

    def callback(self, future):
        """Decrement running jobs count, output the job log,
        and start device cooldown."""
        global RUNNING_JOBS
        RUNNING_JOBS -= 1
        result = future.result()
        job = result["job"]
        device = result["device"]
        device = self.devices[device["kind"]][device["hash"]]
        # output the benchmark log in the main thread
        getLogger().info(
            "\n{}\n\nBenchmark:\t\t{}\nJob:\t\t\t{}\nDevice Kind:\t\t{}\n"
            "Device Hash:\t\t{}\n{}\n\n{}".format(
                "#" * 80,
                job["identifier"],
                job["id"],
                device["kind"],
                device["hash"],
                job["log"],
                "#" * 80,
            )
        )
        with LOCK:
            self._coolDown(device, force_reboot=job["status"] != "DONE")

    def _coolDown(self, device, force_reboot=False):
        t = CoolDownDevice(device, self.args, self.db, force_reboot, LOCK)
        t.start()
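
# The pickling constraint documented above is easier to see in isolation.
# Below is a minimal, self-contained sketch of the same pattern; it is not
# part of the lab runner, and the names PicklableTask, base, and exponent
# are illustrative only. An instance keeps its state in plain attributes
# and exposes __call__, so multiprocessing can pickle the instance where a
# bound method would fail.
class PicklableTask(object):
    def __init__(self, base):
        self.base = base

    def __call__(self, exponent):
        # plain attributes on a module-level class: nothing blocks pickling
        return self.base ** exponent


if __name__ == "__main__":
    # usage sketch: apply_async accepts the instance just like a function
    task = PicklableTask(2)
    with multiprocessing.Pool(processes=2) as pool:
        async_results = [pool.apply_async(task, args=[e]) for e in range(4)]
        print([r.get() for r in async_results])  # prints [1, 2, 4, 8]
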
class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        devices = self._getDevices()
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.devices = {}
        for k in devices:
            kind = k["kind"]
            hash = k["hash"]
            entry = {
                "kind": kind,
                "hash": hash,
                "available": True,
                "live": True,
                "start_time": None,
                "done_time": None,
                "output_dir": None,
                "job": None,
                "adb": ADB(hash, self.args.android_dir),
                "reboot_time":
                    datetime.datetime.now() - datetime.timedelta(hours=8),
            }
            if kind not in self.devices:
                self.devices[kind] = {}
            assert hash not in self.devices[kind], \
                "Device {} ({}) is attached twice.".format(kind, hash)
            self.devices[kind][hash] = entry
        dvs = [
            self.devices[k][h] for k in self.devices for h in self.devices[k]
        ]
        self.db.updateDevices(self.args.claimer_id, getDevicesString(dvs),
                              True)
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = multiprocessing.Pool(processes=numProcesses)

    def run(self):
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.db.updateDevices(self.args.claimer_id, "", True)

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get the kinds of devices that have at least one available unit
        devices = ",".join([
            k for k in self.devices
            if any(self.devices[k][hash]["available"] is True
                   for hash in self.devices[k])
        ])
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error(
                    "Retrieved job for device {} cannot be run "
                    "on server {}".format(device_kind, self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # release the jobs that could not be matched to a device
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        # run the jobs in the job queue
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)
        self._downloadFiles(jobs_queue)

        # run the benchmarks
        for job in jobs_queue:
            tempdir = tempfile.mkdtemp()
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][
                job["hash"]]["start_time"] = time.ctime()
            app = runAsync(self.args, self.devices, self.db, job, tempdir)
            # Python's multiprocessing needs to pickle objects to send them
            # to other processes, and bound methods are not picklable. The
            # runAsync class therefore defines __call__, so the (picklable)
            # instance itself can be passed to apply_async.
            # Ref: https://stackoverflow.com/a/6975654
            self.pool.apply_async(app, args=[raw_args], callback=app.callback)

    def _saveBenchmarks(self, jobs_queue):
        benchmark_files = []
        # save the benchmarks to files
        for job in jobs_queue:
            benchmarks = job["benchmarks"]
            benchmark = benchmarks["benchmark"]
            content = benchmark["content"]
            benchmark_str = json.dumps(content)
            outfd, path = tempfile.mkstemp()
            with os.fdopen(outfd, "w") as f:
                f.write(benchmark_str)
            job["benchmarks"]["benchmark"]["content"] = path
            if content["tests"][0]["metric"] == "generic":
                job["framework"] = "generic"
            elif "model" in content and "framework" in content["model"]:
                job["framework"] = content["model"]["framework"]
            else:
                getLogger().error("Framework is not specified, "
                                  "using caffe2 as the default")
                job["framework"] = "caffe2"
            benchmark_files.append(path)
        return benchmark_files

    def _downloadBinaries(self, info_dict):
        programs = info_dict["programs"]
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)
            if program_location.startswith("//"):
                program_location = \
                    self.args.root_model_dir + program_location[1:]
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": '-',
                    "\\": '-',
                    ":": '/',
                }
                program_location = self.args.root_model_dir + '/' + \
                    getFilename(program_location,
                                replace_pattern=replace_pattern)
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location
            if self.args.platform.startswith("ios") and \
                    bin_name == "program" and \
                    not program_location.endswith(".ipa"):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location
            os.chmod(program_location,
                     stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR)
            programs[bin_name]["location"] = program_location

    def _downloadFiles(self, jobs_queue):
        benchmark_files = self._saveBenchmarks(jobs_queue)
        # download the models
        for bf in benchmark_files:
            self.benchmark_downloader.run(bf)
        # download the programs
        for job in jobs_queue:
            if "info" not in job["benchmarks"]:
                continue
            try:
                if "treatment" not in job["benchmarks"]["info"]:
                    getLogger().error(
                        "Field \"treatment\" must exist in "
                        "job[\"benchmarks\"][\"info\"]")
                elif "programs" not in job["benchmarks"]["info"]["treatment"]:
                    getLogger().error(
                        "Field \"programs\" must exist in "
                        "job[\"benchmarks\"][\"info\"][\"treatment\"]")
                else:
                    treatment_info = job["benchmarks"]["info"]["treatment"]
                    self._downloadBinaries(treatment_info)
                    if "control" in job["benchmarks"]["info"]:
                        if "programs" not in \
                                job["benchmarks"]["info"]["control"]:
                            getLogger().error(
                                "Field \"programs\" must exist in "
                                "job[\"benchmarks\"][\"info\"][\"control\"]")
                        else:
                            control_info = \
                                job["benchmarks"]["info"]["control"]
                            self._downloadBinaries(control_info)
            except Exception:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error("File download failure")
        return benchmark_files

    def _getDevices(self):
        raw_args = []
        raw_args.extend(["--platform", self.args.platform])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.devices:
            raw_args.append("--devices")
            raw_args.append(self.args.devices)
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        app = GetConnectedDevices(raw_args=raw_args)
        devices_json = app.run()
        assert devices_json, "Devices cannot be empty"
        devices = json.loads(devices_json.strip())
        return devices

    def _getRawArgs(self, job, tempdir):
        if "info" in job["benchmarks"]:
            info = job["benchmarks"]["info"]
        elif "program" in job["benchmarks"]:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": job["benchmarks"]["program"],
                }
            }
        # pass the device hash as well as the kind
        device = {"kind": job["device"], "hash": job["hash"]}
        device_str = json.dumps(device)
        raw_args = []
        raw_args.extend([
            "--benchmark_file", job["benchmarks"]["benchmark"]["content"],
            "--cooldown", str(self.args.cooldown),
            "--device", device_str,
            "--framework", job["framework"],
            "--info", json.dumps(info),
            "--model_cache", self.args.model_cache,
            "--platform", self.args.platform,
            "--remote_access_token", self.args.remote_access_token,
            "--root_model_dir", self.args.root_model_dir,
            "--simple_local_reporter", tempdir,
            "--user_identifier", str(job["identifier"]),
        ])
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"])
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        return raw_args
class RunLab(object):
    def __init__(self, raw_args=None):
        self.args, self.unknowns = parser.parse_known_args(raw_args)
        self.benchmark_downloader = DownloadBenchmarks(self.args, getLogger())
        self.adb = ADB(None, self.args.android_dir)
        setLoggerLevel(self.args.logger_level)
        if not self.args.benchmark_db_entry:
            assert self.args.server_addr is not None, \
                "Either server_addr or benchmark_db_entry must be specified"
            while self.args.server_addr[-1] == '/':
                self.args.server_addr = self.args.server_addr[:-1]
            self.args.benchmark_db_entry = self.args.server_addr + "/benchmark/"
        self.db = DBDriver(self.args.benchmark_db, self.args.app_id,
                           self.args.token, self.args.benchmark_table,
                           self.args.job_queue, self.args.test,
                           self.args.benchmark_db_entry)
        self.device_manager = DeviceManager(self.args, self.db)
        self.devices = self.device_manager.getLabDevices()
        if self.args.platform.startswith("host"):
            numProcesses = 2
        else:
            numProcesses = multiprocessing.cpu_count() - 1
        self.pool = multiprocessing.Pool(processes=numProcesses)

    def run(self):
        hookSignals()
        while not stopRun(self.args):
            with LOCK:
                self._runOnce()
            time.sleep(1)
        self.pool.close()
        self.db.updateDevices(self.args.claimer_id, "", True)
        self.device_manager.shutdown()

    def _runOnce(self):
        jobs = self._claimBenchmarks()
        jobs_queue, remaining_jobs = self._selectBenchmarks(jobs)
        if len(remaining_jobs) != 0:
            self._releaseBenchmarks(remaining_jobs)
        if len(jobs_queue) == 0:
            return
        self._runBenchmarks(jobs_queue)

    def _claimBenchmarks(self):
        claimer_id = self.args.claimer_id
        # get available devices with their hashes
        devices = []
        hashes = []
        for k in self.devices:
            for hash in self.devices[k]:
                if self.devices[k][hash]["available"]:
                    devices.append(k)
                    hashes.append(hash)
        hashes = ",".join(hashes)
        devices = ",".join(devices)
        jobs = []
        if len(devices) > 0:
            jobs = self.db.claimBenchmarks(claimer_id, devices, hashes)
        return jobs

    def _selectBenchmarks(self, jobs):
        remaining_jobs = []
        jobs_queue = []
        for job in jobs:
            device_kind = job["device"]
            if device_kind not in self.devices:
                getLogger().error(
                    "Retrieved job for device {} cannot be run "
                    "on server {}".format(device_kind, self.args.claimer_id))
                remaining_jobs.append(job)
            else:
                for hash in self.devices[device_kind]:
                    device = self.devices[device_kind][hash]
                    if device["available"] is True:
                        job["hash"] = hash
                        jobs_queue.append(job)
                        device["available"] = False
                        break
        return jobs_queue, remaining_jobs

    def _releaseBenchmarks(self, remaining_jobs):
        # release the jobs that could not be matched to a device
        releasing_ids = ",".join([str(job["id"]) for job in remaining_jobs])
        self.db.releaseBenchmarks(self.args.claimer_id, releasing_ids)

    def _runBenchmarks(self, jobs_queue):
        # run the jobs in the job queue
        run_ids = ",".join([str(job["id"]) for job in jobs_queue])
        self.db.runBenchmarks(self.args.claimer_id, run_ids)
        run_devices = [
            self.devices[job["device"]][job["hash"]] for job in jobs_queue
        ]
        getLogger().info("Updating devices status")
        self.db.updateDevices(self.args.claimer_id,
                              getDevicesString(run_devices), False)
        getLogger().info("Downloading files")
        self._downloadFiles(jobs_queue)

        # run the benchmarks
        for job in jobs_queue:
            identifier = job["identifier"]
            getLogger().info(
                "Running job with identifier {}".format(identifier))
            tempdir = tempfile.mkdtemp(
                prefix="_".join(["aibench", str(identifier), ""]))
            raw_args = self._getRawArgs(job, tempdir)
            self.devices[job["device"]][
                job["hash"]]["start_time"] = time.ctime()
            async_runner = runAsync(self.args, self.devices, self.db, job,
                                    tempdir)
            # The watchdog kills currently running jobs on user request.
            app = WatchDog(async_runner, async_runner.didUserRequestJobKill,
                           async_runner.killJob)
            global RUNNING_JOBS
            RUNNING_JOBS += 1
            # Python's multiprocessing needs to pickle objects to send them
            # to other processes, and bound methods are not picklable. The
            # class passed to apply_async therefore defines __call__, so the
            # (picklable) instance itself can be submitted.
            # Ref: https://stackoverflow.com/a/6975654
            self.pool.apply_async(app, args=[raw_args],
                                  callback=app.main.callback)

    def _saveBenchmarks(self, job):
        # save the benchmark to a file
        benchmarks = job["benchmarks"]
        benchmark = benchmarks["benchmark"]
        content = benchmark["content"]
        benchmark_str = json.dumps(content)
        outfd, path = tempfile.mkstemp(prefix="aibench")
        getLogger().info("Temp benchmark file: {}".format(path))
        with os.fdopen(outfd, "w") as f:
            f.write(benchmark_str)
        job["benchmarks"]["benchmark"]["content"] = path
        if content["tests"][0]["metric"] == "generic":
            job["framework"] = "generic"
        elif "model" in content and "framework" in content["model"]:
            job["framework"] = content["model"]["framework"]
        else:
            getLogger().error("Framework is not specified, "
                              "using caffe2 as the default")
            job["framework"] = "caffe2"
        return path

    def _downloadBinaries(self, info_dict):
        programs = info_dict["programs"]
        program_locations = []
        for bin_name in programs:
            program_location = programs[bin_name]["location"]
            self.benchmark_downloader.downloadFile(program_location, None)
            if program_location.startswith("//"):
                program_location = \
                    self.args.root_model_dir + program_location[1:]
            elif program_location.startswith("http"):
                replace_pattern = {
                    " ": '-',
                    "\\": '-',
                    ":": '/',
                }
                program_location = os.path.join(
                    self.args.root_model_dir,
                    getFilename(program_location,
                                replace_pattern=replace_pattern))
            elif program_location.startswith("/"):
                program_location = self.args.root_model_dir + program_location
            if self.args.platform.startswith("ios") and \
                    bin_name == "program" and \
                    not program_location.endswith(".ipa"):
                new_location = program_location + ".ipa"
                os.rename(program_location, new_location)
                program_location = new_location
            os.chmod(program_location,
                     stat.S_IXUSR | stat.S_IRUSR | stat.S_IWUSR)
            programs[bin_name]["location"] = program_location
            program_locations.append(program_location)
        return program_locations

    def _downloadFiles(self, jobs_queue):
        for job in jobs_queue:
            job["models_location"] = []
            # capture the download log so it can be attached to the job
            log_capture_string = StringIO()
            ch = logging.StreamHandler(log_capture_string)
            ch.setLevel(logging.DEBUG)
            getLogger().addHandler(ch)

            # download the models
            try:
                getLogger().info("Downloading models")
                path = self._saveBenchmarks(job)
                location = self.benchmark_downloader.run(path)
                job["models_location"].extend(location)
            except Exception as e:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error(
                    "Error downloading models. Job id: {}".format(job["id"]))
                getLogger().error(e)
                job["download_error_log"] = log_capture_string.getvalue()

            getLogger().info("Downloading programs")
            # download the programs
            if "info" not in job["benchmarks"]:
                # detach the capture handler before skipping this job,
                # otherwise it would leak onto the logger
                getLogger().removeHandler(ch)
                log_capture_string.close()
                continue
            try:
                if "treatment" not in job["benchmarks"]["info"]:
                    getLogger().error(
                        "Field \"treatment\" must exist in "
                        "job[\"benchmarks\"][\"info\"]")
                elif "programs" not in job["benchmarks"]["info"]["treatment"]:
                    getLogger().error(
                        "Field \"programs\" must exist in "
                        "job[\"benchmarks\"][\"info\"][\"treatment\"]")
                else:
                    treatment_info = job["benchmarks"]["info"]["treatment"]
                    getLogger().info("Downloading treatment binary")
                    treatment_locations = self._downloadBinaries(
                        treatment_info)
                    job["programs_location"] = treatment_locations
                    if "control" in job["benchmarks"]["info"]:
                        if "programs" not in \
                                job["benchmarks"]["info"]["control"]:
                            getLogger().error(
                                "Field \"programs\" must exist in "
                                "job[\"benchmarks\"][\"info\"][\"control\"]")
                        else:
                            control_info = \
                                job["benchmarks"]["info"]["control"]
                            getLogger().info("Downloading control binary")
                            control_locations = self._downloadBinaries(
                                control_info)
                            job["programs_location"].extend(
                                control_locations)
            except Exception as e:
                getLogger().error("Unknown exception {}".format(
                    sys.exc_info()[0]))
                getLogger().error(
                    "Error downloading programs. Job id: {}".format(
                        job["id"]))
                getLogger().error(e)
                job["download_error_log"] = log_capture_string.getvalue()

            log_capture_string.close()
            # removeHandler detaches exactly the capture handler; popping
            # the last handler could remove an unrelated one
            getLogger().removeHandler(ch)
            gc.collect()

    def _getRawArgs(self, job, tempdir):
        if "info" in job["benchmarks"]:
            info = job["benchmarks"]["info"]
        elif "program" in job["benchmarks"]:
            # TODO: remove after all clients are updated
            info = {
                "treatment": {
                    "commit": "interactive",
                    "commit_time": 0,
                    "program": job["benchmarks"]["program"],
                }
            }
        # pass the device hash as well as the kind
        device = {"kind": job["device"], "hash": job["hash"]}
        device_str = json.dumps(device)
        raw_args = []
        raw_args.extend([
            "--benchmark_file", job["benchmarks"]["benchmark"]["content"],
            "--cooldown", str(self.args.cooldown),
            "--device", device_str,
            "--framework", job["framework"],
            "--info", json.dumps(info),
            "--model_cache", self.args.model_cache,
            "--platform", self.args.platform,
            "--remote_access_token", self.args.remote_access_token,
            "--root_model_dir", self.args.root_model_dir,
            "--simple_local_reporter", tempdir,
            "--user_identifier", str(job["identifier"]),
            "--user_string", job.get("user"),
        ])
        if job["framework"] != "generic":
            raw_args.extend(["--remote_reporter", self.args.remote_reporter])
        if self.args.shared_libs:
            raw_args.extend(
                ["--shared_libs", "'" + self.args.shared_libs + "'"])
        if self.args.timeout:
            raw_args.extend(["--timeout", str(self.args.timeout)])
        if self.args.platform_sig:
            raw_args.append("--platform_sig")
            raw_args.append(self.args.platform_sig)
        if self.args.monsoon_map:
            raw_args.extend(["--monsoon_map", str(self.args.monsoon_map)])
        if self.args.hash_platform_mapping:
            # if the user provides a filename, we will load it
            raw_args.append("--hash_platform_mapping")
            raw_args.append(self.args.hash_platform_mapping)
        if self.args.device_name_mapping:
            raw_args.append("--device_name_mapping")
            raw_args.append(self.args.device_name_mapping)
        return raw_args