def _get_current_step_records(self):
    step_name = General.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = Report().get_step_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)
    else:
        records = self._load_single_model_records()
    final_records = []
    for record in records:
        if not record.weights_file:
            logger.error("Model file does not exist, id={}".format(record.worker_id))
        else:
            record.step_name = General.step_name
            final_records.append(record)
    logging.debug("Records: {}".format(final_records))
    return final_records
def _simulate_tiny_pipeline(self, cfg_tiny):
    """Simulate a tiny pipeline by using one sample and one epoch."""
    report = ReportServer()
    for i, step_name in enumerate(PipelineConfig.steps):
        step_cfg = cfg_tiny.get(step_name)
        if step_cfg.pipe_step.type != 'SearchPipeStep':
            continue
        step_cfg.trainer.distributed = False
        step_cfg.trainer.epochs = 1
        self.restrict_config.trials[step_name] = 1
        General.step_name = step_name
        PipeStepConfig.from_dict(step_cfg)
        pipestep = PipeStep()
        if i == 0:
            pipestep.do()
            record = report.get_step_records(step_name)[-1]
            self.epoch_time = record.runtime
            _worker_path = TaskOps().local_base_path
            if os.path.exists(_worker_path):
                os.system('rm -rf {}'.format(_worker_path))
        if step_cfg.pipe_step.type == 'SearchPipeStep':
            self.params_dict[step_name]['max_samples'] = \
                pipestep.generator.search_alg.max_samples
        _file = os.path.join(TaskOps().step_path, ".generator")
        if os.path.exists(_file):
            os.system('rm {}'.format(_file))
def _backup_cfg(cfg_path):
    """Backup yml file.

    :param cfg_path: path of yml file.
    """
    if isinstance(cfg_path, str):
        output_path = FileOps.join_path(
            TaskOps().local_output_path, os.path.basename(cfg_path))
        FileOps.copy_file(cfg_path, output_path)
    else:
        output_path = FileOps.join_path(TaskOps().local_output_path, 'config.yml')
        with open(output_path, 'w') as f:
            f.write(yaml.dump(cfg_path))
def _save_worker_record(cls, record):
    step_name = record.get('step_name')
    worker_id = record.get('worker_id')
    _path = TaskOps().get_local_worker_path(step_name, worker_id)
    for record_name in ["desc", "hps", "performance"]:
        _file_name = None
        _file = None
        record_value = remove_np_value(record.get(record_name))
        if not record_value:
            continue
        try:
            # CARS/DARTS produce multiple descriptions per worker, saved one file each.
            if isinstance(record_value, list) and record_name == "desc":
                for idx, value in enumerate(record_value):
                    _file_name = "desc_{}.json".format(idx)
                    _file = FileOps.join_path(_path, _file_name)
                    with open(_file, "w") as f:
                        json.dump(value, f)
            else:
                if record_name == "desc":
                    _file_name = "desc_{}.json".format(worker_id)
                if record_name == "hps":
                    _file_name = "hps_{}.json".format(worker_id)
                if record_name == "performance":
                    _file_name = "performance_{}.json".format(worker_id)
                _file = FileOps.join_path(_path, _file_name)
                with open(_file, "w") as f:
                    json.dump(record_value, f)
        except Exception as ex:
            logging.error("Failed to save {}, file={}, desc={}, msg={}".format(
                record_name, _file, record_value, str(ex)))
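# A minimal sketch of the record dict _save_worker_record expects; the
# step name, worker id, and values below are made up for illustration:
#
#   record = {
#       "step_name": "nas",
#       "worker_id": 3,
#       "desc": {"backbone": "resnet18"},
#       "hps": {"trainer": {"lr": 0.01}},
#       "performance": {"accuracy": 0.91},
#   }
#
# Saving it would write desc_3.json, hps_3.json and performance_3.json
# into the folder returned by TaskOps().get_local_worker_path("nas", 3).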
def _backup_config(args):
    _file = args.config_file
    from zeus.common.task_ops import TaskOps
    from zeus.common.file_ops import FileOps
    dest_file = FileOps.join_path(TaskOps().local_output_path, os.path.basename(_file))
    FileOps.make_base_dir(dest_file)
    FileOps.copy_file(_file, dest_file)
def restore(cls):
    """Restore generator from file."""
    step_path = TaskOps().step_path
    _file = os.path.join(step_path, ".generator")
    if os.path.exists(_file):
        with open(_file, "rb") as f:
            return pickle.load(f)
    else:
        return None
def _get_current_step_records(self):
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = ReportServer().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(models_folder)
    else:
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for record in records:
        record.step_name = step_name
    return records
def _show_pipeline_info(self, steps_time, step_name):
    logging.info("-" * 48)
    logging.info(" Pipeline end.")
    logging.info("")
    logging.info(" task id: {}".format(General.task.task_id))
    logging.info(" output folder: {}".format(TaskOps().local_output_path))
    logging.info("")
    self._show_step_time(steps_time)
    logging.info("")
    self._show_report(step_name)
    logging.info("-" * 48)
def _calc_forward_latency_davinci(model, input, sess_config=None, num=10, evaluate_config=None):
    """Calculate model forward latency.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param sess_config: tf session config (not used by this function)
    :param num: number of forward passes to average over
    :type num: int
    :param evaluate_config: config for evaluation on davinci
    :type evaluate_config: dict
    :return: forward latency
    :rtype: float
    """
    from zeus.evaluator.tools.evaluate_davinci_bolt import evaluate
    from zeus.common.task_ops import TaskOps
    # backend = evaluate_config.get("backend")
    hardware = evaluate_config.get("hardware")
    remote_host = evaluate_config.get("remote_host")
    worker_path = TaskOps().local_base_path
    save_data_file = os.path.join(worker_path, "input.bin")

    latency = 0.
    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    job_id = "pre_evaluate_" + now_time
    logging.info("The job id of evaluate service is {}.".format(job_id))
    if zeus.is_torch_backend():
        import torch
        input_shape = input.shape
        if torch.is_tensor(input):
            input = input.cpu().numpy()
        input.tofile(save_data_file)
        for index in range(num):
            # build the model on the first pass, reuse it afterwards
            reuse_model = index != 0
            results = evaluate("pytorch", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model, job_id)
            # note: np.float is a deprecated alias of the builtin float
            latency += float(results.get("latency"))
    elif zeus.is_tf_backend():
        input_shape = input.shape.as_list()
        test_data = np.random.random(input_shape).astype(np.float32)
        test_data.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0
            results = evaluate("tensorflow", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model, job_id)
            latency += float(results.get("latency"))
    return latency / num
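# A minimal sketch of the evaluate_config dict this function reads; only
# "hardware" and "remote_host" are consumed here, and the values below
# are placeholders, not a real evaluate service:
#
#   evaluate_config = {"hardware": "Davinci", "remote_host": "http://127.0.0.1:8888"}
#   latency = _calc_forward_latency_davinci(model, input, num=10,
#                                           evaluate_config=evaluate_config)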
def _show_performance():
    output_file = FileOps.join_path(
        TaskOps().local_output_path, General.step_name, "output.csv")
    try:
        data = pd.read_csv(output_file)
    except Exception:
        logging.info(" Result file output.csv does not exist or is empty.")
        return
    if data.shape[1] < 2 or data.shape[0] == 0:
        logging.info(" Result file output.csv is empty.")
        return
    logging.info("-" * 48)
    data = json.loads(data.to_json())
    logging.info(" result: {}".format(data["performance"]["0"]))
    logging.info("-" * 48)
def _show_report(self, step_name):
    performance_file = FileOps.join_path(
        TaskOps().local_output_path, step_name, "output.csv")
    try:
        data = pd.read_csv(performance_file)
    except Exception:
        logging.info(" result file output.csv does not exist or is empty")
        return
    if data.shape[1] < 2 or data.shape[0] == 0:
        logging.info(" result file output.csv is empty")
        return
    logging.info(" result:")
    data = json.loads(data.to_json())
    for key in data["worker_id"].keys():
        logging.info(" {:>3s}: {}".format(str(data["worker_id"][key]),
                                          data["performance"][key]))
def save_master_ip(ip_address, port, args):
    """Write the ip and port to a file under a system path.

    :param str ip_address: the ip address to write.
    :param str port: the port to write.
    :param argparse.ArgumentParser args: `args` is an argparse object that should
        contain `init_method`, `rank` and `world_size`.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(file_path))
    with open(file_path, 'w') as f:
        f.write(ip_address + "\n")
        f.write(port + "\n")
def load_master_ip():
    """Get the ip and port written under a system path.

    This will not download anything from S3.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            ip = f.readline().strip()
            port = f.readline().strip()
            logging.info("read ip, ip={}, port={}".format(ip, port))
            return ip, port
    else:
        return None, None
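# Round-trip sketch pairing save_master_ip with load_master_ip (assuming
# TaskOps().temp_path is writable; the address and port are illustrative):
#
#   save_master_ip("127.0.0.1", "8786", args)
#   ip, port = load_master_ip()   # -> ("127.0.0.1", "8786")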
def _init_env(cfg_path):
    """Init config and environment parameters.

    :param cfg_path: config file path
    """
    logging.getLogger().setLevel(logging.DEBUG)
    UserConfig().load(cfg_path)
    # load general config
    General.from_json(UserConfig().data.get("general"), skip_check=False)
    init_log(level=General.logger.level, log_path=TaskOps().local_log_path)
    cluster_args = env_args()
    if not cluster_args:
        cluster_args = init_local_cluster_args()
    setattr(PipelineConfig, "steps", UserConfig().data.pipeline)
    General.env = cluster_args
    set_backend(General.backend, General.device_category)
def _clean_checkpoint(self):
    worker_parent_folder = os.path.abspath(
        os.path.join(TaskOps().get_local_worker_path(General.step_name, 1), ".."))
    patterns = [
        ".*.pkl", "*.pth", "model_*", "model.ckpt-*", "*.pb", "graph.*",
        "eval", "events*", "CKP-*", "checkpoint", ".*.log", "*.ckpt",
        "*.air", "*.onnx", "*.caffemodel", "*.pbtxt", "*.bin",
        "kernel_meta", "*.prototxt",
    ]
    all_files = []
    worker_folders = glob.glob(worker_parent_folder + "/*")
    for worker_folder in worker_folders:
        for pattern in patterns:
            file_pattern = worker_folder + "/" + pattern
            all_files += glob.glob(file_pattern)
    if all_files:
        logging.info("Clean worker folder {}.".format(worker_parent_folder))
        for item in all_files:
            try:
                if os.path.isfile(item):
                    os.remove(item)
                elif os.path.isdir(item):
                    shutil.rmtree(item)
            except Exception:
                logging.warning("Failed to remove {}".format(item))
def _load_single_model_records(self):
    model_desc = PipeStepConfig.model.model_desc
    model_desc_file = PipeStepConfig.model.model_desc_file
    if model_desc_file:
        model_desc_file = model_desc_file.replace(
            "{local_base_path}", TaskOps().local_base_path)
        model_desc = Config(model_desc_file)
    if not model_desc:
        logger.error("Model desc or model desc file is None.")
        return []
    model_file = PipeStepConfig.model.pretrained_model_file
    if not model_file:
        logger.error("Model file is None.")
        return []
    if not os.path.exists(model_file):
        logger.error("Model file does not exist.")
        return []
    return [ReportRecord().load_dict(
        dict(worker_id="1", desc=model_desc, weights_file=model_file))]
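# A minimal sketch of the configuration this function reads; the paths
# below are placeholders. With both fields set, it returns a single
# ReportRecord whose desc is loaded from the json file and whose
# weights_file points at the checkpoint:
#
#   PipeStepConfig.model.model_desc_file = "{local_base_path}/output/nas/desc_0.json"
#   PipeStepConfig.model.pretrained_model_file = "/cache/models/model_0.pth"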
def _get_abs_path(cls, _path):
    if "{local_base_path}" in _path:
        from zeus.common.task_ops import TaskOps
        return os.path.abspath(
            _path.replace("{local_base_path}", TaskOps().local_base_path))
    return _path
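# Placeholder-expansion sketch: assuming TaskOps().local_base_path were
# "/efs/tasks/0517" (an illustrative path):
#
#   cls._get_abs_path("{local_base_path}/output/nas")
#   # -> "/efs/tasks/0517/output/nas"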
def _init_env():
    if sys.version_info < (3, 6):
        sys.exit('Sorry, Python < 3.6 is not supported.')
    init_log(level=General.logger.level, log_path=TaskOps().local_log_path)
    General.env = init_cluster_args()
    _print_task_id()
def dump(self):
    """Dump generator to file."""
    step_path = TaskOps().step_path
    _file = os.path.join(step_path, ".generator")
    with open(_file, "wb") as f:
        pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
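# Round-trip sketch pairing dump() with the restore() classmethod above,
# assuming both are methods of the search generator class and run with
# the same TaskOps().step_path:
#
#   generator.dump()                 # pickles self into <step_path>/.generator
#   generator = Generator.restore()  # returns None if no .generator exists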