def _output_records(self, step_name, records):
    """Dump records."""
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        record = record.serialize()
        _record = {}
        for key in columns:
            _record[key] = record[key]
        outputs.append(deepcopy(_record))
    # Dump the selected columns of all records to a single CSV.
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    _file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(_file))
    # Collect each worker's artifacts (descriptions, hyperparameters, models,
    # performance files) into the step's output folder.
    for record in outputs:
        worker_id = record["worker_id"]
        worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
        outputs_globs = []
        outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
        outputs_globs += glob.glob(FileOps.join_path(worker_path, "hps_*.json"))
        outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*"))
        outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
        for _file in outputs_globs:
            if os.path.isfile(_file):
                FileOps.copy_file(_file, step_path)
            elif os.path.isdir(_file):
                FileOps.copy_folder(_file, FileOps.join_path(step_path, os.path.basename(_file)))
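
# A minimal, standalone sketch of the CSV dump above, using plain pandas and os
# in place of the framework's FileOps/TaskOps helpers. The column names and the
# "output.csv" file name come from the method; the record values and the /tmp
# path are illustrative assumptions.
def _example_output_csv(step_path="/tmp/vega_output/nas"):
    import os

    import pandas as pd

    outputs = [
        {"worker_id": 0, "performance": {"accuracy": 0.92}, "desc": {"depth": 18}},
        {"worker_id": 1, "performance": {"accuracy": 0.90}, "desc": {"depth": 34}},
    ]
    os.makedirs(step_path, exist_ok=True)
    pd.DataFrame(outputs).to_csv(os.path.join(step_path, "output.csv"), index=False)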
def save_report(self, records):
    """Save report to `reports.json`."""
    try:
        _file = FileOps.join_path(TaskOps().local_output_path, "reports.json")
        FileOps.make_base_dir(_file)
        data = {"_steps_": []}
        # Record every configured step, marking the ones that never ran.
        for step in self.step_names:
            if step in self.steps:
                data["_steps_"].append(self.steps[step])
            else:
                data["_steps_"].append({
                    "step_name": step,
                    "status": Status.unstarted
                })
        # Group the records by step name.
        for record in records:
            if record.step_name in data:
                data[record.step_name].append(record.to_dict())
            else:
                data[record.step_name] = [record.to_dict()]
        with open(_file, "w") as f:
            json.dump(data, f, indent=4, cls=JsonEncoder)
    except Exception:
        logging.warning(traceback.format_exc())
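
# For reference, a sketch of the `reports.json` layout that save_report
# produces: step entries under "_steps_", plus one list of record dicts per
# step name. The step names, status strings, and record values below are
# made-up placeholders.
def _example_reports_json():
    import json

    data = {
        "_steps_": [
            {"step_name": "nas", "status": "finished"},
            {"step_name": "fully_train", "status": "unstarted"},
        ],
        "nas": [
            {"worker_id": 0, "performance": {"accuracy": 0.92}},
        ],
    }
    print(json.dumps(data, indent=4))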
@classmethod
def restore(cls):
    """Restore records and report instances from the pickled `.reports` file."""
    step_path = TaskOps().step_path
    _file = os.path.join(step_path, ".reports")
    if os.path.exists(_file):
        with open(_file, "rb") as f:
            data = pickle.load(f)
        cls._hist_records = data[0]
        cls.__instances__ = data[1]
def pickle_report(self, records, report_instance):
    """Pickle report to `.reports`."""
    try:
        _file = os.path.join(TaskOps().step_path, ".reports")
        _dump_data = [records, report_instance]
        with open(_file, "wb") as f:
            pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception:
        logging.warning(traceback.format_exc())
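
# pickle_report and restore above form a checkpoint pair: a two-element list
# [records, report_instance] is dumped to `.reports` and read back in the same
# order. A minimal round-trip sketch with placeholder payloads (the file name
# and list layout match the methods; the payload values are assumptions):
def _example_reports_roundtrip(step_path="/tmp/vega_step"):
    import os
    import pickle

    os.makedirs(step_path, exist_ok=True)
    _file = os.path.join(step_path, ".reports")
    records, instance = {"nas": ["record0"]}, {"steps": {}}
    with open(_file, "wb") as f:
        pickle.dump([records, instance], f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(_file, "rb") as f:
        data = pickle.load(f)
    assert data[0] == records and data[1] == instance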
def __init__(self, name=None, **kwargs):
    """Initialize pipestep."""
    self.task = TaskOps()
    self.name = name if name else "pipestep"
    self.start_time = datetime.now()
    self.status = Status.unstarted
    self.message = None
    self.end_time = None
    self.num_epochs = None
    self.num_models = None
def _get_current_step_records(self):
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # Load records from the configured models folder, falling back to the
        # previous step's output folder.
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(models_folder)
    else:
        # First step without a models folder: start from a single fresh record.
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for record in records:
        record.step_name = step_name
    return records
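
# The models_folder value may contain a "{local_base_path}" placeholder that is
# expanded with a plain string replace, as above. A minimal sketch of that
# resolution (the folder and task names are illustrative assumptions):
def _example_resolve_models_folder():
    models_folder = "{local_base_path}/output/nas"
    local_base_path = "/tmp/vega/my_task"
    resolved = models_folder.replace("{local_base_path}", local_base_path)
    assert resolved == "/tmp/vega/my_task/output/nas"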
def _show_pipeline_info(self):
    logging.info("-" * 48)
    logging.info("  Pipeline end.")
    logging.info("")
    logging.info("  task id: {}".format(General.task.task_id))
    logging.info("  output folder: {}".format(TaskOps().local_output_path))
    logging.info("")
    self._show_step_time()
    logging.info("")
    self._show_report()
    logging.info("-" * 48)
def __init__(self, update_func=None):
    """Init master attrs, setup and start dask distributed cluster and local multiprocess pool."""
    self._checkout_cluster_existed()
    self.cfg = General()
    self.task_count = 0
    self.eval_count = General.worker.eval_count
    self.__master_path__ = FileOps.join_path(TaskOps().temp_path, "master")
    FileOps.make_dir(self.__master_path__)
    self.dask_env = DaskEnv(General.env,
                            self.__master_path__,
                            General.devices_per_trainer,
                            TaskOps().temp_path)
    status = self.dask_env.start()
    if not status or not self.dask_env.is_master:
        sys.exit(0)
    self._start_cluster()
    self.t_queue = Queue()
    self.update_func = update_func
    self._thread_running = True
    self._lock = Lock()
    self._thread = self._run_monitor_thread()
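
# The master above drives workers through dask.distributed. A minimal sketch of
# the same client/submit pattern against a standalone scheduler (the scheduler
# address and the task function are illustrative assumptions, not the
# framework's API):
def _example_dask_client(scheduler_address="tcp://127.0.0.1:8786"):
    from dask.distributed import Client

    client = Client(scheduler_address)
    future = client.submit(lambda x: x * 2, 21)
    assert future.result() == 42
    client.close()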
def _get_search_space_list(self):
    """Get search space list from models folder."""
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    if not models_folder:
        self.search_space_list = None
        return
    self.search_space_list = []
    models_folder = models_folder.replace(
        "{local_base_path}", TaskOps().local_base_path)
    pattern = FileOps.join_path(models_folder, "*.json")
    files = glob.glob(pattern)
    for file in files:
        with open(file) as f:
            self.search_space_list.append(json.load(f))
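
# _get_search_space_list expects one JSON description per file under
# models_folder. A sketch that writes and reloads such a folder (the folder,
# file name, and description contents are illustrative assumptions):
def _example_search_space_folder(models_folder="/tmp/vega_models"):
    import glob
    import json
    import os

    os.makedirs(models_folder, exist_ok=True)
    with open(os.path.join(models_folder, "desc_0.json"), "w") as f:
        json.dump({"backbone": {"depth": 18}}, f)
    search_space_list = []
    for file in glob.glob(os.path.join(models_folder, "*.json")):
        with open(file) as f:
            search_space_list.append(json.load(f))
    return search_space_list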
def __init__(self):
    """Initialize Visual callback."""
    super(VisualCallBack, self).__init__()
    self.priority = 290
    self._archive_root = TaskOps().local_visual_path
    self._fix_path = None
    self.summary = None
    self.writer = None
    self.input = None
    self.model = None
    self._need_keys = {"loss_avg", "lr"}
    self._info = {k: 0. for k in self._need_keys}
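
# The callback above tracks "loss_avg" and "lr" for visualization. Assuming a
# TensorBoard-style backend (an assumption; the actual writer is created
# elsewhere in the framework), the per-step logging looks roughly like:
def _example_scalar_logging(log_dir="/tmp/vega_visual"):
    from torch.utils.tensorboard import SummaryWriter  # assumed backend

    writer = SummaryWriter(log_dir)
    history = [{"loss_avg": 0.9, "lr": 0.1}, {"loss_avg": 0.7, "lr": 0.1}]
    for step, info in enumerate(history):
        for key, value in info.items():
            writer.add_scalar(key, value, step)
    writer.close()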
def _show_report(self):
    performance_file = FileOps.join_path(
        TaskOps().local_output_path, self.steps[-1].name, "output.csv")
    try:
        data = pd.read_csv(performance_file)
    except Exception:
        logging.info("  result file output.csv does not exist or is empty")
        return
    if data.shape[1] < 2 or data.shape[0] == 0:
        logging.info("  result file output.csv is empty")
        return
    logging.info("  result:")
    data = json.loads(data.to_json())
    for key in data["worker_id"].keys():
        logging.info("    {:>3s}: {}".format(
            str(data["worker_id"][key]), data["performance"][key]))
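
# pandas' DataFrame.to_json defaults to a column-oriented layout keyed by row
# index, which is why _show_report indexes data["worker_id"] and
# data["performance"] by the same keys. A minimal sketch of that shape (the
# values are placeholders):
def _example_to_json_shape():
    import json

    import pandas as pd

    data = pd.DataFrame([{"worker_id": 0, "performance": 0.92, "desc": "net0"}])
    parsed = json.loads(data.to_json())
    assert parsed["worker_id"] == {"0": 0}
    assert parsed["performance"] == {"0": 0.92}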
def _start_cluster(self):
    """Set up and start the dask distributed cluster."""
    self.md = ClusterDaskDistributor(self.dask_env.master_address)
    self.client = self.md.get_client()
    local_host = None
    if "BATCH_CURRENT_HOST" in os.environ:
        local_host = os.environ["BATCH_CURRENT_HOST"]
    elif "BATCH_CUSTOM0_HOSTS" in os.environ:
        local_host = os.environ["BATCH_CUSTOM0_HOSTS"]
    # Stash CUDA_VISIBLE_DEVICES so it can be restored after the worker plugin
    # is registered (registration may rewrite it).
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
    self._remove_worker_number_file()
    plugin = WorkerEnv(self.dask_env.slave_proc_num,
                       self.dask_env.slave_device_num_per_proc,
                       local_host,
                       os.getpid(),
                       TaskOps().temp_path)
    self.client.register_worker_plugin(plugin)
    if "ORIGIN_CUDA_VISIBLE_DEVICES" in os.environ:
        os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"]
    if "CUDA_VISIBLE_DEVICES" in os.environ and "ORIGIN_CUDA_VISIBLE_DEVICES" not in os.environ:
        del os.environ["CUDA_VISIBLE_DEVICES"]
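
# register_worker_plugin runs a plugin's setup() hook once on every dask
# worker; the WorkerEnv plugin above uses that hook to prepare each worker's
# environment. A minimal plugin sketch (the scheduler address and env-var name
# are illustrative assumptions):
def _example_worker_plugin(scheduler_address="tcp://127.0.0.1:8786"):
    import os

    from dask.distributed import Client, WorkerPlugin

    class EnvPlugin(WorkerPlugin):
        def setup(self, worker):
            # Runs once per worker process when the plugin is registered.
            os.environ["EXAMPLE_WORKER_READY"] = "1"

    client = Client(scheduler_address)
    client.register_worker_plugin(EnvPlugin())
    client.close()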
def _remove_worker_number_file(self):
    _worker_number_file = os.path.join(TaskOps().temp_path, ".*worker_number")
    files = glob.glob(_worker_number_file)
    for _file in files:
        os.remove(_file)
def backup_output_path(self):
    """Back up the local output folder to the backup path."""
    backup_path = TaskOps().backup_base_path
    if backup_path is None:
        return
    FileOps.copy_folder(TaskOps().local_output_path, backup_path)
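
# backup_output_path is a no-op unless a backup base path is configured; the
# copy itself is a recursive folder copy. A plain-stdlib sketch of the same
# behavior (the paths are illustrative assumptions; requires Python 3.8+ for
# dirs_exist_ok):
def _example_backup(output_path="/tmp/vega_output", backup_path=None):
    import shutil

    if backup_path is None:
        return
    shutil.copytree(output_path, backup_path, dirs_exist_ok=True)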