def dump(self): """Dump report to file.""" try: _file = FileOps.join_path(TaskOps().step_path, "reports.csv") FileOps.make_base_dir(_file) data = self.all_records data_dict = {} for step in data: step_data = step.serialize().items() for k, v in step_data: if k in data_dict: data_dict[k].append(v) else: data_dict[k] = [v] data = pd.DataFrame(data_dict) data.to_csv(_file, index=False) _file = os.path.join(TaskOps().step_path, ".reports") _dump_data = [ ReportServer._hist_records, ReportServer.__instances__ ] with open(_file, "wb") as f: pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL) self.backup_output_path() except Exception: logging.warning(traceback.format_exc())
def copy_pareto_output(self, step_name=None, worker_ids=None):
    """Copy files related to pareto from worker to output."""
    worker_ids = worker_ids or []
    taskops = TaskOps()
    local_output_path = os.path.join(taskops.local_output_path, step_name)
    if not (step_name and os.path.exists(local_output_path)):
        return
    for worker_id in worker_ids:
        desDir = os.path.join(local_output_path, str(worker_id))
        FileOps.make_dir(desDir)
        local_worker_path = taskops.get_worker_subpath(step_name, str(worker_id))
        srcDir = FileOps.join_path(taskops.local_base_path, local_worker_path)
        copy_search_file(srcDir, desDir)
def _append_record_to_csv(self, record_name=None, step_name=None, record=None, mode='a'):
    """Transfer a record to a csv file."""
    local_output_path = os.path.join(TaskOps().local_output_path, step_name)
    logging.debug("record to csv, local_output_path={}".format(local_output_path))
    if not record_name and os.path.exists(local_output_path):
        return
    file_path = os.path.join(local_output_path, "{}.csv".format(record_name))
    FileOps.make_base_dir(file_path)
    try:
        for key in record:
            if isinstance(record[key], (dict, list)):
                record[key] = str(record[key])
        data = pd.DataFrame([record])
        if not os.path.exists(file_path):
            data.to_csv(file_path, index=False)
        elif os.path.getsize(file_path) and mode == 'a':
            data.to_csv(file_path, index=False, mode=mode, header=False)
        else:
            data.to_csv(file_path, index=False, mode=mode)
    except Exception as ex:
        logging.info("Cannot transfer record to csv file. Error: {}".format(ex))
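# Illustrative sketch (not part of the original module, hypothetical helper name): the
# append behaviour used by _append_record_to_csv() above. The header is written only when
# the target file is new or empty; otherwise rows are appended without repeating it.
# Assumes pandas is imported as pd, as elsewhere in this module.
def _append_row_example(file_path, row, mode="a"):
    """Append a single dict row to a csv file, writing the header only once."""
    data = pd.DataFrame([row])
    if not os.path.exists(file_path):
        data.to_csv(file_path, index=False)  # new file: write header and first row
    elif os.path.getsize(file_path) and mode == "a":
        data.to_csv(file_path, index=False, mode=mode, header=False)  # append without header
    else:
        data.to_csv(file_path, index=False, mode=mode)  # empty file or explicit overwrite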
def _init_model(self):
    """Initialize the model for fully training.

    :return: config of the fully train model
    :rtype: config file
    """
    config = Config(self.cfg.config_template)
    config['total_epochs'] = self.cfg.epoch
    if hasattr(self.cfg, 'model_desc_file') and self.cfg.model_desc_file:
        _model_desc_file = self.cfg.model_desc_file.replace(
            "{local_base_path}", TaskOps().local_base_path)
        _total_list = ListDict.load_csv(_model_desc_file)
        pre_arch = _total_list.sort('mAP')[0]['arch']
        pretrained = pre_arch.split('_')[1]
        pre_worker_id = _total_list.sort('mAP')[0]['pre_worker_id']
        model_desc = dict(arch=pre_arch, pre_arch=pretrained, pre_worker_id=-1)
        logging.info("Initialize fully train model from: {}".format(model_desc))
        if self.cfg.regnition:
            # re-write config from previous result
            config['model']['backbone']['reignition'] = True
            config['model']['pretrained'] = os.path.join(
                self.output_path, pretrained + '_imagenet.pth')
        else:
            config['model']['pretrained'] = extract_backbone_from_pth(
                self.output_path, pre_worker_id, pretrained)
        self.sample_results = dict(
            arch=pre_arch, worker_id=self._worker_id,
            pre_arch=pre_arch, pre_worker_id=pre_worker_id)
    elif 'model_desc' in self.cfg:
        model_desc = self.cfg.model_desc
    else:
        raise ValueError('Missing model description!')
    model_desc = update_config(config, model_desc)
    return model_desc
def _save_worker_record(cls, record):
    """Save desc and performance of a worker record as json files."""
    step_name = record.get('step_name')
    worker_id = record.get('worker_id')
    _path = TaskOps().get_local_worker_path(step_name, worker_id)
    for record_name in ["desc", "performance"]:
        _file_name = None
        _file = None
        record_value = record.get(record_name)
        if not record_value:
            continue
        try:
            # for cars/darts, save multiple descs
            if isinstance(record_value, list) and record_name == "desc":
                for idx, value in enumerate(record_value):
                    _file_name = "desc_{}.json".format(idx)
                    _file = FileOps.join_path(_path, _file_name)
                    with open(_file, "w") as f:
                        json.dump(value, f)
            else:
                if record_name == "desc":
                    _file_name = "desc_{}.json".format(worker_id)
                if record_name == "performance":
                    _file_name = "performance_{}.json".format(worker_id)
                _file = FileOps.join_path(_path, _file_name)
                with open(_file, "w") as f:
                    json.dump(record_value, f)
        except Exception as ex:
            logging.error(
                "Failed to save {}, file={}, desc={}, msg={}".format(
                    record_name, _file, record_value, str(ex)))
def restore(cls):
    """Restore records from the saved .reports file."""
    step_path = TaskOps().step_path
    _file = os.path.join(step_path, ".reports")
    if os.path.exists(_file):
        with open(_file, "rb") as f:
            data = pickle.load(f)
        cls._hist_records = data[0]
        cls.__instances__ = data[1]
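# Illustrative sketch (not part of the original module, hypothetical helper names): the
# pickle round trip shared by dump() and restore(). Both sides agree on a two-item list,
# [history records, instances], written to the hidden ".reports" file.
def _save_report_state_example(file_path, hist_records, instances):
    """Write the two-item report state to a pickle file."""
    with open(file_path, "wb") as f:
        pickle.dump([hist_records, instances], f, protocol=pickle.HIGHEST_PROTOCOL)

def _load_report_state_example(file_path):
    """Read the two-item report state back in the same order it was written."""
    with open(file_path, "rb") as f:
        hist_records, instances = pickle.load(f)
    return hist_records, instances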
def _output_records(self, step_name, records, desc=True, weights_file=False, performance=False):
    """Dump records."""
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        record = record.serialize()
        _record = {}
        for key in columns:
            _record[key] = record[key]
        outputs.append(deepcopy(_record))
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    _file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(_file))
    for record in outputs:
        worker_id = record["worker_id"]
        worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
        outputs_globs = []
        if desc:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
        if weights_file:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*"))
        if performance:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
        for _file in outputs_globs:
            if os.path.isfile(_file):
                FileOps.copy_file(_file, step_path)
            elif os.path.isdir(_file):
                FileOps.copy_folder(
                    _file, FileOps.join_path(step_path, os.path.basename(_file)))
def _get_current_step_records(self):
    """Get records for the current step from the previous step's output or a models folder."""
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    records = []
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = Report().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)
    else:
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for record in records:
        record.step_name = step_name
    return records
def _get_search_space_list(self):
    """Get search space list from models folder."""
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    if not models_folder:
        self.search_space_list = None
        return
    self.search_space_list = []
    models_folder = models_folder.replace(
        "{local_base_path}", TaskOps().local_base_path)
    pattern = FileOps.join_path(models_folder, "*.json")
    files = glob.glob(pattern)
    for file in files:
        with open(file) as f:
            self.search_space_list.append(json.load(f))
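# Illustrative sketch (not part of the original module, hypothetical helper name): the
# "{local_base_path}" placeholder expansion and *.json globbing used by both
# _get_current_step_records() and _get_search_space_list() when reading a models folder.
def _load_model_descs_example(models_folder, local_base_path):
    """Expand the base-path placeholder and load every json description in the folder."""
    folder = models_folder.replace("{local_base_path}", local_base_path)
    descs = []
    for file in sorted(glob.glob(os.path.join(folder, "*.json"))):
        with open(file) as f:
            descs.append(json.load(f))
    return descs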
def __init__(self): """Initialize Visual callback.""" super(VisualCallBack, self).__init__() self.priority = 290 self._archive_root = TaskOps().local_visual_path self._fix_path = None self.summary = None self.writer = None self.input = None self.model = None self._need_keys = {"loss_avg", "lr"} self._info = {k: 0. for k in self._need_keys}
def dump(self): """Dump report to file.""" try: _file = FileOps.join_path(TaskOps().step_path, "reports.json") FileOps.make_base_dir(_file) data = {} for record in self.all_records: if record.step_name in data: data[record.step_name].append(record.to_dict()) else: data[record.step_name] = [record.to_dict()] with open(_file, "w") as f: json.dump(data, f, indent=4) _file = os.path.join(TaskOps().step_path, ".reports") _dump_data = [ ReportServer._hist_records, ReportServer.__instances__ ] with open(_file, "wb") as f: pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL) self.backup_output_path() except Exception: logging.warning(traceback.format_exc())
def dump_report(self, step_name=None, record=None):
    """Save one record."""
    try:
        if record and step_name:
            self._append_record_to_csv(
                self.REPORT_FILE_NAME, step_name, record.serialize())
        self.backup_output_path()
        step_path = TaskOps().step_path
        _file = os.path.join(step_path, ".reports")
        _dump_data = [Report._hist_records, Report.__instances__]
        with open(_file, "wb") as f:
            pickle.dump(_dump_data, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception:
        logging.warning(traceback.format_exc())
def __init__(self, update_func=None):
    """Init master attrs, set up and start dask distributed cluster and local multiprocess pool."""
    self.cfg = General()
    self.task_count = 0
    self.eval_count = General.worker.eval_count
    self.dask_env = DaskEnv(General.env, self.__master_path__,
                            General.devices_per_trainer, TaskOps().temp_path)
    status = self.dask_env.start()
    if not status or not self.dask_env.is_master:
        sys.exit(0)
    self._start_cluster()
    self.t_queue = Queue()
    self.update_func = update_func
    self.evaluator_list = {}
    self._thread_runing = True
    self._lock = Lock()
    self._thread = self._run_monitor_thread()
    ReportServer().renew()
    return
def _start_cluster(self):
    """Set and start dask distributed cluster."""
    self.md = ClusterDaskDistributor(self.dask_env.master_address)
    self.client = self.md.get_client()
    local_host = None
    if "BATCH_CURRENT_HOST" in os.environ:
        local_host = os.environ["BATCH_CURRENT_HOST"]
    elif "BATCH_CUSTOM0_HOSTS" in os.environ:
        local_host = os.environ["BATCH_CUSTOM0_HOSTS"]
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
    self._remove_worker_number_file()
    plugin = WorkerEnv(self.dask_env.slave_proc_num,
                       self.dask_env.slave_device_num_per_proc,
                       local_host, os.getpid(), TaskOps().temp_path)
    self.client.register_worker_plugin(plugin)
    if "ORIGIN_CUDA_VISIBLE_DEVICES" in os.environ:
        os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["ORIGIN_CUDA_VISIBLE_DEVICES"]
    if "CUDA_VISIBLE_DEVICES" in os.environ and "ORIGIN_CUDA_VISIBLE_DEVICES" not in os.environ:
        del os.environ["CUDA_VISIBLE_DEVICES"]
    return
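# Illustrative sketch (not part of the original module, hypothetical helper names): the
# save/restore pattern _start_cluster() applies to CUDA_VISIBLE_DEVICES. The value is
# stashed under a backup key before worker registration, then either restored afterwards
# or removed entirely if nothing was stashed.
def _stash_env_example(name, backup_name):
    """Stash an environment variable under a backup key."""
    if name in os.environ:
        os.environ[backup_name] = os.environ[name]

def _restore_env_example(name, backup_name):
    """Restore the stashed value, or drop the stale one if nothing was stashed."""
    if backup_name in os.environ:
        os.environ[name] = os.environ[backup_name]
    elif name in os.environ:
        del os.environ[name]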
def backup_output_path(self):
    """Back up output to local path."""
    backup_path = TaskOps().backup_base_path
    if backup_path is None:
        return
    FileOps.copy_folder(TaskOps().local_output_path, backup_path)
def _remove_worker_number_file(self):
    """Remove temporary worker number files from the task temp path."""
    _worker_number_file = os.path.join(TaskOps().temp_path, ".*worker_number")
    files = glob.glob(_worker_number_file)
    for _file in files:
        os.remove(_file)
def __init__(self):
    self.task = TaskOps()