def save_results(self):
    """Save the results of evolution, including the population and elitism information."""
    step_name = Config(deepcopy(UserConfig().data)).general.step_name
    _path = FileOps.join_path(self.local_output_path, step_name)
    FileOps.make_dir(_path)
    arch_file = FileOps.join_path(_path, 'arch.txt')
    arch_child = FileOps.join_path(_path, 'arch_child.txt')
    sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
    sel_arch = []
    with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
        writer_a = csv.writer(fw_a, lineterminator='\n')
        writer_ac = csv.writer(fw_ac, lineterminator='\n')
        writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.individual_num):
            writer_ac.writerow(
                self._log_data(net_info_type='active_only', pop=self.pop[c],
                               value=self.pop[c].fitness))
        writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.elitism_num):
            writer_a.writerow(
                self._log_data(net_info_type='active_only', pop=self.elitism[c],
                               value=self.elit_fitness[c]))
            sel_arch.append(self.elitism[c].gene)
    sel_arch = np.stack(sel_arch)
    np.save(sel_arch_file, sel_arch)
    if self.backup_base_path is not None:
        FileOps.copy_folder(self.local_output_path, self.backup_base_path)
def load_checkpoint(self, worker_id=None, step_name=None, saved_folder=None):
    """Load checkpoint."""
    if saved_folder is None:
        if worker_id is None:
            worker_id = self.worker_id
        if step_name is None:
            step_name = self.step_name
        saved_folder = self.get_local_worker_path(step_name, worker_id)
    checkpoint_file = FileOps.join_path(saved_folder, self.checkpoint_file_name)
    model_pickle_file = FileOps.join_path(saved_folder, self.model_pickle_file_name)
    try:
        with open(model_pickle_file, 'rb') as f:
            model = pickle.load(f)
            ckpt = torch.load(checkpoint_file, map_location=torch.device('cpu'))
            model.load_state_dict(ckpt['weight'])
            if self.cfg.cuda:
                model = model.cuda()
            self.model = model
    except Exception:
        logging.info('Checkpoint file does not exist, using the default model instead.')
        return
def _save_performance(self, performance, model_desc=None):
    """Save the result of the model and calculate the pareto front.

    :param performance: dict that contains all the results needed
    :param model_desc: config of the model
    """
    performance_str = json.dumps(performance, indent=4, sort_keys=True)
    self.trainer._save_performance(performance_str)
    method = model_desc.method
    code = model_desc.code
    metric_method = self.cfg.metric.method
    FileOps.make_dir(self.result_path)
    result_file_name = FileOps.join_path(self.result_path, "{}.csv".format(method))
    header = "Code,GFlops,KParams,{0},Best {0},Worker_id\n".format(metric_method)
    if not os.path.exists(result_file_name):
        with open(result_file_name, 'w') as file:
            file.write(header)
    with open(result_file_name, 'a') as file:
        file.write('{},{},{},{},{},{}\n'.format(
            code, performance['gflops'], performance['kparams'],
            performance["cur_valid_perf"], performance["best_valid_perf"],
            self.trainer.worker_id))
    logging.info("Model result saved to {}".format(result_file_name))
    self._save_pareto_front("GFlops", "Best {}".format(metric_method))
def _save_checkpoint(self, epoch, best=False):
    """Save model weights.

    :param epoch: current epoch
    :type epoch: int
    :param best: whether to also save these weights as the best model so far
    :type best: bool
    """
    save_dir = os.path.join(self.worker_path, str(epoch))
    FileOps.make_dir(save_dir)
    for name in self.model.model_names:
        if isinstance(name, str):
            save_filename = '%s_net_%s.pth' % (epoch, name)
            save_path = FileOps.join_path(save_dir, save_filename)
            net = getattr(self.model, 'net' + name)
            best_file = FileOps.join_path(self.worker_path, "model_{}.pth".format(name))
            if self.cfg.cuda and torch.cuda.is_available():
                torch.save(net.module.state_dict(), save_path)
                if best:
                    torch.save(net.module.state_dict(), best_file)
            else:
                torch.save(net.cpu().state_dict(), save_path)
                if best:
                    torch.save(net.cpu().state_dict(), best_file)
def _save_worker_record(cls, record):
    """Save the desc and performance of a record to the worker path."""
    step_name = record.get('step_name')
    worker_id = record.get('worker_id')
    _path = TaskOps().get_local_worker_path(step_name, worker_id)
    for record_name in ["desc", "performance"]:
        _file_name = None
        _file = None
        record_value = record.get(record_name)
        if not record_value:
            continue
        try:
            # cars/darts produce multiple descs: save one file per desc
            if isinstance(record_value, list) and record_name == "desc":
                for idx, value in enumerate(record_value):
                    _file_name = "desc_{}.json".format(idx)
                    _file = FileOps.join_path(_path, _file_name)
                    with open(_file, "w") as f:
                        json.dump(value, f)
            else:
                if record_name == "desc":
                    _file_name = "desc_{}.json".format(worker_id)
                if record_name == "performance":
                    _file_name = "performance_{}.json".format(worker_id)
                _file = FileOps.join_path(_path, _file_name)
                with open(_file, "w") as f:
                    json.dump(record_value, f)
        except Exception as ex:
            logging.error(
                "Failed to save {}, file={}, desc={}, msg={}".format(
                    record_name, _file, record_value, str(ex)))
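# Example record handled above (all values hypothetical): a list-valued "desc"
# (cars/darts) fans out to desc_0.json, desc_1.json, ..., while a single desc
# goes to desc_<worker_id>.json and "performance" to performance_<worker_id>.json.
#
#   record = {
#       "step_name": "nas",
#       "worker_id": 7,
#       "desc": [{"type": "DartsNetwork"}, {"type": "DartsNetwork"}],
#       "performance": {"accuracy": 0.93},
#   }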
def _copy_needed_file(self):
    """Copy the configured pareto front and random-search files into this step's paths."""
    if "pareto_front_file" in self.cfg and self.cfg.pareto_front_file is not None:
        init_pareto_front_file = self.cfg.pareto_front_file.replace(
            "{local_base_path}", self.local_base_path)
        self.pareto_front_file = FileOps.join_path(self.result_path, "pareto_front.csv")
        FileOps.copy_file(init_pareto_front_file, self.pareto_front_file)
    if "random_file" in self.cfg and self.cfg.random_file is not None:
        init_random_file = self.cfg.random_file.replace(
            "{local_base_path}", self.local_base_path)
        self.random_file = FileOps.join_path(
            self.local_output_path, self.cfg.step_name, "random.csv")
        FileOps.copy_file(init_random_file, self.random_file)
def dump_model_visual_info(trainer, epoch, model, inputs):
    """Dump model to tensorboard event files.

    :param trainer: trainer, an object inherited from DistributedWorker.
    :type trainer: object
    :param epoch: current epoch.
    :type epoch: int
    :param model: model.
    :type model: model
    :param inputs: input data.
    :type inputs: data
    """
    (_, visual, interval, title, worker_id, output_path) = _get_trainer_info(trainer)
    if visual is not True:
        return
    if epoch % interval != 0:
        return
    title = str(worker_id)
    _path = FileOps.join_path(output_path, title)
    FileOps.make_dir(_path)
    try:
        with SummaryWriter(_path) as writer:
            writer.add_graph(model, (inputs,))
    except Exception as e:
        logging.error(
            "Failed to dump model visual info, worker id: {}, epoch: {}, error: {}".format(
                worker_id, epoch, str(e)))
def before_train(self, logs=None):
    """Call before_train of the managed callbacks and load the stage-1 HPO result."""
    super().before_train(logs)
    hpo_result = FileOps.load_pickle(
        FileOps.join_path(self.trainer.local_output_path, 'best_config.pickle'))
    logging.info("loading stage1_hpo_result \n{}".format(hpo_result))
    feature_interaction_score = hpo_result['feature_interaction_score']
    logging.info('feature_interaction_score: {}'.format(feature_interaction_score))
    sorted_pairs = sorted(feature_interaction_score.items(),
                          key=lambda x: abs(x[1]), reverse=True)
    model_cfg = ClassFactory.__configs__.get('model')
    if model_cfg:
        fis_ratio = model_cfg["model_desc"]["custom"]["fis_ratio"]
    else:
        fis_ratio = 1.0
    top_k = int(len(feature_interaction_score) * min(1.0, fis_ratio))
    self.selected_pairs = list(map(lambda x: x[0], sorted_pairs[:top_k]))
    # record the selected pairs in the model config
    setattr(model_cfg["model_desc"]["custom"], 'selected_pairs', self.selected_pairs)
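# Selection sketch with hypothetical scores: given
#   feature_interaction_score = {('a','b'): 0.9, ('a','c'): -0.5, ('b','c'): 0.1}
# and fis_ratio = 0.5, pairs are ranked by absolute score, so
# top_k = int(3 * 0.5) = 1 and selected_pairs == [('a', 'b')].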
def after_valid(self, logs=None):
    """Call after_valid of the managed callbacks and keep the best config."""
    self.model = self.trainer.model
    feature_interaction_score = self.model.get_feature_interaction_score()
    logging.info('feature_interaction_score: {}'.format(feature_interaction_score))
    feature_interaction = []
    for feature in feature_interaction_score:
        if abs(feature_interaction_score[feature]) > 0:
            feature_interaction.append(feature)
    logging.info('feature_interaction: {}'.format(feature_interaction))
    curr_auc = float(self.trainer.valid_metrics.results['auc'])
    if curr_auc > self.best_score:
        best_config = {
            'score': curr_auc,
            'feature_interaction': feature_interaction
        }
        logging.info("BEST CONFIG IS\n{}".format(best_config))
        pickle_result_file = FileOps.join_path(
            self.trainer.local_output_path, 'best_config.pickle')
        logging.info("Saved to {}".format(pickle_result_file))
        FileOps.dump_pickle(best_config, pickle_result_file)
        self.best_score = curr_auc
def before_train(self, logs=None):
    """Be called before the whole train process."""
    self.cfg = self.trainer.cfg
    self.trainer.auto_save_ckpt = False
    self.trainer.auto_save_perf = False
    self.result_path = FileOps.join_path(self.trainer.local_base_path, "result")
def _get_current_step_records(self):
    step_name = General.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)
    else:
        records = self._load_single_model_records()
    final_records = []
    for record in records:
        if not record.weights_file:
            logger.error("Model file does not exist, id={}".format(record.worker_id))
        else:
            record.step_name = General.step_name
            final_records.append(record)
    logging.debug("Records: {}".format(final_records))
    return final_records
def output_model_desc(self, id=None, model_desc=None, performance=None):
    """Save model desc and performance.

    :param id: model desc id, usually the worker id.
    :type id: int or str.
    :param model_desc: model description.
    :type model_desc: json.
    :param performance: performance value, e.g. {"accuracy": 98.23}.
    :type performance: json.
    """
    if id is None:
        id = self.worker_id
    if model_desc is None:
        if not hasattr(self, "model_desc"):
            logger.error("Failed to save model desc, param 'model_desc' is not assigned.")
            return
        model_desc = self.model_desc
    _file = FileOps.join_path(self.local_output_path, self.step_name,
                              "model_desc_{}.json".format(str(id)))
    FileOps.make_base_dir(_file)
    try:
        with open(_file, "w") as f:
            json.dump(model_desc, f)
    except Exception as ex:
        logger.error("Failed to save model desc, file={}, desc={}, msg={}".format(
            _file, model_desc, str(ex)))
        return
    if performance is not None:
        self.output_evaluate_result(id, performance)
def _evaluate_esr_models(self, esr_models_file, models_folder):
    models_folder = models_folder.replace("{local_base_path}", self.task.local_base_path)
    models_folder = os.path.abspath(models_folder)
    esr_models_file = esr_models_file.replace("{local_base_path}", self.task.local_base_path)
    esr_models_file = os.path.abspath(esr_models_file)
    archs = np.load(esr_models_file)
    for i, arch in enumerate(archs):
        try:
            cls_gpu_evaluator = ClassFactory.get_cls(ClassType.GPU_EVALUATOR)
        except Exception:
            logger.error("Failed to create Evaluator, please check the config file.")
            logger.error(traceback.format_exc())
            return
        pretrained_model = FileOps.join_path(models_folder, "model_{}.pth".format(i))
        if not os.path.exists(pretrained_model):
            logger.error("Failed to find model file, file={}".format(pretrained_model))
            # skip this arch instead of evaluating with a missing weights file
            continue
        cls_gpu_evaluator.cfg.model_arch = arch
        cls_gpu_evaluator.cfg.pretrained_model_file = pretrained_model
        try:
            evaluator = cls_gpu_evaluator()
            evaluator.train_process()
            evaluator.output_evaluate_result(i, evaluator.evaluate_result)
        except Exception:
            logger.error("Failed to evaluate model, id={}, pretrained_model={}".format(
                i, pretrained_model))
            logger.error(traceback.format_exc())
            return
def output_evaluate_result(self, id=None, performance=None, evaluate_type="gpu"):
    """Save model performance.

    :param id: model desc id, usually the worker id.
    :type id: int or str.
    :param performance: performance value, e.g. {"accuracy": 98.23}.
    :type performance: json.
    :param evaluate_type: evaluate type, e.g. "gpu", "davinci", "arm".
    :type evaluate_type: str.
    """
    if performance is None:
        return
    if id is None:
        id = self.worker_id
    _file = FileOps.join_path(
        self.local_output_path, self.step_name,
        "performance_{}_{}.txt".format(evaluate_type, str(id)))
    FileOps.make_base_dir(_file)
    try:
        performance = str(performance)
        with open(_file, "w") as f:
            f.write(performance)
    except Exception as ex:
        logger.error("Failed to save performance, file={}, pfm={}, msg={}".format(
            _file, performance, str(ex)))
        return
def output_hps(self, id=None, hps=None):
    """Save hyperparameters.

    :param id: model desc id, usually the worker id.
    :type id: int or str.
    :param hps: hyper parameters.
    :type hps: json.
    """
    if id is None:
        id = self.worker_id
    if hps is None:
        if not hasattr(self, "hps"):
            logger.error("Failed to save hyperparameters, param 'hps' is not assigned.")
            return
        hps = self.hps
    _file = FileOps.join_path(self.local_output_path, self.step_name,
                              "hyperparameters.json")
    FileOps.make_base_dir(_file)
    try:
        with open(_file, "w") as f:
            json.dump({str(id): hps}, f)
    except Exception as ex:
        logger.error("Failed to save hyperparameters, file={}, hps={}, msg={}".format(
            _file, hps, str(ex)))
        return
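# Taken together, output_model_desc / output_evaluate_result / output_hps write,
# per step, files like the following (worker id 7 and step name "nas" are
# hypothetical examples):
#   <local_output_path>/nas/model_desc_7.json
#   <local_output_path>/nas/performance_gpu_7.txt
#   <local_output_path>/nas/hyperparameters.json   # contents: {"7": {...}}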
def load_records_from_model_folder(cls, model_folder):
    """Transfer the desc json files in a model folder to records."""
    if not model_folder or not os.path.exists(model_folder):
        logging.error("Failed to load records from model folder, folder={}".format(
            model_folder))
        return []
    records = []
    pattern = FileOps.join_path(model_folder, "desc_*.json")
    files = glob.glob(pattern)
    for _file in files:
        try:
            with open(_file) as f:
                worker_id = _file.split(".")[-2].split("_")[-1]
                weights_file = os.path.join(
                    os.path.dirname(_file), "model_{}.pth".format(worker_id))
                if os.path.exists(weights_file):
                    sample = dict(worker_id=worker_id, desc=json.load(f),
                                  weights_file=weights_file)
                else:
                    sample = dict(worker_id=worker_id, desc=json.load(f))
                record = ReportRecord().load_dict(sample)
                records.append(record)
        except Exception as ex:
            logging.info('Cannot read records from json file: {}'.format(ex))
    return records
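# The loader above expects a folder layout like the following (worker id 12 is
# a hypothetical example); the worker id is recovered from the desc file name:
#   <model_folder>/desc_12.json    # model description, becomes record.desc
#   <model_folder>/model_12.pth    # optional matching weights file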
def _init_model(self, model=None):
    """Load the model desc from the save path and parse it into a model."""
    if model is not None:
        return model
    model_cfg = ClassFactory.__configs__.get('model')
    if 'model_desc_file' in model_cfg and model_cfg.model_desc_file is not None:
        desc_file = model_cfg.model_desc_file.replace("{model_zoo}", self.model_zoo_path)
        desc_file = desc_file.replace("{local_base_path}", self.local_base_path)
        if ":" not in desc_file:
            desc_file = os.path.abspath(desc_file)
        if ":" in desc_file:
            # remote path (e.g. s3://...): copy the desc file to the local output path first
            local_desc_file = FileOps.join_path(
                self.local_output_path, os.path.basename(desc_file))
            FileOps.copy_file(desc_file, local_desc_file)
            desc_file = local_desc_file
        if self.horovod:
            hvd.join()
        model_desc = Config(desc_file)
        logging.info("net_desc: {}".format(model_desc))
    elif 'model_desc' in model_cfg and model_cfg.model_desc is not None:
        model_desc = model_cfg.model_desc
    else:
        return None
    # model_desc is guaranteed non-None on the remaining branches
    self.model_desc = model_desc
    net_desc = NetworkDesc(model_desc)
    return net_desc.to_model()
def _backup(self):
    """Backup result worker folder."""
    if self.need_backup is True and self.backup_base_path is not None:
        backup_worker_path = FileOps.join_path(
            self.backup_base_path, self.get_worker_subpath())
        FileOps.copy_folder(self.get_local_worker_path(), backup_worker_path)
def _save_checkpoint(self, epoch):
    """Save checkpoint."""
    checkpoint_file = FileOps.join_path(
        self.get_local_worker_path(), self.checkpoint_file_name)
    model_pickle_file = FileOps.join_path(
        self.get_local_worker_path(), self.model_pickle_file_name)
    # pickle the whole model object
    with open(model_pickle_file, 'wb') as handle:
        pickle.dump(self.model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # save the checkpoint of weights and training state
    ckpt = {
        'epoch': epoch,
        'weight': self.model.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'lr_scheduler': self.lr_scheduler.state_dict(),
    }
    torch.save(ckpt, checkpoint_file)
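# Round-trip sketch with load_checkpoint above (hedged: the concrete values of
# checkpoint_file_name / model_pickle_file_name come from the surrounding
# worker class and are not shown here):
#
#   worker._save_checkpoint(epoch=10)   # writes the pickled model + ckpt dict
#   worker.load_checkpoint()            # unpickles the model, restores
#                                       # ckpt['weight'] on CPU, then .cuda()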
def _init_npu_estimator(self, sess_config):
    """Create the NPU estimator for this worker."""
    model_dir = self.get_local_worker_path()
    if self.distributed:
        model_dir = FileOps.join_path(model_dir, str(self._local_rank_id))
    config = NPURunConfig(model_dir=model_dir,
                          save_checkpoints_steps=self.config.save_steps,
                          log_step_count_steps=self.config.report_freq,
                          session_config=sess_config,
                          enable_data_pre_proc=True,
                          iterations_per_loop=1)
    self.estimator = NPUEstimator(model_fn=self.model_fn, config=config)
def before_train(self, logs=None):
    """Be called before the whole train process."""
    self.trainer.config.call_metrics_on_train = False
    self.cfg = self.trainer.config
    self.worker_id = self.trainer.worker_id
    self.local_base_path = self.trainer.local_base_path
    self.local_output_path = self.trainer.local_output_path
    self.result_path = FileOps.join_path(self.trainer.local_base_path, "result")
    FileOps.make_dir(self.result_path)
    self.logger_patch()
def set_trainer(self, trainer):
    """Set trainer object for current callback."""
    self.trainer = trainer
    self.trainer._train_loop = self.train_process
    self.cfg = self.trainer.config
    self._worker_id = self.trainer._worker_id
    if hasattr(self.cfg, "kwargs") and "spnas_sample" in self.cfg.kwargs:
        self.sample_result = self.cfg.kwargs["spnas_sample"]
    self.worker_path = self.trainer.get_local_worker_path()
    self.output_path = self.trainer.local_output_path
    self.best_model_name = "model_best"
    self.best_model_file = FileOps.join_path(
        self.worker_path, "model_{}.pth".format(self.trainer.worker_id))
def _init_model(self, model=None):
    """Load the model desc from the save path and parse it into a model."""
    if model is not None:
        if vega.is_torch_backend() and self.use_cuda:
            model = model.cuda()
        return model
    model_cfg = Config(ClassFactory.__configs__.get('model'))
    if "model_desc_file" in model_cfg and model_cfg.model_desc_file is not None:
        desc_file = model_cfg.model_desc_file
        desc_file = desc_file.replace("{local_base_path}", self.local_base_path)
        if ":" not in desc_file:
            desc_file = os.path.abspath(desc_file)
        if ":" in desc_file:
            # remote path: copy the desc file to the local output path first
            local_desc_file = FileOps.join_path(
                self.local_output_path, os.path.basename(desc_file))
            FileOps.copy_file(desc_file, local_desc_file)
            desc_file = local_desc_file
        model_desc = Config(desc_file)
        logging.info("net_desc: {}".format(model_desc))
    elif "model_desc" in model_cfg and model_cfg.model_desc is not None:
        model_desc = model_cfg.model_desc
    elif "models_folder" in model_cfg and model_cfg.models_folder is not None:
        folder = model_cfg.models_folder.replace("{local_base_path}", self.local_base_path)
        pattern = FileOps.join_path(folder, "desc_*.json")
        desc_file = glob.glob(pattern)[0]
        model_desc = Config(desc_file)
    else:
        return None
    # model_desc is guaranteed non-None on the remaining branches
    self.model_desc = model_desc
    net_desc = NetworkDesc(model_desc)
    model = net_desc.to_model()
    if vega.is_torch_backend() and self.use_cuda:
        model = model.cuda()
    return model
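# The branches above resolve the model desc from a model config shaped roughly
# like one of the following (a sketch; the keys are the ones referenced above,
# the values are hypothetical):
#
#   model:
#     model_desc_file: "{local_base_path}/output/nas/desc_0.json"
#   # or an inline description:
#   model:
#     model_desc: {modules: [backbone], backbone: {type: ResNet}}
#   # or a folder of candidates, from which the first desc_*.json is taken:
#   model:
#     models_folder: "{local_base_path}/output/nas/"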
def _save_pareto_front(self, metric_x, metric_y):
    """Save the pareto front of the searched models.

    :param metric_x: x axis of the pareto front
    :param metric_y: y axis of the pareto front
    """
    df_all = pd.read_csv(FileOps.join_path(self.result_path, "random.csv"))
    mutate_csv = FileOps.join_path(self.result_path, 'mutate.csv')
    if os.path.exists(mutate_csv):
        df_mutate = pd.read_csv(mutate_csv)
        df_all = pd.concat([df_all, df_mutate], ignore_index=True)
    # sweep the models in ascending order of cost (metric_x) and keep a row
    # only if it improves on the best metric_y seen so far: the kept rows are
    # exactly the non-dominated points
    current_best = 0
    df_result = pd.DataFrame(columns=df_all.columns)
    df_all = df_all.sort_values(by=metric_x)
    for _, row in df_all.iterrows():
        if row[metric_y] > current_best:
            current_best = row[metric_y]
            df_result.loc[len(df_result)] = row
    result_file_name = FileOps.join_path(self.result_path, "pareto_front.csv")
    df_result.to_csv(result_file_name, index=False)
    logger.info("Pareto front updated to {}".format(result_file_name))
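# A standalone illustration of the sweep above, with hypothetical numbers:
#
#   import pandas as pd
#   df = pd.DataFrame({"GFlops": [0.5, 0.8, 1.2],
#                      "Best accuracy": [70.1, 69.5, 72.3]})
#   best, front = 0, []
#   for _, row in df.sort_values(by="GFlops").iterrows():
#       if row["Best accuracy"] > best:
#           best = row["Best accuracy"]
#           front.append(row)
#   # front keeps rows 0 and 2; row 1 is dominated (costlier than row 0
#   # while less accurate), so it is dropped.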
def _output_records(self, step_name, records, desc=True, weights_file=False, performance=False):
    """Dump records."""
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        record = record.serialize()
        _record = {}
        for key in columns:
            _record[key] = record[key]
        outputs.append(deepcopy(_record))
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    _file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(_file))
    for record in outputs:
        worker_id = record["worker_id"]
        worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
        outputs_globs = []
        if desc:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
        if weights_file:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*.pth"))
        if performance:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
        for _file in outputs_globs:
            FileOps.copy_file(_file, step_path)
def logger_patch(self):
    """Patch the default logger to write to a worker-specific log file."""
    worker_path = self.trainer.get_local_worker_path()
    worker_spec_log_file = FileOps.join_path(worker_path, 'current_worker.log')
    logger = logging.getLogger(__name__)
    for hdlr in logger.handlers:
        logger.removeHandler(hdlr)
    for hdlr in logging.root.handlers:
        logging.root.removeHandler(hdlr)
    logger.addHandler(logging.FileHandler(worker_spec_log_file))
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    # module-level logging.info/error/... calls now go through this logger
    logging.root = logger
def _save_descript(self):
    """Save the result description."""
    template_file = self.config.darts_template_file
    genotypes = self.search_alg.codec.calc_genotype(self._get_arch_weights())
    if template_file == "{default_darts_cifar10_template}":
        template = DartsNetworkTemplateConfig.cifar10
    elif template_file == "{default_darts_imagenet_template}":
        template = DartsNetworkTemplateConfig.imagenet
    else:
        dst = FileOps.join_path(self.trainer.get_local_worker_path(),
                                os.path.basename(template_file))
        FileOps.copy_file(template_file, dst)
        template = Config(dst)
    model_desc = self._gen_model_desc(genotypes, template)
    self.trainer.config.codec = model_desc
def copy_pareto_output(self, step_name=None, worker_ids=None):
    """Copy files related to the pareto front from the workers to the output."""
    worker_ids = worker_ids or []  # avoid a mutable default argument
    if step_name is None:
        return
    taskops = TaskOps()
    local_output_path = os.path.join(taskops.local_output_path, step_name)
    if not os.path.exists(local_output_path):
        return
    for worker_id in worker_ids:
        dest_dir = os.path.join(local_output_path, str(worker_id))
        FileOps.make_dir(dest_dir)
        local_worker_path = taskops.get_worker_subpath(step_name, str(worker_id))
        src_dir = FileOps.join_path(taskops.local_base_path, local_worker_path)
        copy_search_file(src_dir, dest_dir)
def after_train(self, logs=None):
    """Call after_train of the managed callbacks."""
    curr_auc = float(self.trainer.valid_metrics.results['auc'])
    self.sieve_board = self.sieve_board.append(
        {
            'selected_feature_pairs': self.selected_pairs,
            'score': curr_auc
        }, ignore_index=True)
    result_file = FileOps.join_path(
        self.trainer.local_output_path,
        '{}_result.csv'.format(self.trainer.__worker_id__))
    self.sieve_board.to_csv(result_file, sep='\t')
def _save_model_desc(self):
    """Decode each code on the pareto front and output the model desc."""
    search_space = SearchSpace()
    codec = Codec(self.cfg.codec, search_space)
    pareto_front_df = pd.read_csv(
        FileOps.join_path(self.result_path, "pareto_front.csv"))
    codes = pareto_front_df['Code']
    for i in range(len(codes)):
        search_desc = Config()
        search_desc.custom = deepcopy(search_space.search_space.custom)
        search_desc.modules = deepcopy(search_space.search_space.modules)
        code = codes.loc[i]
        search_desc.custom.code = code
        search_desc.custom.method = 'full'
        codec.decode(search_desc.custom)
        self.trainer.output_model_desc(i, search_desc)