def _output_records(self, step_name, records):
    """Write a summary csv of the records and gather the workers' output files.

    Each record is serialized and reduced to the `worker_id`, `performance`
    and `desc` fields. The rows are written to `output.csv` under the step's
    local output folder; a failure to write the csv is logged, not raised.
    Afterwards, for every row, the `desc_*.json`, `hps_*.json`, `model_*` and
    `performance_*.json` entries found in that worker's folder are copied into
    the step folder.

    :param step_name: name of the step, used to locate the output and worker folders
    :param records: iterable of objects providing `serialize()`
    """
    columns = ("worker_id", "performance", "desc")
    outputs = []
    for item in records:
        serialized = item.serialize()
        outputs.append(deepcopy({key: serialized[key] for key in columns}))
    data = pd.DataFrame(outputs)

    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    csv_file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(csv_file, index=False)
    except Exception:
        # Best effort: the per-worker files below are still collected.
        logging.error("Failed to save output file, file={}".format(csv_file))

    patterns = ("desc_*.json", "hps_*.json", "model_*", "performance_*.json")
    for row in outputs:
        worker_path = TaskOps().get_local_worker_path(step_name, row["worker_id"])
        matches = []
        for pattern in patterns:
            matches.extend(glob.glob(FileOps.join_path(worker_path, pattern)))
        for found in matches:
            if os.path.isfile(found):
                FileOps.copy_file(found, step_path)
            elif os.path.isdir(found):
                # Directories keep their own name below the step folder.
                FileOps.copy_folder(
                    found, FileOps.join_path(step_path, os.path.basename(found)))
def save_results(self):
    """Append the current population and elitism to the result files.

    Under `<local_output_path>/<General.step_name>` this appends one header row
    plus one row per individual to `arch_child.txt`, one header row plus one
    row per elite to `arch.txt`, and stores the stacked elite genes in
    `selected_arch.npy`. When `backup_base_path` is set, the whole local output
    folder is copied there afterwards.
    """
    output_dir = FileOps.join_path(self.local_output_path, General.step_name)
    FileOps.make_dir(output_dir)
    elite_log = FileOps.join_path(output_dir, 'arch.txt')
    child_log = FileOps.join_path(output_dir, 'arch_child.txt')
    selected_file = FileOps.join_path(output_dir, 'selected_arch.npy')

    header = ['Population Iteration: ' + str(self.evolution_count + 1)]
    selected_genes = []
    with open(elite_log, 'a') as elite_fh, open(child_log, 'a') as child_fh:
        elite_writer = csv.writer(elite_fh, lineterminator='\n')
        child_writer = csv.writer(child_fh, lineterminator='\n')

        child_writer.writerow(header)
        for idx in range(self.individual_num):
            individual = self.pop[idx]
            child_writer.writerow(
                self._log_data(net_info_type='active_only',
                               pop=individual,
                               value=individual.fitness))

        elite_writer.writerow(header)
        for idx in range(self.elitism_num):
            elite = self.elitism[idx]
            elite_writer.writerow(
                self._log_data(net_info_type='active_only',
                               pop=elite,
                               value=self.elit_fitness[idx]))
            selected_genes.append(elite.gene)

    np.save(selected_file, np.stack(selected_genes))
    if self.backup_base_path is not None:
        FileOps.copy_folder(self.local_output_path, self.backup_base_path)
def _save_checkpoint(self, epoch, best=False):
    """Save the weights of every named network of the model.

    For each string entry `name` in `self.model.model_names`, the state dict of
    the attribute `net<name>` is written to
    `<worker_path>/<epoch>/<epoch>_net_<name>.pth`. When `best` is true the same
    state is also written to `<worker_path>/model_<name>.pth`.

    :param epoch: current epoch
    :type epoch: int
    :param best: whether to additionally store the weights as the best model
    :type best: bool
    """
    save_dir = os.path.join(self.worker_path, str(epoch))
    FileOps.make_dir(save_dir)
    for name in self.model.model_names:
        if not isinstance(name, str):
            continue
        save_path = FileOps.join_path(save_dir, '%s_net_%s.pth' % (epoch, name))
        net = getattr(self.model, 'net' + name)
        best_file = FileOps.join_path(self.worker_path, "model_{}.pth".format(name))

        targets = [save_path]
        if best:
            targets.append(best_file)

        if vega.is_gpu_device() and torch.cuda.is_available():
            # On GPU the weights are read from `net.module`.
            for target in targets:
                torch.save(net.module.state_dict(), target)
        elif vega.is_npu_device():
            for target in targets:
                torch.save(net.state_dict(), target)
        else:
            # Any other device: `net.cpu()` is called before reading the state.
            for target in targets:
                torch.save(net.cpu().state_dict(), target)
def before_train(self, logs=None):
    """Prepare the callback before the whole train process starts.

    Disables `call_metrics_on_train` on the trainer config, mirrors the
    trainer's config, worker id and base/output paths onto this callback,
    creates the `result` folder below the trainer's local base path and
    finally calls `logger_patch()`.

    :param logs: unused by this method
    """
    trainer = self.trainer
    trainer.config.call_metrics_on_train = False
    self.cfg = trainer.config
    self.worker_id = trainer.worker_id
    self.local_base_path = trainer.local_base_path
    self.local_output_path = trainer.local_output_path
    self.result_path = FileOps.join_path(trainer.local_base_path, "result")
    FileOps.make_dir(self.result_path)
    self.logger_patch()
def update(self, record):
    """Update the score board and refresh the cached checkpoint of the worker.

    After delegating to the parent `update`, the worker's result folder is
    copied to `<local_base_path>/cache/<worker_id>/checkpoint`, replacing any
    previous content of that cache folder.

    :param record: mapping-like record; its `worker_id` and `step_name`
        entries are read with `get`
    """
    super().update(record)
    worker_id = str(record.get('worker_id'))
    source_dir = self.get_local_worker_path(record.get('step_name'), worker_id)
    cache_dir = FileOps.join_path(
        self.local_base_path, 'cache', worker_id, 'checkpoint')
    for folder in (source_dir, cache_dir):
        FileOps.make_dir(folder)
    # copytree requires that the destination does not exist yet.
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    shutil.copytree(source_dir, cache_dir)
def load_master_ip():
    """Read the master ip and port from `ip_address.txt` in the temp folder.

    Nothing is downloaded from S3 here; only the local file is consulted.

    :return: `(ip, port)` as stripped strings taken from the first two lines
        of the file, or `(None, None)` when the file does not exist
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    if not os.path.isfile(file_path):
        return None, None
    with open(file_path, 'r') as handle:
        ip = handle.readline().strip()
        port = handle.readline().strip()
    logging.info("get write ip, ip={}, port={}".format(ip, port))
    return ip, port
def save_master_ip(ip_address, port, args):
    """Store the master ip and port in `ip_address.txt` in the temp folder.

    The file is overwritten and holds the ip on the first line and the port
    on the second line.

    :param str ip_address: the ip address to write
    :param str port: the port to write
    :param argparse.ArgumentParser args: accepted for interface compatibility;
        not read by this function
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    target = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(target))
    with open(target, 'w') as handle:
        for value in (ip_address, port):
            handle.write(value + "\n")
def _saved_multi_checkpoint(self, epoch):
    """Save the checkpoint of the current task in multi-task mode.

    The checkpoint file lives in the `<multi_task>` sub-folder of the trainer's
    local worker path. With the torch backend it stores the epoch together with
    the model, optimizer and lr scheduler state dicts. The resulting path is
    recorded on the trainer as `checkpoint_file`.

    :param epoch: current epoch, stored inside the checkpoint
    """
    trainer = self.trainer
    FileOps.make_dir(trainer.get_local_worker_path(), trainer.multi_task)
    checkpoint_file = FileOps.join_path(
        trainer.get_local_worker_path(),
        trainer.multi_task,
        trainer.checkpoint_file_name)
    logging.debug("Start Save Multi Task Model, model_file=%s",
                  trainer.model_pickle_file_name)
    if vega.is_torch_backend():
        torch.save(
            {
                'epoch': epoch,
                'weight': trainer.model.state_dict(),
                'optimizer': trainer.optimizer.state_dict(),
                'lr_scheduler': trainer.lr_scheduler.state_dict(),
            },
            checkpoint_file)
    trainer.checkpoint_file = checkpoint_file
def search(self):
    """Ask the hpo for the next sample and turn it into a worker description.

    The proposed sample is deep-copied and its policy is wrapped into a single
    `PBATransformer` entry under `dataset.transforms`. The checkpoint folder
    `<local_base_path>/worker/cache/<config_id>/checkpoint` is created and, if
    it exists, passed on as `trainer.checkpoint_path`; an `epoch` entry of the
    sample becomes `trainer.epochs`.

    :return: dict with `worker_id`, `encoded_desc` and `rung_id`, or None when
        the hpo proposes nothing
    """
    proposal = self.hpo.propose()
    if proposal is None:
        return None
    proposal = copy.deepcopy(proposal)
    sample_id = proposal.get('config_id')
    trans_para = proposal.get('configs')
    rung_id = proposal.get('rung_id')
    all_para = proposal.get('all_configs')

    transformer = {
        'type': 'PBATransformer',
        'para_array': trans_para,
        'all_para': all_para,
        'operation_names': self.operation_names,
    }
    encoded = {'dataset.transforms': [transformer]}

    checkpoint_path = FileOps.join_path(
        self.local_base_path, 'worker', 'cache', str(sample_id), 'checkpoint')
    FileOps.make_dir(checkpoint_path)
    if os.path.exists(checkpoint_path):
        encoded['trainer.checkpoint_path'] = checkpoint_path
    if 'epoch' in proposal:
        encoded['trainer.epochs'] = proposal.get('epoch')
    return dict(worker_id=sample_id, encoded_desc=encoded, rung_id=rung_id)
def _init_next_rung(self):
    """Init next rung to search.

    When the next rung would exceed `total_rungs`, only the rung counter is
    advanced. Otherwise:

    1. every config carries its current-rung policy over to the next rung;
    2. configs are ranked by their best score of the current rung (ascending);
    3. the worst configs are replaced by the best ones: the checkpoint folder
       of the better config is copied over the worse one, the worse config
       receives an independent copy of the better config's policies, and its
       next-rung policy is replaced by `self.explore(...)` of that policy;
    4. the next-rung best scores are reset to -inf and a WAITTING row is added
       to the board for every config;
    5. the rung counter is advanced.

    At most 4 configs are replaced, and never more than half of the population,
    so that no config is both source and target of a replacement.
    """
    # Local import keeps this method self-contained.
    import copy

    next_rung_id = self.rung_id + 1
    if next_rung_id >= self.total_rungs:
        self.rung_id = next_rung_id
        return

    for config_id in range(self.config_count):
        self.all_config_dict[config_id][next_rung_id] = \
            self.all_config_dict[config_id][self.rung_id]

    ranking = sorted(
        ((config_id, self.best_score_dict[self.rung_id][config_id])
         for config_id in range(self.config_count)),
        key=lambda item: item[1])

    # A fixed count of 4 indexes out of range for fewer than 4 configs and
    # pairs configs with themselves / each other for fewer than 8.
    exploit_count = min(4, self.config_count // 2)
    for rank in range(exploit_count):
        better_id = ranking[self.config_count - 1 - rank][0]
        worse_id = ranking[rank][0]
        better_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(better_id), 'checkpoint')
        FileOps.make_dir(better_worker_result_path)
        worse_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(worse_id), 'checkpoint')
        FileOps.make_dir(worse_worker_result_path)
        # copytree requires that the destination does not exist yet.
        shutil.rmtree(worse_worker_result_path)
        shutil.copytree(better_worker_result_path, worse_worker_result_path)
        # Copy instead of aliasing: with a shared object, storing the explored
        # policy for the worse config would also overwrite the better config's
        # next-rung policy.
        self.all_config_dict[worse_id] = copy.deepcopy(
            self.all_config_dict[better_id])
        policy_unchange = self.all_config_dict[worse_id][next_rung_id]
        self.all_config_dict[worse_id][next_rung_id] = self.explore(policy_unchange)

    for config_id in range(self.config_count):
        self.best_score_dict[next_rung_id][config_id] = -1 * float('inf')
        self._add_to_board({
            'config_id': config_id,
            'rung_id': next_rung_id,
            'status': StatusType.WAITTING,
        })
    self.rung_id = next_rung_id
def __init__(self, update_func=None):
    """Set up the master: start the dask environment, the cluster and the monitor thread.

    The process exits with status 0 when the dask environment fails to start
    or when this node is not the master.

    :param update_func: stored as `self.update_func`
    """
    self._checkout_cluster_existed()
    self.cfg = General()
    self.task_count = 0
    self.eval_count = General.worker.eval_count

    master_path = FileOps.join_path(TaskOps().temp_path, "master")
    self.__master_path__ = master_path
    FileOps.make_dir(master_path)
    self.dask_env = DaskEnv(General.env,
                            master_path,
                            General.devices_per_trainer,
                            TaskOps().temp_path)
    started = self.dask_env.start()
    if not (started and self.dask_env.is_master):
        sys.exit(0)

    self._start_cluster()
    self.t_queue = Queue()
    self.update_func = update_func
    self._thread_runing = True
    self._lock = Lock()
    self._thread = self._run_monitor_thread()
def search(self):
    """Ask the hpo for the next sample and turn it into a worker description.

    The proposed sample is deep-copied. The checkpoint folder
    `<local_base_path>/cache/<config_id>/checkpoint` is created and, if it
    exists, passed on as `trainer.checkpoint_path`; an `epoch` entry of the
    sample becomes `trainer.epochs`. The sample's `configs` are then merged
    into the description and `all_configs` is stored as `trainer.all_configs`.

    :return: dict with `worker_id`, `encoded_desc` and `rung_id`, or None when
        the hpo proposes nothing
    """
    proposal = self.hpo.propose()
    if proposal is None:
        return None
    proposal = copy.deepcopy(proposal)
    sample_id = proposal.get('config_id')
    cur_configs = proposal.get('configs')
    all_configs = proposal.get("all_configs")
    rung_id = proposal.get('rung_id')

    encoded = {}
    checkpoint_path = FileOps.join_path(
        self.local_base_path, 'cache', str(sample_id), 'checkpoint')
    FileOps.make_dir(checkpoint_path)
    if os.path.exists(checkpoint_path):
        encoded['trainer.checkpoint_path'] = checkpoint_path
    if 'epoch' in proposal:
        encoded['trainer.epochs'] = proposal.get('epoch')
    # Sample configs are merged last, so they win over the keys set above.
    encoded.update(cur_configs)
    encoded['trainer.all_configs'] = all_configs

    logging.info("Current rung [ {} /{}] ".format(
        rung_id, self.config.policy.total_rungs))
    return dict(worker_id=sample_id, encoded_desc=encoded, rung_id=rung_id)
def _save_best_model(self):
    """Store the current model as the best model, depending on the backend.

    * torch: the model state dict is saved to the trainer's `weights_file`.
    * tensorflow: the files of the latest checkpoint and the `checkpoint`
      index file are copied from the worker folder into its
      `model_<worker_id>` sub-folder; with `save_ext_model` a pb model is
      exported as well and recorded as `trainer.ext_model`.
    * mindspore: every `CKP*.ckpt` file in the worker folder is renamed to
      `model_<worker_id>.ckpt`; with `save_ext_model` an om model is exported
      as well and recorded as `trainer.ext_model`.

    Nothing happens when none of the backends is active.
    """
    if vega.is_torch_backend():
        torch.save(self.trainer.model.state_dict(), self.trainer.weights_file)
        return

    if vega.is_tf_backend():
        worker_path = self.trainer.get_local_worker_path()
        model_id = "model_{}".format(self.trainer.worker_id)
        weights_folder = FileOps.join_path(worker_path, model_id)
        FileOps.make_dir(weights_folder)
        latest_prefix = tf.train.latest_checkpoint(worker_path)
        for ckpt_part in glob.glob("{}.*".format(latest_prefix)):
            FileOps.copy_file(
                ckpt_part,
                FileOps.join_path(weights_folder, os.path.basename(ckpt_part)))
        FileOps.copy_file(
            FileOps.join_path(worker_path, 'checkpoint'), weights_folder)
        if self.trainer.save_ext_model:
            self._save_pb_model(weights_folder, model_id)
            self.trainer.ext_model = FileOps.join_path(
                weights_folder, '{}.pb'.format(model_id))
        return

    if vega.is_ms_backend():
        worker_path = self.trainer.get_local_worker_path()
        save_path = os.path.join(
            worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
        for entry in os.listdir(worker_path):
            if entry.startswith("CKP") and entry.endswith(".ckpt"):
                # Every match is moved onto the same target path.
                self.weights_file = FileOps.join_path(worker_path, entry)
                os.rename(self.weights_file, save_path)
        if self.trainer.save_ext_model:
            model_id = "model_{}".format(self.trainer.worker_id)
            self._save_om_model(worker_path, model_id)
            self.trainer.ext_model = FileOps.join_path(
                worker_path, '{}.om'.format(model_id))