def _save_checkpoint(self, epoch, best=False):
    """Save model weights.

    :param epoch: current epoch
    :type epoch: int
    :param best: whether to also save the weights as the best model so far
    :type best: bool
    """
    save_dir = os.path.join(self.worker_path, str(epoch))
    FileOps.make_dir(save_dir)
    for name in self.model.model_names:
        if isinstance(name, str):
            save_filename = '%s_net_%s.pth' % (epoch, name)
            save_path = FileOps.join_path(save_dir, save_filename)
            net = getattr(self.model, 'net' + name)
            best_file = FileOps.join_path(self.worker_path, "model_{}.pth".format(name))
            if self.cfg.cuda and torch.cuda.is_available():
                # The net is wrapped (e.g. DataParallel), so save the inner module's weights.
                torch.save(net.module.state_dict(), save_path)
                if best:
                    torch.save(net.module.state_dict(), best_file)
            else:
                torch.save(net.cpu().state_dict(), save_path)
                if best:
                    torch.save(net.cpu().state_dict(), best_file)
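# A minimal reload sketch for the files written above, assuming the same
# wrapped/unwrapped distinction as _save_checkpoint; the helper name and the
# paths passed to it are hypothetical, not part of this module.
import torch

def _load_checkpoint_sketch(model, save_path, name):
    """Load one sub-network's weights saved as '<epoch>_net_<name>.pth'."""
    net = getattr(model, 'net' + name)
    # DataParallel-style wrappers keep the real network under .module.
    target = net.module if hasattr(net, 'module') else net
    target.load_state_dict(torch.load(save_path, map_location='cpu'))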
def _init_next_rung(self):
    """Init next rung to search."""
    next_rung_id = self.rung_id + 1
    if next_rung_id >= self.total_rungs:
        self.rung_id = self.rung_id + 1
        return
    # Carry each config's current policy forward to the next rung.
    for i in range(self.config_count):
        self.all_config_dict[i][next_rung_id] = self.all_config_dict[i][self.rung_id]
    current_score = []
    for i in range(self.config_count):
        current_score.append((i, self.best_score_dict[self.rung_id][i]))
    current_score.sort(key=lambda x: x[1])
    # Exploit: overwrite the 4 worst configs with copies of the 4 best,
    # including their cached checkpoints.
    for i in range(4):
        better_id = current_score[self.config_count - 1 - i][0]
        worse_id = current_score[i][0]
        better_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(better_id), 'checkpoint')
        FileOps.make_dir(better_worker_result_path)
        worse_worker_result_path = FileOps.join_path(
            self.local_base_path, 'cache', 'pba', str(worse_id), 'checkpoint')
        FileOps.make_dir(worse_worker_result_path)
        shutil.rmtree(worse_worker_result_path)
        shutil.copytree(better_worker_result_path, worse_worker_result_path)
        self.all_config_dict[worse_id] = self.all_config_dict[better_id]
        # Explore: perturb the copied policy so the pair does not stay identical.
        policy_unchanged = self.all_config_dict[worse_id][next_rung_id]
        policy_changed = self.explore(policy_unchanged)
        self.all_config_dict[worse_id][next_rung_id] = policy_changed
    for config_id in range(self.config_count):
        self.best_score_dict[next_rung_id][config_id] = -float('inf')
        tmp_row_data = {'config_id': config_id,
                        'rung_id': next_rung_id,
                        'status': StatusType.WAITTING}
        self._add_to_board(tmp_row_data)
    self.rung_id = self.rung_id + 1
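# A minimal sketch of the exploit pairing used above, with made-up scores:
# after an ascending sort, the i-th worst config is overwritten by the
# i-th best one (the real loop uses 4 pairs and also copies checkpoints).
scores = [(0, 0.71), (1, 0.93), (2, 0.55), (3, 0.88)]
scores.sort(key=lambda x: x[1])  # ascending: worst first
pairs = [(scores[-1 - i][0], scores[i][0]) for i in range(2)]
assert pairs == [(1, 2), (3, 0)]  # config 2 copies config 1, config 0 copies config 3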
def dump_model_visual_info(trainer, epoch, model, inputs):
    """Dump model to tensorboard event files.

    :param trainer: trainer, an object whose class inherits from DistributedWorker.
    :type trainer: object
    :param epoch: current epoch
    :type epoch: int
    :param model: model.
    :type model: model.
    :param inputs: input data.
    :type inputs: data.
    """
    (_, visual, interval, title, worker_id, output_path) = _get_trainer_info(trainer)
    if visual is not True:
        return
    if epoch % interval != 0:
        return
    title = str(worker_id)
    _path = FileOps.join_path(output_path, title)
    FileOps.make_dir(_path)
    try:
        with SummaryWriter(_path) as writer:
            writer.add_graph(model, (inputs,))
    except Exception as e:
        logging.error("Failed to dump model visual info, worker id: {}, epoch: {}, error: {}".format(
            worker_id, epoch, str(e)))
def search(self):
    """Search an id and hps from hpo."""
    sample = self.hpo.propose()
    if sample is None:
        return None
    re_hps = {}
    sample = copy.deepcopy(sample)
    sample_id = sample.get('config_id')
    trans_para = sample.get('configs')
    rung_id = sample.get('rung_id')
    all_para = sample.get('all_configs')
    re_hps['dataset.transforms'] = [{'type': 'PBATransformer',
                                     'para_array': trans_para,
                                     'all_para': all_para,
                                     'operation_names': self.operation_names}]
    checkpoint_path = FileOps.join_path(self.local_base_path, 'worker', 'cache',
                                        str(sample_id), 'checkpoint')
    FileOps.make_dir(checkpoint_path)
    if os.path.exists(checkpoint_path):
        re_hps['trainer.checkpoint_path'] = checkpoint_path
    if 'epoch' in sample:
        re_hps['trainer.epochs'] = sample.get('epoch')
    return dict(worker_id=sample_id, encoded_desc=re_hps, rung_id=rung_id)
def _save_best_model(self):
    """Save best model."""
    if zeus.is_torch_backend():
        torch.save(self.trainer.model.state_dict(), self.trainer.weights_file)
    elif zeus.is_tf_backend():
        worker_path = self.trainer.get_local_worker_path()
        model_id = "model_{}".format(self.trainer.worker_id)
        weights_folder = FileOps.join_path(worker_path, model_id)
        FileOps.make_dir(weights_folder)
        checkpoint_file = tf.train.latest_checkpoint(worker_path)
        ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
        for _file in ckpt_globs:
            dst_file = model_id + os.path.splitext(_file)[-1]
            FileOps.copy_file(_file, FileOps.join_path(weights_folder, dst_file))
        FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'), weights_folder)
    elif zeus.is_ms_backend():
        worker_path = self.trainer.get_local_worker_path()
        save_path = os.path.join(worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
        for file in os.listdir(worker_path):
            if file.startswith("CKP") and file.endswith(".ckpt"):
                self.weights_file = FileOps.join_path(worker_path, file)
                os.rename(self.weights_file, save_path)
def save_results(self):
    """Save the results of evolution, containing the information of population and elitism."""
    _path = FileOps.join_path(self.local_output_path, General.step_name)
    FileOps.make_dir(_path)
    arch_file = FileOps.join_path(_path, 'arch.txt')
    arch_child = FileOps.join_path(_path, 'arch_child.txt')
    sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
    sel_arch = []
    with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
        writer_a = csv.writer(fw_a, lineterminator='\n')
        writer_ac = csv.writer(fw_ac, lineterminator='\n')
        writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.individual_num):
            writer_ac.writerow(
                self._log_data(net_info_type='active_only',
                               pop=self.pop[c],
                               value=self.pop[c].fitness))
        writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.elitism_num):
            writer_a.writerow(
                self._log_data(net_info_type='active_only',
                               pop=self.elitism[c],
                               value=self.elit_fitness[c]))
            sel_arch.append(self.elitism[c].gene)
    sel_arch = np.stack(sel_arch)
    np.save(sel_arch_file, sel_arch)
    if self.backup_base_path is not None:
        FileOps.copy_folder(self.local_output_path, self.backup_base_path)
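# Reading the dumped elitism genes back is a one-liner; the path below is
# hypothetical and mirrors the sel_arch_file written above.
import numpy as np
sel_arch = np.load('selected_arch.npy')  # stacked genes, one row per elitism member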
def before_train(self, logs=None):
    """Called before the whole train process."""
    self.trainer.config.call_metrics_on_train = False
    self.cfg = self.trainer.config
    self.worker_id = self.trainer.worker_id
    self.local_base_path = self.trainer.local_base_path
    self.local_output_path = self.trainer.local_output_path
    self.result_path = FileOps.join_path(self.trainer.local_base_path, "result")
    FileOps.make_dir(self.result_path)
    self.logger_patch()
def copy_pareto_output(self, step_name=None, worker_ids=None):
    """Copy files related to pareto from worker to output."""
    worker_ids = worker_ids or []
    taskops = TaskOps()
    if not step_name:
        return
    local_output_path = os.path.join(taskops.local_output_path, step_name)
    if not os.path.exists(local_output_path):
        return
    for worker_id in worker_ids:
        dst_dir = os.path.join(local_output_path, str(worker_id))
        FileOps.make_dir(dst_dir)
        local_worker_path = taskops.get_worker_subpath(step_name, str(worker_id))
        src_dir = FileOps.join_path(taskops.local_base_path, local_worker_path)
        copy_search_file(src_dir, dst_dir)
def save_master_ip(ip_address, port, args):
    """Write the ip and port into a file under the task temp path.

    :param str ip_address: the `ip_address` to write.
    :param str port: the `port` to write.
    :param argparse.Namespace args: parsed arguments that should contain
        `init_method`, `rank` and `world_size`.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    logging.info("write ip, file path={}".format(file_path))
    with open(file_path, 'w') as f:
        f.write(ip_address + "\n")
        f.write(port + "\n")
def load_master_ip():
    """Get the ip and port written by save_master_ip.

    This reads only from the local temp path and will not download anything from S3.
    """
    temp_folder = TaskOps().temp_path
    FileOps.make_dir(temp_folder)
    file_path = os.path.join(temp_folder, 'ip_address.txt')
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            ip = f.readline().strip()
            port = f.readline().strip()
            logging.info("read ip={}, port={}".format(ip, port))
            return ip, port
    else:
        return None, None
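# A minimal round-trip sketch for the two helpers above; the address is
# made up, and `args` is not read by save_master_ip, so None is passed here.
save_master_ip('192.168.0.1', '8080', args=None)
ip, port = load_master_ip()
assert (ip, port) == ('192.168.0.1', '8080')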
def _save_performance(self, results):
    """Save performance into performance.pkl and save checkpoint to output_dir.

    :param results: performance results
    :type results: dict
    """
    logging.info("performance=%s", str(results))
    performance_dir = os.path.join(self.worker_path, 'performance')
    FileOps.make_dir(performance_dir)
    FileOps.dump_pickle(results, os.path.join(performance_dir, 'performance.pkl'))
    logging.info("performance saved to %s", performance_dir)
    # copy the latest checkpoint to the output dir
    output_dir = os.path.join(self.output_path, str(self._worker_id))
    FileOps.make_dir(output_dir)
    shutil.copy(os.path.join(self.worker_path, 'latest.pth'),
                os.path.join(output_dir, results['arch'].split('_')[1] + '.pth'))
    logging.info("Latest checkpoint saved to %s", output_dir)
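# The dumped results can be read back with the stdlib pickle module, assuming
# FileOps.dump_pickle writes a standard pickle; the path below is hypothetical.
import pickle

with open('performance/performance.pkl', 'rb') as f:
    results = pickle.load(f)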
def _save_best_model(self):
    """Save best model."""
    if zeus.is_torch_backend():
        torch.save(self.trainer.model.state_dict(), self.trainer.weights_file)
    elif zeus.is_tf_backend():
        worker_path = self.trainer.get_local_worker_path()
        model_id = "model_{}".format(self.trainer.worker_id)
        weights_folder = FileOps.join_path(worker_path, model_id)
        FileOps.make_dir(weights_folder)
        checkpoint_file = tf.train.latest_checkpoint(worker_path)
        ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
        for _file in ckpt_globs:
            dst_file = model_id + os.path.splitext(_file)[-1]
            FileOps.copy_file(_file, FileOps.join_path(weights_folder, dst_file))
        FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'), weights_folder)
def update(self, record):
    """Update current performance into hpo score board.

    :param record: record of the trainer, containing `worker_id`, `step_name`
        and the performance to update.
    :type record: dict
    """
    super().update(record)
    config_id = str(record.get('worker_id'))
    step_name = record.get('step_name')
    worker_result_path = self.get_local_worker_path(step_name, config_id)
    new_worker_result_path = FileOps.join_path(
        self.local_base_path, 'worker', 'cache', config_id, 'checkpoint')
    FileOps.make_dir(worker_result_path)
    FileOps.make_dir(new_worker_result_path)
    if os.path.exists(new_worker_result_path):
        shutil.rmtree(new_worker_result_path)
    # copytree requires that the destination does not exist yet.
    shutil.copytree(worker_result_path, new_worker_result_path)
def _output_records(self, step_name, records, desc=True, weights_file=False, performance=False):
    """Dump records."""
    columns = ["worker_id", "performance", "desc"]
    outputs = []
    for record in records:
        record = record.serialize()
        _record = {}
        for key in columns:
            _record[key] = record[key]
        outputs.append(deepcopy(_record))
    data = pd.DataFrame(outputs)
    step_path = FileOps.join_path(TaskOps().local_output_path, step_name)
    FileOps.make_dir(step_path)
    _file = FileOps.join_path(step_path, "output.csv")
    try:
        data.to_csv(_file, index=False)
    except Exception:
        logging.error("Failed to save output file, file={}".format(_file))
    for record in outputs:
        worker_id = record["worker_id"]
        worker_path = TaskOps().get_local_worker_path(step_name, worker_id)
        outputs_globs = []
        if desc:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "desc_*.json"))
        if weights_file:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "model_*"))
        if performance:
            outputs_globs += glob.glob(FileOps.join_path(worker_path, "performance_*.json"))
        for _file in outputs_globs:
            if os.path.isfile(_file):
                FileOps.copy_file(_file, step_path)
            elif os.path.isdir(_file):
                FileOps.copy_folder(_file, FileOps.join_path(step_path, os.path.basename(_file)))
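# Illustrative only: with two hypothetical records, the resulting output.csv
# would look like
#
#   worker_id,performance,desc
#   1,0.91,"{'type': 'ResNet'}"
#   2,0.88,"{'type': 'VGG'}"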