import json
import os
import shutil
from typing import Callable, List, Mapping, Optional

import torch
import torch.nn as nn
import torch.utils.data
import pytorch_pfn_extras as ppe
from pytorch_pfn_extras.training import extensions

# Project-internal names used below (distributed, logger, Synthesizer,
# EvaluateSynthesizer, Environment, Code, GroundTruth, Trigger, Call,
# SaveTopKModel, StopByThreshold) are assumed to be imported from elsewhere
# in this package.


def save_results(output_dir: str, model: nn.Module,
                 optimizer: torch.optim.Optimizer) -> None:
    # Only the main process writes checkpoints, so distributed runs do not
    # clobber each other's files.
    if distributed.is_main_process():
        logger.info("Dump the last model")
        torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
def evaluate(input_dir: str, workspace_dir: str, output_dir: str,
             valid_dataset: torch.utils.data.Dataset,
             model: nn.Module,
             synthesizer: Synthesizer,
             metrics: Mapping[str, Callable[[Environment, Code], float]],
             top_n: List[int] = [1],
             device: torch.device = torch.device("cpu"),
             n_samples: Optional[int] = None) -> None:
    os.makedirs(workspace_dir, exist_ok=True)

    logger.info("Prepare model")
    model.to(device)
    evaluate_synthesizer = EvaluateSynthesizer[Code, GroundTruth](
        valid_dataset, synthesizer, metrics, top_n, n_samples)

    # Pick the checkpoint with the highest reported score. Each file under
    # `model_dir` is expected to be a dict with "score" and "model" keys.
    # If the directory is empty, the indexing below raises IndexError.
    model_dir = os.path.join(input_dir, "model")
    if len(os.listdir(model_dir)) > 1:
        logger.warning(f"There are multiple models in {model_dir}")
    if len(os.listdir(model_dir)) == 0:
        logger.warning(f"There are no models in {model_dir}")
    paths = []
    for model_path in os.listdir(model_dir):
        model_path = os.path.join(model_dir, model_path)
        score = torch.load(model_path,
                           map_location=torch.device("cpu"))["score"]
        paths.append((score, model_path))
    paths.sort(key=lambda x: -x[0])  # type: ignore
    model_path = paths[0][1]

    logger.info(f"Start evaluation: {model_path}")
    state_dict = torch.load(model_path,
                            map_location=torch.device("cpu"))["model"]
    model.load_state_dict(state_dict)
    result = evaluate_synthesizer()

    logger.info("Save result to output_dir")
    if distributed.is_main_process():
        os.makedirs(output_dir, exist_ok=True)
        torch.save(result, os.path.join(output_dir, "result.pt"))
        with open(os.path.join(output_dir, "result_metrics.json"), "w") as file:
            json.dump(
                {
                    "metrics": result.metrics,
                    "generation_rate": result.generation_rate,
                    "generation_time": result.generation_time,
                },
                file)
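# A minimal sketch (an assumption, not part of this module) of the checkpoint
# format `evaluate` expects: each file under `<input_dir>/model` holds both a
# validation score and the model weights. SaveTopKModel (used in
# create_extensions_manager below) is assumed to write this layout; the
# helper name here is hypothetical.
def _example_write_scored_checkpoint(model: nn.Module, score: float,
                                     path: str) -> None:
    # "score" is read to rank checkpoints; "model" is what evaluate() passes
    # to model.load_state_dict().
    torch.save({"score": score, "model": model.state_dict()}, path)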
def save_results(workspace_dir: str, output_dir: str, model: nn.Module,
                 optimizer: torch.optim.Optimizer) -> None:
    if distributed.is_main_process():
        model_dir = os.path.join(workspace_dir, "model")

        logger.info("Copy log to output_dir")
        if os.path.exists(os.path.join(workspace_dir, "log")):
            os.makedirs(output_dir, exist_ok=True)
            shutil.copyfile(os.path.join(workspace_dir, "log"),
                            os.path.join(output_dir, "log.json"))

        logger.info("Copy models to output_dir")
        out_model_dir = os.path.join(output_dir, "model")
        if os.path.exists(out_model_dir):
            shutil.rmtree(out_model_dir)
        shutil.copytree(model_dir, out_model_dir)

        logger.info("Dump the last model")
        torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
def create_extensions_manager(n_iter: int, evaluation_interval_iter: int,
                              snapshot_interval_iter: int,
                              iter_per_epoch: int,
                              model: nn.Module,
                              optimizer: torch.optim.Optimizer,
                              evaluate: Optional[Callable[[], None]],
                              metric: str,
                              maximize: bool,
                              threshold: Optional[float],
                              output_dir: str,
                              report_metrics: Optional[List[str]] = None
                              ) -> ppe.training.ExtensionsManager:
    model_dir = os.path.join(output_dir, "model")

    logger.info("Prepare pytorch-pfn-extras")
    manager = ppe.training.ExtensionsManager(
        model, optimizer, n_iter / iter_per_epoch,
        out_dir=output_dir,
        extensions=[],
        iters_per_epoch=iter_per_epoch,
    )
    # Abort the run if any reported value becomes NaN or infinite.
    manager.extend(extensions.FailOnNonNumber(),
                   trigger=Trigger(evaluation_interval_iter, n_iter))
    if evaluate is not None:
        manager.extend(
            Call(evaluate),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
    if distributed.is_main_process():
        # Logging, progress reporting, and checkpoint bookkeeping run on the
        # main process only.
        manager.extend(
            extensions.LogReport(
                trigger=Trigger(100, n_iter),
                filename="log.json",
            ))
        manager.extend(extensions.ProgressBar())
        manager.extend(
            SaveTopKModel(model_dir, 1, metric, model, maximize=maximize),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
        metrics = report_metrics or []
        manager.extend(
            extensions.PrintReport(entries=[
                "loss", *metrics, "iteration", "epoch",
                "time.iteration", "gpu.time.iteration", "elapsed_time"
            ]),
            trigger=Trigger(100, n_iter),
        )
    if threshold is not None:
        # Stop early once `metric` crosses `threshold`.
        manager.extend(
            StopByThreshold(metric, threshold, maximize=maximize),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
    if distributed.is_initialized():
        snapshot = extensions.snapshot(autoload=True, n_retains=1,
                                       saver_rank=0)
        snapshot._rank = distributed.rank()
        snapshot._size = distributed.size()
        snapshot._local_rank = distributed.rank()
    else:
        snapshot = extensions.snapshot(autoload=True, n_retains=1)
    manager.extend(snapshot, trigger=Trigger(snapshot_interval_iter, n_iter))
    return manager
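# Usage sketch (a plausible driver, not taken from this module): the manager
# returned above wraps each optimization step via pytorch-pfn-extras'
# run_iteration() context manager, which fires the registered extensions
# (logging, snapshots, evaluation, early stopping) at their triggers.
# `data_loader` and the convention that the model returns a scalar loss are
# hypothetical.
def _example_training_loop(manager: ppe.training.ExtensionsManager,
                           model: nn.Module,
                           optimizer: torch.optim.Optimizer,
                           data_loader) -> None:
    while not manager.stop_trigger:
        for batch in data_loader:
            with manager.run_iteration():
                optimizer.zero_grad()
                loss = model(batch)  # assumes the model returns a scalar loss
                # Report the loss so LogReport/PrintReport can pick it up.
                ppe.reporting.report({"loss": loss.item()})
                loss.backward()
                optimizer.step()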