Code example #1 (score: 0)
File: train.py — Project: HiroakiMikami/mlprogram
def save_results(output_dir: str, model: nn.Module,
                 optimizer: torch.optim.Optimizer) -> None:
    """Persist the final model and optimizer state dicts under *output_dir*.

    Only the main process writes, so distributed workers do not race on
    the same files.
    """
    if not distributed.is_main_process():
        return
    logger.info("Dump the last model")
    # Save each state dict next to the other under output_dir.
    for state, filename in ((model.state_dict(), "model.pt"),
                            (optimizer.state_dict(), "optimizer.pt")):
        torch.save(state, os.path.join(output_dir, filename))
Code example #2 (score: 0)
File: evaluate.py — Project: nashid/mlprogram
def evaluate(input_dir: str, workspace_dir: str, output_dir: str,
             valid_dataset: torch.utils.data.Dataset,
             model: nn.Module,
             synthesizer: Synthesizer,
             metrics: Mapping[str, Callable[[Environment, Code], float]],
             top_n: Optional[List[int]] = None,
             device: torch.device = torch.device("cpu"),
             n_samples: Optional[int] = None) \
        -> None:
    """Evaluate the best checkpoint from *input_dir* and dump the results.

    Picks the highest-"score" checkpoint in ``input_dir/model``, loads its
    "model" state dict into *model*, runs the synthesizer evaluation over
    *valid_dataset*, and (on the main process only) writes ``result.pt``
    and ``result_metrics.json`` to *output_dir*.

    Args:
        input_dir: directory containing a ``model`` subdirectory of
            checkpoints; each checkpoint is a dict with "score" and
            "model" entries.
        workspace_dir: scratch directory (created if missing).
        output_dir: destination for the evaluation artifacts.
        valid_dataset: dataset passed to the evaluation synthesizer.
        model: module to load the best state dict into.
        synthesizer: synthesizer under evaluation.
        metrics: name -> metric callable mapping.
        top_n: top-k values to report; defaults to [1].
        device: device the model is moved to before evaluation.
        n_samples: optional cap on the number of evaluated samples.

    Raises:
        RuntimeError: if ``input_dir/model`` contains no checkpoints
            (previously this crashed with an opaque IndexError).
    """
    # Avoid the shared-mutable-default pitfall; [1] is the historical default.
    top_n = [1] if top_n is None else top_n

    os.makedirs(workspace_dir, exist_ok=True)

    logger.info("Prepare model")
    model.to(device)

    evaluate_synthesizer = EvaluateSynthesizer[Code,
                                               GroundTruth](valid_dataset,
                                                            synthesizer,
                                                            metrics, top_n,
                                                            n_samples)

    model_dir = os.path.join(input_dir, "model")
    # List the directory once instead of three times.
    checkpoints = os.listdir(model_dir)
    if len(checkpoints) > 1:
        logger.warning(f"There are multiple models in {model_dir}")
    if len(checkpoints) == 0:
        logger.warning(f"There are no models in {model_dir}")
        raise RuntimeError(f"No model checkpoint found in {model_dir}")
    scored_paths = []
    for checkpoint_name in checkpoints:
        checkpoint_path = os.path.join(model_dir,
                                       os.path.basename(checkpoint_name))
        # Checkpoints are loaded on CPU just to read their "score" entry.
        score = torch.load(checkpoint_path,
                           map_location=torch.device("cpu"))["score"]
        scored_paths.append((score, checkpoint_path))
    # Highest score wins; ties resolve to the first listed checkpoint,
    # matching the original stable descending sort.
    model_path = max(scored_paths, key=lambda x: x[0])[1]  # type: ignore

    logger.info(f"Start evaluation: {model_path}")
    state_dict = \
        torch.load(model_path, map_location=torch.device("cpu"))["model"]
    model.load_state_dict(state_dict)

    result = evaluate_synthesizer()

    logger.info("Save result to output_dir")
    if distributed.is_main_process():
        os.makedirs(output_dir, exist_ok=True)
        torch.save(result, os.path.join(output_dir, "result.pt"))
        with open(os.path.join(output_dir, "result_metrics.json"),
                  "w") as file:
            json.dump(
                {
                    "metrics": result.metrics,
                    "generation_rate": result.generation_rate,
                    "generation_time": result.generation_time
                }, file)
Code example #3 (score: 0)
File: train.py — Project: nashid/mlprogram
def save_results(workspace_dir: str, output_dir: str, model: nn.Module,
                 optimizer: torch.optim.Optimizer) -> None:
    """Copy training artifacts from *workspace_dir* into *output_dir*.

    On the main process only: copies the training log (when present) as
    ``log.json``, mirrors the saved-model directory, then dumps the final
    model and optimizer state dicts.
    """
    if not distributed.is_main_process():
        return

    src_model_dir = os.path.join(workspace_dir, "model")

    logger.info("Copy log to output_dir")
    log_path = os.path.join(workspace_dir, "log")
    if os.path.exists(log_path):
        os.makedirs(output_dir, exist_ok=True)
        shutil.copyfile(log_path, os.path.join(output_dir, "log.json"))

    logger.info("Copy models to output_dir")
    dst_model_dir = os.path.join(output_dir, "model")
    # copytree requires the destination to not exist, so clear stale copies.
    if os.path.exists(dst_model_dir):
        shutil.rmtree(dst_model_dir)
    shutil.copytree(src_model_dir, dst_model_dir)

    logger.info("Dump the last model")
    torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))
    torch.save(optimizer.state_dict(),
               os.path.join(output_dir, "optimizer.pt"))
Code example #4 (score: 0)
File: train.py — Project: HiroakiMikami/mlprogram
def create_extensions_manager(n_iter: int,
                              evaluation_interval_iter: int,
                              snapshot_interval_iter: int,
                              iter_per_epoch: int,
                              model: nn.Module,
                              optimizer: torch.optim.Optimizer,
                              evaluate: Optional[Callable[[], None]],
                              metric: str,
                              maximize: bool,
                              threshold: Optional[float],
                              output_dir: str,
                              report_metrics: Optional[List[str]] = None):
    """Build a pytorch-pfn-extras ExtensionsManager wired for training.

    Registers NaN detection, optional periodic evaluation, main-process-only
    logging/reporting/checkpointing, optional early stopping by metric
    threshold, and auto-resuming snapshots (distributed-aware when a
    distributed backend is initialized).

    Args:
        n_iter: total number of training iterations.
        evaluation_interval_iter: iteration interval for evaluation-driven
            extensions (fail-on-NaN, evaluate, top-k saving, early stop).
        snapshot_interval_iter: iteration interval for snapshots.
        iter_per_epoch: iterations per epoch (used to derive the epoch count).
        model: model registered with the manager.
        optimizer: optimizer registered with the manager.
        evaluate: optional zero-arg callable run at each evaluation interval.
        metric: metric name used for top-k model saving and early stopping.
        maximize: whether a larger `metric` value is better.
        threshold: if given, training stops once `metric` crosses it.
        output_dir: directory for logs, snapshots and saved models.
        report_metrics: extra metric names to include in the printed report.

    Returns:
        The configured ``ppe.training.ExtensionsManager``.
    """
    # Top-scoring models are saved under <output_dir>/model.
    model_dir = os.path.join(output_dir, "model")

    logger.info("Prepare pytorch-pfn-extras")
    manager = ppe.training.ExtensionsManager(
        model,
        optimizer,
        n_iter / iter_per_epoch,
        out_dir=os.path.join(output_dir),
        extensions=[],
        iters_per_epoch=iter_per_epoch,
    )
    # Abort training if the loss/metrics become NaN or infinite.
    manager.extend(extensions.FailOnNonNumber(),
                   trigger=Trigger(evaluation_interval_iter, n_iter))
    if evaluate is not None:
        manager.extend(
            Call(evaluate),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
    # File-writing and console extensions run on the main process only so
    # distributed workers do not duplicate output.
    if distributed.is_main_process():
        manager.extend(
            extensions.LogReport(
                trigger=Trigger(100, n_iter),
                filename="log.json",
            ))
        manager.extend(extensions.ProgressBar())
        # Keep only the single best model (k=1) according to `metric`.
        manager.extend(
            SaveTopKModel(model_dir, 1, metric, model, maximize=maximize),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
        metrics = report_metrics or []
        manager.extend(
            extensions.PrintReport(entries=[
                "loss", *metrics, "iteration", "epoch", "time.iteration",
                "gpu.time.iteration", "elapsed_time"
            ]),
            trigger=Trigger(100, n_iter),
        )
    # Early stopping: checked on every process (not main-process-only) so
    # all workers stop together.
    if threshold is not None:
        manager.extend(
            StopByThreshold(metric, threshold, maximize=maximize),
            trigger=Trigger(evaluation_interval_iter, n_iter),
        )
    if distributed.is_initialized():
        # autoload=True resumes from the latest snapshot automatically;
        # n_retains=1 keeps only the newest snapshot on disk.
        snapshot = extensions.snapshot(autoload=True,
                                       n_retains=1,
                                       saver_rank=0)
        # NOTE(review): pokes private attributes to inform the snapshot
        # extension of the distributed topology; rank is reused as
        # local_rank — presumably single-node, verify for multi-node runs.
        snapshot._rank = distributed.rank()
        snapshot._size = distributed.size()
        snapshot._local_rank = distributed.rank()
    else:
        snapshot = extensions.snapshot(autoload=True, n_retains=1)
    manager.extend(snapshot, trigger=Trigger(snapshot_interval_iter, n_iter))
    return manager