Esempio n. 1
0
File: trainer.py Progetto: iimuz/til
def train(config: Config):
    """Run the VGG16 training pipeline described by *config*."""
    pl.seed_everything(config.random_seed)

    # Checkpoint cache directory; optionally resume from the previous last.ckpt.
    checkpoint_dir = pathlib.Path(config.cache_dir)
    checkpoint_dir.mkdir(exist_ok=True)
    extra_trainer_args = {}
    last_ckpt = checkpoint_dir / "last.ckpt"
    if config.resume:
        extra_trainer_args["resume_from_checkpoint"] = str(last_ckpt)
    elif last_ckpt.exists():
        # Not resuming: discard the stale last checkpoint.
        last_ckpt.unlink()
    # Per-epoch checkpoints from earlier runs are always removed.
    for stale in checkpoint_dir.glob("epoch*.ckpt"):
        stale.unlink()

    # MLflow logger, tagged with the script name and the current git commit.
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Build the network and wrap it in the LightningModule.
    network = tv_models.vgg16(pretrained=False)
    hyperparams = dc.asdict(config)
    model = Trainer(network, **hyperparams)

    # Callbacks: checkpoint on best val_loss, plus optional early stopping.
    callbacks: t.List[t.Any] = [
        pl_callbacks.ModelCheckpoint(
            filepath=str(checkpoint_dir),
            monitor="val_loss",
            save_last=True,
            save_top_k=config.save_top_k,
            save_weights_only=config.save_weights_only,
            mode="min",
            period=1,
        )
    ]
    if config.early_stop:
        early_stopping = pl_callbacks.EarlyStopping(
            monitor="val_loss",
            min_delta=0.0,
            patience=3,
            verbose=False,
            mode="auto",
        )
        callbacks.append(early_stopping)

    pl_trainer = pl.Trainer(
        default_root_dir=str(checkpoint_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and cuda.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        callbacks=callbacks,
        logger=pl_logger,
        log_gpu_memory=True,
        **extra_trainer_args,
    )
    datamodule = dataset_food101.Food101WithLableModule(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Export every kept per-epoch checkpoint to MLflow as a pytorch model artifact.
    mlf_client = mlflow.tracking.MlflowClient()
    for ckptfile in checkpoint_dir.glob("epoch*.ckpt"):
        model = model.load_from_checkpoint(str(ckptfile), network, **hyperparams)
        with tempfile.TemporaryDirectory() as tmpdir:
            export_path = pathlib.Path(tmpdir).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, export_path)
            mlf_client.log_artifact(pl_logger.run_id, export_path)
Esempio n. 2
0
        x = self.model(x)['out']
        return x


# Wrap the raw segmentation model; DL3.forward returns model(x)['out']
# (see the class fragment above), i.e. the logits tensor fastai expects.
deeplab3 = DL3(dl3)

#torchvision.models.segmentation.fcn_resnet50(pretrained=False, num_classes=2, progress=False)

# Directory where this run's artifacts (model, results, config) are written.
# NOTE(review): mkdir is commented out, so `run_dir` is assumed to already exist — confirm.
run_dir = config.LEARNER.runsave_dir + '/' + name
#os.mkdir(run_dir)
exp_name = 'fastai_dl3'
# MLflow tracking callback for fastai. `partial` is required because fastai's
# `callback_fns` instantiates each entry with the Learner as first argument.
mlflow_CB = partial(
    MLFlowTracker,
    exp_name=exp_name,
    uri='file:/workspace/oct_ca_seg/runsaves/fastai_experiments/mlruns/',
    params=config.config_dict,
    log_model=True,
    nb_path="/workspace/oct_ca_seg/oct/02_caps.ipynb")
learner = Learner(data=data,
                  model=deeplab3,
                  metrics=metrics,
                  callback_fns=mlflow_CB)

# Train for one cycle, then persist the model, all results, and the config used.
with mlflow.start_run():
    learner.fit_one_cycle(1, slice(config.LEARNER.lr), pct_start=0.9)
    MLPY.save_model(learner.model, run_dir + '/model')
    save_all_results(learner, run_dir, exp_name)
    saveConfigRun(config.config_dict,
                  run_dir=Path(run_dir),
                  name='configDL3__bs16_epochs30_lr0.001.json')
Esempio n. 3
0
def train(config: Config):
    """Run the autoencoder training pipeline described by *config*."""
    params = dc.asdict(config)
    pl.seed_everything(config.random_seed)

    # Setup for (optionally) resuming training from a previous run.
    cache_dir = directories.get_processed().joinpath(config.cache_dir)
    cache_dir.mkdir(exist_ok=True)
    trainer_params = dict()
    lastckpt = cache_dir.joinpath("last.ckpt")
    if config.resume:
        # Resume from the last checkpoint of the previous run.
        trainer_params["resume_from_checkpoint"] = str(lastckpt)
    elif lastckpt.exists():
        # Not resuming: discard the stale last checkpoint.
        lastckpt.unlink()
    # Per-epoch checkpoints from earlier runs are always removed.
    for filepath in cache_dir.glob("epoch*.ckpt"):
        filepath.unlink()

    # Intermediate checkpointing: keep top-k on val_loss plus the last epoch.
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filepath=str(cache_dir),
        monitor="val_loss",
        save_last=True,
        save_top_k=config.save_top_k,
        save_weights_only=config.save_weights_only,
        mode="min",
        period=1,
    )

    # Logging: MLflow logger tagged with the script name and git commit id.
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Resolve network and datamodule from their config names, then fit.
    network = get_network(
        NetworkName.value_of(config.network_name),
        in_channels=config.in_channels,
        out_channels=config.out_channels,
    )
    model = AETrainer(network, params)
    pl_trainer = pl.Trainer(
        early_stop_callback=config.early_stop,
        default_root_dir=str(cache_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and tc.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        checkpoint_callback=model_checkpoint,
        logger=pl_logger,
        log_gpu_memory=True,
        **trainer_params,
    )
    datamodule = get_datamodule(
        DatasetName.value_of(config.dataset_name),
        image_size=config.resize_image,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Post-run logging: profiler timings as metrics, checkpoints as artifacts.
    mlf_client = mlflow.tracking.MlflowClient()
    for key, val in pl_trainer.profiler.recorded_durations.items():
        mlf_client.log_metric(pl_logger.run_id, f"{key}_mean", np.mean(val))
        mlf_client.log_metric(pl_logger.run_id, f"{key}_sum", np.sum(val))
    for ckptfile in cache_dir.glob("epoch*.ckpt"):
        # Reload each kept checkpoint and export the bare network to MLflow.
        model = model.load_from_checkpoint(str(ckptfile), network, params)
        with tempfile.TemporaryDirectory() as dname:
            mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, mlf_model_path)
            mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)