def train(config: Config): """学習処理の実行スクリプト.""" pl.seed_everything(config.random_seed) # 学習を途中から再開する場合などの設定 cache_dir = pathlib.Path(config.cache_dir) cache_dir.mkdir(exist_ok=True) trainer_params = dict() lastckpt = cache_dir.joinpath("last.ckpt") if config.resume: trainer_params["resume_from_checkpoint"] = str(lastckpt) elif lastckpt.exists(): lastckpt.unlink() for filepath in cache_dir.glob("epoch*.ckpt"): filepath.unlink() # ログ設定 pl_logger = pl_loggers.MLFlowLogger( experiment_name=config.experiment_name, tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None), tags={ "mlflow.source.name": pathlib.Path(__file__).name, "mlflow.source.git.commit": ut.get_commit_id(), }, ) # ネットワーク、データセットの取得及び学習 network = tv_models.vgg16(pretrained=False) params = dc.asdict(config) model = Trainer(network, **params) callbacks: t.List[t.Any] = list() model_checkpoint = pl_callbacks.ModelCheckpoint( filepath=str(cache_dir), monitor="val_loss", save_last=True, save_top_k=config.save_top_k, save_weights_only=config.save_weights_only, mode="min", period=1, ) callbacks.append(model_checkpoint) if config.early_stop: callbacks.append( pl_callbacks.EarlyStopping( monitor="val_loss", min_delta=0.0, patience=3, verbose=False, mode="auto", )) pl_trainer = pl.Trainer( default_root_dir=str(cache_dir), fast_dev_run=False, min_epochs=config.min_epochs, max_epochs=config.max_epochs, gpus=[0] if config.use_gpu and cuda.is_available() else None, progress_bar_refresh_rate=config.progress_bar_refresh_rate, profiler=config.profiler, callbacks=callbacks, logger=pl_logger, log_gpu_memory=True, **trainer_params, ) datamodule = dataset_food101.Food101WithLableModule( batch_size=config.batch_size, num_workers=config.num_workers, ) pl_trainer.fit(model, datamodule) # ログに追加情報を設定 mlf_client = mlflow.tracking.MlflowClient() for ckptfile in cache_dir.glob("epoch*.ckpt"): model = model.load_from_checkpoint(str(ckptfile), network, **params) with tempfile.TemporaryDirectory() as dname: mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem) mlf_pytorch.save_model(model.network, mlf_model_path) mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)
from functools import partial
from pathlib import Path

import mlflow
import mlflow.pytorch as MLPY  # assumed alias, matching MLPY.save_model below
import torch.nn as nn
from fastai.basic_train import Learner  # fastai v1

# `dl3`, `name`, `config`, `data`, `metrics`, `MLFlowTracker`, `save_all_results`,
# and `saveConfigRun` are defined in earlier notebook cells and are not shown here.

# The original snippet begins mid-method; the wrapper head below is
# reconstructed from context. torchvision segmentation models return an
# OrderedDict, so this wrapper unwraps the 'out' tensor for fastai.
class DL3(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        x = self.model(x)['out']
        return x


deeplab3 = DL3(dl3)
# torchvision.models.segmentation.fcn_resnet50(pretrained=False, num_classes=2, progress=False)

run_dir = config.LEARNER.runsave_dir + '/' + name
# os.mkdir(run_dir)
exp_name = 'fastai_dl3'
mlflow_CB = partial(
    MLFlowTracker,
    exp_name=exp_name,
    uri='file:/workspace/oct_ca_seg/runsaves/fastai_experiments/mlruns/',
    params=config.config_dict,
    log_model=True,
    nb_path="/workspace/oct_ca_seg/oct/02_caps.ipynb",
)
learner = Learner(data=data, model=deeplab3, metrics=metrics, callback_fns=mlflow_CB)

with mlflow.start_run():
    learner.fit_one_cycle(1, slice(config.LEARNER.lr), pct_start=0.9)
    MLPY.save_model(learner.model, run_dir + '/model')
    save_all_results(learner, run_dir, exp_name)
    saveConfigRun(
        config.config_dict,
        run_dir=Path(run_dir),
        name='configDL3__bs16_epochs30_lr0.001.json',
    )
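A minimal sketch of reloading the model saved above for inference, assuming
the MLflow model directory written by MLPY.save_model at `run_dir + '/model'`;
the input shape is an assumption for a 3-channel segmentation network:

import mlflow.pytorch
import torch

reloaded = mlflow.pytorch.load_model(run_dir + '/model')
reloaded.eval()
with torch.no_grad():
    # Returns per-pixel class logits of shape (N, num_classes, H, W).
    mask_logits = reloaded(torch.randn(1, 3, 256, 256))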
def train(config: Config): """学習処理の実行スクリプト.""" params = dc.asdict(config) pl.seed_everything(config.random_seed) # 学習を途中から再開する場合などの設定 cache_dir = directories.get_processed().joinpath(config.cache_dir) cache_dir.mkdir(exist_ok=True) trainer_params = dict() lastckpt = cache_dir.joinpath("last.ckpt") if config.resume: trainer_params["resume_from_checkpoint"] = str(lastckpt) elif lastckpt.exists(): lastckpt.unlink() for filepath in cache_dir.glob("epoch*.ckpt"): filepath.unlink() # 中間データの保存設定 model_checkpoint = pl_callbacks.ModelCheckpoint( filepath=str(cache_dir), monitor="val_loss", save_last=True, save_top_k=config.save_top_k, save_weights_only=config.save_weights_only, mode="min", period=1, ) # ログ設定 pl_logger = pl_loggers.MLFlowLogger( experiment_name=config.experiment_name, tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None), tags={ "mlflow.source.name": pathlib.Path(__file__).name, "mlflow.source.git.commit": ut.get_commit_id(), }, ) # ネットワーク、データセットの取得及び学習 network = get_network( NetworkName.value_of(config.network_name), in_channels=config.in_channels, out_channels=config.out_channels, ) model = AETrainer(network, params) pl_trainer = pl.Trainer( early_stop_callback=config.early_stop, default_root_dir=str(cache_dir), fast_dev_run=False, min_epochs=config.min_epochs, max_epochs=config.max_epochs, gpus=[0] if config.use_gpu and tc.is_available() else None, progress_bar_refresh_rate=config.progress_bar_refresh_rate, profiler=config.profiler, checkpoint_callback=model_checkpoint, logger=pl_logger, log_gpu_memory=True, **trainer_params, ) datamodule = get_datamodule( DatasetName.value_of(config.dataset_name), image_size=config.resize_image, batch_size=config.batch_size, num_workers=config.num_workers, ) pl_trainer.fit(model, datamodule) # ログに追加情報を設定 mlf_client = mlflow.tracking.MlflowClient() for key, val in pl_trainer.profiler.recorded_durations.items(): mlf_client.log_metric(pl_logger.run_id, f"{key}_mean", np.mean(val)) mlf_client.log_metric(pl_logger.run_id, f"{key}_sum", np.sum(val)) for ckptfile in cache_dir.glob("epoch*.ckpt"): model = model.load_from_checkpoint(str(ckptfile), network, params) with tempfile.TemporaryDirectory() as dname: mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem) mlf_pytorch.save_model(model.network, mlf_model_path) mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)