def _getCheckpointCallback(self, version: int = None, stage=None):
    model_dir = self.paths.get("model_dir", f'model/{DATE}')
    model_dir = os.path.join(model_dir, self.name)
    if version or version == 0:
        version = f"version_{version}" if isinstance(version, int) else version
        model_dir = os.path.join(model_dir, version)

    exist = os.path.exists(model_dir) and os.listdir(model_dir)
    if stage == 'test' and not exist:
        raise FileNotFoundError(model_dir)
    elif stage is None and exist:
        raise FileExistsError(model_dir)
    self.model_dir = model_dir

    checkpoint_callback = ModelCheckpoint(
        monitor="err/B-M/validation",
        dirpath=model_dir,
        filename="best",
        save_last=True,
        mode="min",
    )
    checkpoint_callback.FILE_EXTENSION = ".pth"
    checkpoint_callback.best_model_path = model_dir
    checkpoint_callback.CHECKPOINT_NAME_LAST = "latest"
    return checkpoint_callback
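
# A minimal, standalone sketch of the same checkpoint setup outside the class above:
# a ModelCheckpoint that tracks the "err/B-M/validation" metric, writes files with a
# ".pth" extension, and names the rolling last checkpoint "latest". The helper name
# and the example directory in the usage comment are assumptions for illustration.
from pytorch_lightning.callbacks import ModelCheckpoint


def make_checkpoint_callback(model_dir: str) -> ModelCheckpoint:
    callback = ModelCheckpoint(
        monitor="err/B-M/validation",
        dirpath=model_dir,
        filename="best",
        save_last=True,
        mode="min",
    )
    callback.FILE_EXTENSION = ".pth"          # checkpoints end in .pth instead of the default .ckpt
    callback.CHECKPOINT_NAME_LAST = "latest"  # the rolling last checkpoint becomes latest.pth
    return callback


# Usage sketch:
# cb = make_checkpoint_callback("model/<DATE>/<name>/version_0")
# trainer = pl.Trainer(callbacks=[cb], ...)
# after training, cb.best_model_path points at the best checkpoint and
# cb.last_model_path at latest.pth.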
def configure_checkpoint_callbacks(self) -> List[ModelCheckpoint]:
    train_callback = ModelCheckpoint(
        monitor=None,
        every_n_train_steps=self.config.training.checkpoint_interval,
        dirpath=get_mmf_env(key="save_dir"),
        filename="models/model_{step}",
        save_top_k=-1,
        save_last=True,
        verbose=True,
    )
    train_callback.CHECKPOINT_NAME_LAST = "current"
    return [train_callback]
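
# A standalone sketch of the same interval-based checkpointing: with monitor=None and
# save_top_k=-1 no metric is tracked and every checkpoint is kept; a new file is
# written every `checkpoint_interval` training steps and the rolling last checkpoint
# is named "current". The default interval and the save_dir argument here are
# illustrative assumptions.
from pytorch_lightning.callbacks import ModelCheckpoint


def build_interval_checkpoint(save_dir: str, checkpoint_interval: int = 1000) -> ModelCheckpoint:
    callback = ModelCheckpoint(
        monitor=None,                             # checkpoint purely on step count, not on a metric
        every_n_train_steps=checkpoint_interval,  # write a checkpoint every N training steps
        dirpath=save_dir,
        filename="models/model_{step}",           # the step number is interpolated into the filename
        save_top_k=-1,                            # keep all interval checkpoints
        save_last=True,
        verbose=True,
    )
    callback.CHECKPOINT_NAME_LAST = "current"     # rolling last checkpoint saved as current.ckpt
    return callback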
def _train(
    task: str,
    ov: List[str],
    do_sweep: bool,
):
    """
    Run training

    Args:
        task: task to run training for
        ov: overwrites for config manager
        do_sweep: determine best empirical parameters for run
    """
    print(f"Overwrites: {ov}")
    initialize_config_module(config_module="nndet.conf")
    cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])

    assert cfg.host.parent_data is not None, 'Parent data can not be None'
    assert cfg.host.parent_results is not None, 'Output dir can not be None'

    train_dir = init_train_dir(cfg)

    pl_logger = MLFlowLogger(
        experiment_name=cfg["task"],
        tags={
            "host": socket.gethostname(),
            "fold": cfg["exp"]["fold"],
            "task": cfg["task"],
            "job_id": os.getenv('LSB_JOBID', 'no_id'),
            "mlflow.runName": cfg["exp"]["id"],
        },
        save_dir=os.getenv("MLFLOW_TRACKING_URI", "./mlruns"),
    )
    pl_logger.log_hyperparams(
        flatten_mapping(
            {"model": OmegaConf.to_container(cfg["model_cfg"], resolve=True)}))
    pl_logger.log_hyperparams(
        flatten_mapping({
            "trainer": OmegaConf.to_container(cfg["trainer_cfg"], resolve=True)
        }))

    logger.remove()
    logger.add(sys.stdout, format="{level} {message}", level="INFO")
    log_file = Path(os.getcwd()) / "train.log"
    logger.add(log_file, level="INFO")
    logger.info(f"Log file at {log_file}")

    meta_data = {}
    meta_data["torch_version"] = str(torch.__version__)
    meta_data["date"] = str(datetime.now())
    meta_data["git"] = log_git(nndet.__path__[0], repo_name="nndet")
    save_json(meta_data, "./meta.json")
    try:
        write_requirements_to_file("requirements.txt")
    except Exception as e:
        logger.error(f"Could not log req: {e}")

    plan_path = Path(str(cfg.host["plan_path"]))
    plan = load_pickle(plan_path)
    save_json(create_debug_plan(plan), "./plan_debug.json")

    data_dir = Path(cfg.host["preprocessed_output_dir"]) / plan["data_identifier"] / "imagesTr"

    datamodule = Datamodule(
        augment_cfg=OmegaConf.to_container(cfg["augment_cfg"], resolve=True),
        plan=plan,
        data_dir=data_dir,
        fold=cfg["exp"]["fold"],
    )
    module = MODULE_REGISTRY[cfg["module"]](
        model_cfg=OmegaConf.to_container(cfg["model_cfg"], resolve=True),
        trainer_cfg=OmegaConf.to_container(cfg["trainer_cfg"], resolve=True),
        plan=plan,
    )

    callbacks = []
    checkpoint_cb = ModelCheckpoint(
        dirpath=train_dir,
        filename='model_best',
        save_last=True,
        save_top_k=1,
        monitor=cfg["trainer_cfg"]["monitor_key"],
        mode=cfg["trainer_cfg"]["monitor_mode"],
    )
    checkpoint_cb.CHECKPOINT_NAME_LAST = 'model_last'
    callbacks.append(checkpoint_cb)
    callbacks.append(LearningRateMonitor(logging_interval="epoch"))

    OmegaConf.save(cfg, str(Path(os.getcwd()) / "config.yaml"))
    OmegaConf.save(cfg, str(Path(os.getcwd()) / "config_resolved.yaml"), resolve=True)
    save_pickle(plan, train_dir / "plan.pkl")  # backup plan
    splits = load_pickle(
        Path(cfg.host.preprocessed_output_dir) / datamodule.splits_file)
    save_pickle(splits, train_dir / "splits.pkl")

    trainer_kwargs = {}
    if cfg["train"]["mode"].lower() == "resume":
        trainer_kwargs["resume_from_checkpoint"] = train_dir / "model_last.ckpt"

    num_gpus = cfg["trainer_cfg"]["gpus"]
    logger.info(f"Using {num_gpus} GPUs for training")
    plugins = cfg["trainer_cfg"].get("plugins", None)
    logger.info(f"Using {plugins} plugins for training")

    trainer = pl.Trainer(
        gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
        accelerator=cfg["trainer_cfg"]["accelerator"],
        precision=cfg["trainer_cfg"]["precision"],
        amp_backend=cfg["trainer_cfg"]["amp_backend"],
        amp_level=cfg["trainer_cfg"]["amp_level"],
        benchmark=cfg["trainer_cfg"]["benchmark"],
        deterministic=cfg["trainer_cfg"]["deterministic"],
        callbacks=callbacks,
        logger=pl_logger,
        max_epochs=module.max_epochs,
        progress_bar_refresh_rate=None if bool(int(os.getenv("det_verbose", 1))) else 0,
        reload_dataloaders_every_epoch=False,
        num_sanity_val_steps=10,
        weights_summary='full',
        plugins=plugins,
        terminate_on_nan=True,  # TODO: make modular
        move_metrics_to_cpu=True,
        **trainer_kwargs)

    trainer.fit(module, datamodule=datamodule)

    if do_sweep:
        case_ids = splits[cfg["exp"]["fold"]]["val"]
        if "debug" in cfg and "num_cases_val" in cfg["debug"]:
            case_ids = case_ids[:cfg["debug"]["num_cases_val"]]

        inference_plan = module.sweep(
            cfg=OmegaConf.to_container(cfg, resolve=True),
            save_dir=train_dir,
            train_data_dir=data_dir,
            case_ids=case_ids,
            run_prediction=True,
        )

        plan["inference_plan"] = inference_plan
        save_pickle(plan, train_dir / "plan_inference.pkl")

        ensembler_cls = module.get_ensembler_cls(
            key="boxes", dim=plan["network_dim"])  # TODO: make this configurable
        for restore in [True, False]:
            target_dir = train_dir / "val_predictions" if restore else \
                train_dir / "val_predictions_preprocessed"
            extract_results(
                source_dir=train_dir / "sweep_predictions",
                target_dir=target_dir,
                ensembler_cls=ensembler_cls,
                restore=restore,
                **inference_plan,
            )

        _evaluate(
            task=cfg["task"],
            model=cfg["exp"]["id"],
            fold=cfg["exp"]["fold"],
            test=False,
            do_boxes_eval=True,  # TODO: make this configurable
            do_analyze_boxes=True,  # TODO: make this configurable
        )
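
# A minimal, self-contained sketch of the resume pattern used above: because
# CHECKPOINT_NAME_LAST is set to 'model_last', the rolling checkpoint always lands at
# train_dir / "model_last.ckpt", which makes the resume path predictable. This mirrors
# the older Lightning resume_from_checkpoint API used above; the monitor key, the
# max_epochs value, and the `LitModel`/`dm` names in the usage comment are illustrative
# assumptions, not taken from the config.
from pathlib import Path

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


def build_trainer(train_dir: Path, resume: bool) -> pl.Trainer:
    checkpoint_cb = ModelCheckpoint(
        dirpath=train_dir,
        filename="model_best",
        save_last=True,
        save_top_k=1,
        monitor="val_loss",  # stand-in for cfg["trainer_cfg"]["monitor_key"]
        mode="min",
    )
    checkpoint_cb.CHECKPOINT_NAME_LAST = "model_last"

    trainer_kwargs = {}
    if resume:
        # the path matches the name chosen via CHECKPOINT_NAME_LAST
        trainer_kwargs["resume_from_checkpoint"] = str(train_dir / "model_last.ckpt")

    return pl.Trainer(callbacks=[checkpoint_cb], max_epochs=50, **trainer_kwargs)


# trainer = build_trainer(Path("./runs/fold0"), resume=True)
# trainer.fit(LitModel(), datamodule=dm)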
logger.debug(args)

segnet_model = LitSegNet(conf=args, viz=args.viz, dataset_seq=args.dataset_seq, save=args.save,
                         save_xp=args.save_xp, test_set=args.test_set, nopredict=args.nopredict,
                         test_max=args.test_samples)

if args.prefix is None:
    args.prefix = segnet_model.hparams.save_prefix
logger.debug(args.prefix)

checkpoint_callback = ModelCheckpoint(
    dirpath='lightning_logs',
    filename=args.prefix + '-{epoch}-{val_loss:.4f}',
    verbose=True,
    monitor='val_loss',
    mode='min',
    save_last=True
)
checkpoint_callback.CHECKPOINT_NAME_LAST = f"{args.prefix}-last"

lr_monitor = LearningRateMonitor(logging_interval='step')
callbacks = [lr_monitor]

if args.train:
    logger.warning("Training phase")

    wandb_logger = WandbLogger(project='segnet-freiburg', log_model=False,
                               name=segnet_model.hparams.save_prefix)
    wandb_logger.log_hyperparams(segnet_model.hparams)
    # wandb_logger.watch(segnet_model, log='parameters', log_freq=100)

    if args.update_output_layer or args.init:
        segnet_model = segnet_model.load_from_checkpoint(checkpoint_path=args.train_checkpoint, conf=args)
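
# A condensed, standalone sketch of the naming scheme above: a static prefix is
# combined with Lightning's template fields, so '{epoch}' and '{val_loss:.4f}' are
# filled in whenever a checkpoint is written, while the last checkpoint is forced to
# "<prefix>-last.ckpt". The `prefix` value and the commented Trainer call are
# illustrative assumptions.
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

prefix = "segnet-run"
ckpt_cb = ModelCheckpoint(
    dirpath="lightning_logs",
    filename=prefix + "-{epoch}-{val_loss:.4f}",  # requires the module to log "val_loss"
    monitor="val_loss",
    mode="min",
    save_last=True,
    verbose=True,
)
ckpt_cb.CHECKPOINT_NAME_LAST = f"{prefix}-last"
lr_cb = LearningRateMonitor(logging_interval="step")

# trainer = pl.Trainer(callbacks=[lr_cb, ckpt_cb], max_epochs=100)
# trainer.fit(segnet_module, datamodule)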
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, help='Size of the batches for each training step.')
    parser.add_argument('--max_epoch', type=int, help='The maximum number of training epochs.')
    parser.add_argument('--data_path', type=str, default='/data/CAM_IO/robot/images', help='Data path :)')
    parser.add_argument('--log_path', type=str, default='/OOB_RECOG/logs', help='Log path :)')
    parser.add_argument('--num_gpus', type=int, default=2, help='The number of GPUs used for training.')

    ## init lr
    parser.add_argument('--init_lr', type=float, help='Initial lr for the optimizer.')

    ## logs are saved under project_name
    parser.add_argument('--project_name', type=str, help='Logs are saved under project_name.')

    ## Robot / Lapa
    parser.add_argument('--dataset', type=str, default='robot', choices=['robot', 'lapa'],
                        help='[robot, lapa] choice of dataset')

    ## OOB / NIR
    parser.add_argument('--task', type=str, default='OOB', choices=['OOB', 'NIR'],
                        help='[OOB, NIR] choice of task')

    args, _ = parser.parse_known_args()

    # hyperparameter setting
    config_hparams = {
        'optimizer_lr': args.init_lr,
    }

    print('\n\n')
    print('batch size : ', args.batch_size)
    print('init lr : ', config_hparams['optimizer_lr'])
    print('\n\n')

    # GPU device ids to use
    os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'

    # extracted images path
    base_path = args.data_path

    # log base path
    log_base_path = os.path.join(args.log_path, args.dataset, args.task)

    # batch size
    BATCH_SIZE = args.batch_size

    # model used by the Trainer // add config_hparams
    model = CAMIO(config_hparams)

    # dataset setup
    trainset = CAMIO_Dataset(base_path, is_train=True, test_mode=False, data_ratio=1)
    valiset = CAMIO_Dataset(base_path, is_train=False, test_mode=False, data_ratio=1)

    print('trainset len : ', len(trainset))
    print('valiset len : ', len(valiset))

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)
    vali_loader = torch.utils.data.DataLoader(valiset, batch_size=BATCH_SIZE, shuffle=False, num_workers=8)

    ###### this section checks that the dataset returns images and labels correctly ####
    ### It only dumps the image and label lists of the constructed Datasets to /get_dataset_results
    ### so the training data can be inspected; it is not required for training.
    # dataset log check
    '''
    trainset_save_path = os.path.join('get_dataset_results', 'trainset')
    valiset_save_path = os.path.join('get_dataset_results', 'valiset')

    trainset_df = pd.DataFrame({
        'img_list': trainset.img_list,
        'label_list': trainset.label_list
    })
    valiset_df = pd.DataFrame({
        'img_list': valiset.img_list,
        'label_list': valiset.label_list
    })

    # save csv
    trainset_df.to_csv(os.path.join(trainset_save_path, 'trainset.csv'), mode='w')
    valiset_df.to_csv(os.path.join(valiset_save_path, 'valiset.csv'), mode='w')

    # copy the used images (img_list) | train
    label_class = [0, 1]
    copied_count = [0, 0]
    for tar_label in label_class:
        for img_path in tqdm(trainset_df[trainset_df['label_list'] == tar_label]['img_list'],
                             desc='trainset copying_{}'.format(tar_label)):
            file_name, file_ext = os.path.basename(img_path).split('.')
            copy_name = file_name + '_copy_' + str(tar_label) + '.' + file_ext
            shutil.copyfile(img_path, os.path.join(trainset_save_path, str(tar_label), copy_name))
            copied_count[tar_label] += 1
    print('train copied count : ', copied_count)

    # copy the used images (img_list) | validation
    label_class = [0, 1]
    copied_count = [0, 0]
    for tar_label in label_class:
        for img_path in tqdm(valiset_df[valiset_df['label_list'] == tar_label]['img_list'],
                             desc='valiset copying_{}'.format(tar_label)):
            file_name, file_ext = os.path.basename(img_path).split('.')
            copy_name = file_name + 'copy' + '_' + str(tar_label) + '.' + file_ext
            shutil.copyfile(img_path, os.path.join(valiset_save_path, str(tar_label), copy_name))
            copied_count[tar_label] += 1
    print('vali copied count : ', copied_count)
    '''
    ##### ### ###

    """
    dirpath : where the checkpoints are saved
    filename : name of the saved checkpoint file
    monitor : metric used to select the checkpoint
    """
    checkpoint_filename = 'ckpoint_{}-batch={}-lr={}-'.format(
        args.project_name, BATCH_SIZE, args.init_lr)
    checkpoint_filename = checkpoint_filename + '{epoch}-{val_loss:.4f}'
    checkpoint_callback = ModelCheckpoint(
        dirpath=os.path.join(log_base_path, args.project_name),
        filename=checkpoint_filename,  # {epoch}-{val_loss:.4f}
        save_top_k=1,
        save_last=True,
        verbose=True,
        monitor="val_loss",
        mode="min")

    # change last checkpoint name
    checkpoint_callback.CHECKPOINT_NAME_LAST = 'ckpoint_{}-batch={}-lr={}-'.format(
        args.project_name, BATCH_SIZE, args.init_lr) + '{epoch}-last'

    """
    tensorboard logger
    save_dir : where the tensorboard logs are saved, analogous to the checkpoint location
    name : folder name for the tensorboard logs (subfolders version_0, version_1, ... are created inside it)
    default_hp_metric : unclear what it does; set to False because it clutters the logs
    """
    tb_logger = pl_loggers.TensorBoardLogger(
        save_dir=os.path.join(log_base_path, args.project_name),
        name='TB_log',
        default_hp_metric=False)

    """
    gpus : number of GPUs to use; with a single GPU, ddp is not used
    max_epochs : maximum number of epochs
    checkpoint_callback : the callback defined above
    logger : tensorboard logger; other custom loggers can also be used
    accelerator : multi-GPU mode setting
    """
    # pytorch lightning Trainer class
    ## train
    trainer = pl.Trainer(gpus=args.num_gpus,
                         max_epochs=args.max_epoch,
                         checkpoint_callback=checkpoint_callback,
                         logger=tb_logger,
                         accelerator='ddp')

    trainer.fit(model, train_loader, vali_loader)
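    # Sketch (illustrative addition, not in the original script): after fit() the
    # callback records where the checkpoints were written; best_model_path follows the
    # monitored "val_loss", last_model_path follows the CHECKPOINT_NAME_LAST template.
    print('best checkpoint : ', checkpoint_callback.best_model_path)
    print('last checkpoint : ', checkpoint_callback.last_model_path)


if __name__ == '__main__':
    # assumed entry point so the script can be run directly
    train()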