Example #1
    def _getCheckpointCallback(self, version: int = None, stage=None):
        model_dir = self.paths.get("model_dir", f'model/{DATE}')
        model_dir = os.path.join(model_dir, self.name)
        if version is not None:
            version = f"version_{version}" if isinstance(version, int) else version
            model_dir = os.path.join(model_dir, version)

        exist = os.path.exists(model_dir) and os.listdir(model_dir)
        if stage == 'test' and not exist:
            raise FileNotFoundError(model_dir)
        elif stage is None and exist:
            raise FileExistsError(model_dir)

        self.model_dir = model_dir

        checkpoint_callback = ModelCheckpoint(
            monitor="err/B-M/validation",
            dirpath=model_dir,
            filename="best",
            save_last=True,
            mode="min",
        )
        checkpoint_callback.FILE_EXTENSION = ".pth"
        checkpoint_callback.best_model_path = model_dir
        checkpoint_callback.CHECKPOINT_NAME_LAST = "latest"
        return checkpoint_callback
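The method above only builds and returns the callback; for orientation, here is a minimal, self-contained sketch (not part of the original example; the metric name, directory and the commented-out model/datamodule are placeholders) showing how such a customized ModelCheckpoint is attached to a Trainer:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",       # placeholder metric name
    dirpath="model/example",  # placeholder output directory
    filename="best",
    save_last=True,
    mode="min",
)
# The attribute overrides below mirror the example: they change the on-disk names.
checkpoint_callback.FILE_EXTENSION = ".pth"          # best.pth instead of best.ckpt
checkpoint_callback.CHECKPOINT_NAME_LAST = "latest"  # latest.pth instead of last.pth

trainer = pl.Trainer(max_epochs=10, callbacks=[checkpoint_callback])
# trainer.fit(model, datamodule=datamodule)  # model/datamodule assumed to exist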
Example #2
    def configure_checkpoint_callbacks(self) -> List[ModelCheckpoint]:
        train_callback = ModelCheckpoint(
            monitor=None,
            every_n_train_steps=self.config.training.checkpoint_interval,
            dirpath=get_mmf_env(key="save_dir"),
            filename="models/model_{step}",
            save_top_k=-1,
            save_last=True,
            verbose=True,
        )
        train_callback.CHECKPOINT_NAME_LAST = "current"
        return [train_callback]
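For reference, a stand-alone restatement of this configuration with literal values (get_mmf_env and self.config are MMF-specific and replaced by placeholders here): monitor=None disables metric ranking, save_top_k=-1 keeps every periodic checkpoint, and CHECKPOINT_NAME_LAST renames the last checkpoint.

from pytorch_lightning.callbacks import ModelCheckpoint

# Placeholder values stand in for self.config.training.checkpoint_interval
# and get_mmf_env(key="save_dir").
train_callback = ModelCheckpoint(
    monitor=None,                # no metric ranking
    every_n_train_steps=1000,    # save a checkpoint every 1000 training steps
    dirpath="save",
    filename="models/model_{step}",
    save_top_k=-1,               # keep all periodic checkpoints
    save_last=True,
    verbose=True,
)
train_callback.CHECKPOINT_NAME_LAST = "current"  # last checkpoint written as save/current.ckpt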
Example #3
def _train(
    task: str,
    ov: List[str],
    do_sweep: bool,
):
    """
    Run training

    Args:
        task: task to run training for
        ov: overwrites for config manager
        do_sweep: determine the best empirical parameters for the run
    """
    print(f"Overwrites: {ov}")
    initialize_config_module(config_module="nndet.conf")
    cfg = compose(task, "config.yaml", overrides=ov if ov is not None else [])

    assert cfg.host.parent_data is not None, 'Parent data cannot be None'
    assert cfg.host.parent_results is not None, 'Output dir cannot be None'

    train_dir = init_train_dir(cfg)

    pl_logger = MLFlowLogger(
        experiment_name=cfg["task"],
        tags={
            "host": socket.gethostname(),
            "fold": cfg["exp"]["fold"],
            "task": cfg["task"],
            "job_id": os.getenv('LSB_JOBID', 'no_id'),
            "mlflow.runName": cfg["exp"]["id"],
        },
        save_dir=os.getenv("MLFLOW_TRACKING_URI", "./mlruns"),
    )
    pl_logger.log_hyperparams(
        flatten_mapping(
            {"model": OmegaConf.to_container(cfg["model_cfg"], resolve=True)}))
    pl_logger.log_hyperparams(
        flatten_mapping({
            "trainer":
            OmegaConf.to_container(cfg["trainer_cfg"], resolve=True)
        }))

    logger.remove()
    logger.add(sys.stdout, format="{level} {message}", level="INFO")
    log_file = Path(os.getcwd()) / "train.log"
    logger.add(log_file, level="INFO")
    logger.info(f"Log file at {log_file}")

    meta_data = {}
    meta_data["torch_version"] = str(torch.__version__)
    meta_data["date"] = str(datetime.now())
    meta_data["git"] = log_git(nndet.__path__[0], repo_name="nndet")
    save_json(meta_data, "./meta.json")
    try:
        write_requirements_to_file("requirements.txt")
    except Exception as e:
        logger.error(f"Could not log req: {e}")

    plan_path = Path(str(cfg.host["plan_path"]))
    plan = load_pickle(plan_path)
    save_json(create_debug_plan(plan), "./plan_debug.json")

    data_dir = (Path(cfg.host["preprocessed_output_dir"])
                / plan["data_identifier"] / "imagesTr")

    datamodule = Datamodule(
        augment_cfg=OmegaConf.to_container(cfg["augment_cfg"], resolve=True),
        plan=plan,
        data_dir=data_dir,
        fold=cfg["exp"]["fold"],
    )
    module = MODULE_REGISTRY[cfg["module"]](
        model_cfg=OmegaConf.to_container(cfg["model_cfg"], resolve=True),
        trainer_cfg=OmegaConf.to_container(cfg["trainer_cfg"], resolve=True),
        plan=plan,
    )
    callbacks = []
    checkpoint_cb = ModelCheckpoint(
        dirpath=train_dir,
        filename='model_best',
        save_last=True,
        save_top_k=1,
        monitor=cfg["trainer_cfg"]["monitor_key"],
        mode=cfg["trainer_cfg"]["monitor_mode"],
    )
    checkpoint_cb.CHECKPOINT_NAME_LAST = 'model_last'
    callbacks.append(checkpoint_cb)
    callbacks.append(LearningRateMonitor(logging_interval="epoch"))

    OmegaConf.save(cfg, str(Path(os.getcwd()) / "config.yaml"))
    OmegaConf.save(cfg,
                   str(Path(os.getcwd()) / "config_resolved.yaml"),
                   resolve=True)
    save_pickle(plan, train_dir / "plan.pkl")  # backup plan
    splits = load_pickle(
        Path(cfg.host.preprocessed_output_dir) / datamodule.splits_file)
    save_pickle(splits, train_dir / "splits.pkl")

    trainer_kwargs = {}
    if cfg["train"]["mode"].lower() == "resume":
        trainer_kwargs["resume_from_checkpoint"] = train_dir / "model_last.ckpt"

    num_gpus = cfg["trainer_cfg"]["gpus"]
    logger.info(f"Using {num_gpus} GPUs for training")
    plugins = cfg["trainer_cfg"].get("plugins", None)
    logger.info(f"Using {plugins} plugins for training")

    trainer = pl.Trainer(
        gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
        accelerator=cfg["trainer_cfg"]["accelerator"],
        precision=cfg["trainer_cfg"]["precision"],
        amp_backend=cfg["trainer_cfg"]["amp_backend"],
        amp_level=cfg["trainer_cfg"]["amp_level"],
        benchmark=cfg["trainer_cfg"]["benchmark"],
        deterministic=cfg["trainer_cfg"]["deterministic"],
        callbacks=callbacks,
        logger=pl_logger,
        max_epochs=module.max_epochs,
        progress_bar_refresh_rate=None if bool(int(os.getenv("det_verbose", 1))) else 0,
        reload_dataloaders_every_epoch=False,
        num_sanity_val_steps=10,
        weights_summary='full',
        plugins=plugins,
        terminate_on_nan=True,  # TODO: make modular
        move_metrics_to_cpu=True,
        **trainer_kwargs)
    trainer.fit(module, datamodule=datamodule)

    if do_sweep:
        case_ids = splits[cfg["exp"]["fold"]]["val"]
        if "debug" in cfg and "num_cases_val" in cfg["debug"]:
            case_ids = case_ids[:cfg["debug"]["num_cases_val"]]

        inference_plan = module.sweep(
            cfg=OmegaConf.to_container(cfg, resolve=True),
            save_dir=train_dir,
            train_data_dir=data_dir,
            case_ids=case_ids,
            run_prediction=True,
        )

        plan["inference_plan"] = inference_plan
        save_pickle(plan, train_dir / "plan_inference.pkl")

        ensembler_cls = module.get_ensembler_cls(
            key="boxes",
            dim=plan["network_dim"])  # TODO: make this configurable
        for restore in [True, False]:
            target_dir = (train_dir / "val_predictions" if restore
                          else train_dir / "val_predictions_preprocessed")
            extract_results(
                source_dir=train_dir / "sweep_predictions",
                target_dir=target_dir,
                ensembler_cls=ensembler_cls,
                restore=restore,
                **inference_plan,
            )

        _evaluate(
            task=cfg["task"],
            model=cfg["exp"]["id"],
            fold=cfg["exp"]["fold"],
            test=False,
            do_boxes_eval=True,  # TODO: make this configurable
            do_analyze_boxes=True,  # TODO: make this configurable
        )
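A note on the resume logic in this example: the resume_from_checkpoint Trainer argument belongs to the older PyTorch Lightning API targeted here; in current releases the checkpoint path is passed to fit() instead. A minimal sketch, reusing module, datamodule, callbacks, pl_logger and train_dir from the example above:

# Sketch only; assumes the objects created in the example above are in scope.
trainer = pl.Trainer(callbacks=callbacks, logger=pl_logger, max_epochs=module.max_epochs)
trainer.fit(module, datamodule=datamodule,
            ckpt_path=str(train_dir / "model_last.ckpt"))  # replaces resume_from_checkpoint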
Example #4
    logger.debug(args)
    segnet_model = LitSegNet(conf=args, viz=args.viz, dataset_seq=args.dataset_seq,
                             save=args.save, save_xp=args.save_xp, test_set=args.test_set,
                             nopredict=args.nopredict, test_max=args.test_samples)

    if args.prefix is None:
        args.prefix = segnet_model.hparams.save_prefix
    logger.debug(args.prefix)

    checkpoint_callback = ModelCheckpoint(
        dirpath='lightning_logs',
        filename=args.prefix+'-{epoch}-{val_loss:.4f}',
        verbose=True,
        monitor='val_loss',
        mode='min',
        save_last=True
    )
    checkpoint_callback.CHECKPOINT_NAME_LAST = f"{args.prefix}-last"

    lr_monitor = LearningRateMonitor(logging_interval='step')

    callbacks = [lr_monitor]

    if args.train:
        logger.warning("Training phase")
        wandb_logger = WandbLogger(project='segnet-freiburg', log_model=False, name=segnet_model.hparams.save_prefix)
        wandb_logger.log_hyperparams(segnet_model.hparams)
        #wandb_logger.watch(segnet_model, log='parameters', log_freq=100)



        if args.update_output_layer or args.init:
            segnet_model = segnet_model.load_from_checkpoint(checkpoint_path=args.train_checkpoint, conf=args)
Example #5
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument('--batch_size',
                        type=int,
                        help='Size of the batches for each training step.')
    parser.add_argument('--max_epoch',
                        type=int,
                        help='The maximum number of training epoch.')
    parser.add_argument('--data_path',
                        type=str,
                        default='/data/CAM_IO/robot/images',
                        help='Path to the extracted image data.')
    parser.add_argument('--log_path',
                        type=str,
                        default='/OOB_RECOG/logs',
                        help='Directory where logs are saved.')
    parser.add_argument('--num_gpus',
                        type=int,
                        default=2,
                        help='The number of GPUs using for training.')

    ## init lr
    parser.add_argument('--init_lr', type=float, help='initial learning rate for the optimizer')

    ## logs are saved under project_name
    parser.add_argument('--project_name',
                        type=str,
                        help='project name under which logs are saved')

    ## Robot Lapa
    parser.add_argument('--dataset',
                        type=str,
                        default='robot',
                        choices=['robot', 'lapa'],
                        help='choice of dataset: robot or lapa')

    ## OOB NIR
    parser.add_argument('--task',
                        type=str,
                        default='OOB',
                        choices=['OOB', 'NIR'],
                        help='choice of task: OOB or NIR')

    args, _ = parser.parse_known_args()

    # hyperparameter setting
    config_hparams = {
        'optimizer_lr': args.init_lr,
    }

    print('\n\n')
    print('batch size : ', args.batch_size)
    print('init lr : ', config_hparams['optimizer_lr'])
    print('\n\n')

    # GPU device indices to use
    os.environ['CUDA_VISIBLE_DEVICES'] = '0, 1'

    # extracted images path
    base_path = args.data_path

    # log base path
    log_base_path = os.path.join(args.log_path, args.dataset, args.task)

    # batch size
    BATCH_SIZE = args.batch_size

    # model
    model = CAMIO(config_hparams)  # model used by the Trainer // add config_hparams

    # dataset setup
    trainset = CAMIO_Dataset(base_path,
                             is_train=True,
                             test_mode=False,
                             data_ratio=1)
    valiset = CAMIO_Dataset(base_path,
                            is_train=False,
                            test_mode=False,
                            data_ratio=1)

    print('trainset len : ', len(trainset))
    print('valiset len : ', len(valiset))

    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=False,
                                               num_workers=8)
    vali_loader = torch.utils.data.DataLoader(valiset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False,
                                              num_workers=8)

    ###### this section checks that the dataset returns images and labels correctly ####
    ### This section only saves all image and label information of the constructed Dataset to /get_dataset_results so you can verify what data the model trains on.
    ### It is not required for training.
    # dataset log check
    ''' 
    trainset_save_path = os.path.join('get_dataset_results', 'trainset')
    valiset_save_path = os.path.join('get_dataset_results', 'valiset')

    trainset_df = pd.DataFrame({
        'img_list' : trainset.img_list,
        'label_list' : trainset.label_list
    })

    valiset_df = pd.DataFrame({
        'img_list' : valiset.img_list,
        'label_list' : valiset.label_list
    })
    
    # save as csv
    # trainset_df.to_csv(os.path.join(trainset_save_path, 'trainset.csv'), mode='w')
    valiset_df.to_csv(os.path.join(valiset_save_path, 'valiset.csv'), mode='w')

    # copy the used images (img_list) | train

    label_class = [0, 1]
    copied_count = [0, 0]
    for tar_label in label_class :
        for img_path in tqdm(trainset_df[trainset_df['label_list'] == tar_label]['img_list'], desc='trainset copying_{}'.format(tar_label)) :
            file_name, file_ext = os.path.basename(img_path).split('.')
            copy_name = file_name + '_copy_' + str(tar_label) + '.' + file_ext
            shutil.copyfile(img_path, os.path.join(trainset_save_path, str(tar_label), copy_name))

            copied_count[tar_label]+=1
    

    print('train copied count : ', copied_count)
    

    # copy the used images (img_list) | validation
    label_class = [0, 1]
    copied_count = [0, 0]
    for tar_label in label_class :
        for img_path in tqdm(valiset_df[valiset_df['label_list'] == tar_label]['img_list'], desc='valiset copying_{}'.format(tar_label)) :
            file_name, file_ext = os.path.basename(img_path).split('.')
            copy_name = file_name + 'copy' + '_' + str(tar_label) + '.' + file_ext
            shutil.copyfile(img_path, os.path.join(valiset_save_path, str(tar_label), copy_name))

            copied_count[tar_label]+=1

    print('vali copied count : ', copied_count)
    '''

    ##### ### ###
    """
        dirpath : where the checkpoints are saved
        filename : name of the saved checkpoint file
        monitor : metric used to decide which checkpoint to keep
    """
    checkpoint_filename = 'ckpoint_{}-batch={}-lr={}-'.format(
        args.project_name, BATCH_SIZE, args.init_lr)
    checkpoint_filename = checkpoint_filename + '{epoch}-{val_loss:.4f}'
    checkpoint_callback = ModelCheckpoint(
        dirpath=os.path.join(log_base_path, args.project_name),
        filename=checkpoint_filename,  # {epoch}-{val_loss:.4f}
        save_top_k=1,
        save_last=True,
        verbose=True,
        monitor="val_loss",
        mode="min")

    # change last checkpoint name
    checkpoint_callback.CHECKPOINT_NAME_LAST = 'ckpoint_{}-batch={}-lr={}-'.format(
        args.project_name, BATCH_SIZE, args.init_lr) + '{epoch}-last'
    """
        tensorboard logger
        save_dir : where the tensorboard logs are saved, same as the checkpoint log location
        name : folder name for the tensorboard logs (subfolders version_0, version_1, ... are created inside)
        default_hp_metric : not sure what it does; set to False because it was distracting
    """
    tb_logger = pl_loggers.TensorBoardLogger(
        save_dir=os.path.join(log_base_path, args.project_name),
        name='TB_log',
        default_hp_metric=False)
    """
        gpus : how many GPUs to use; no ddp when using only 1
        max_epochs : maximum number of epochs to train
        checkpoint_callback : the callback defined above
        logger : tensorboard logger; other custom loggers can be used as well
        accelerator : multi-GPU mode setting
    """

    # pytorch lightning Trainer Class
    ## train
    trainer = pl.Trainer(gpus=args.num_gpus,
                         max_epochs=args.max_epoch,
                         checkpoint_callback=checkpoint_callback,
                         logger=tb_logger,
                         accelerator='ddp')
    trainer.fit(model, train_loader, vali_loader)
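For reference (not part of the original example): recent PyTorch Lightning releases removed the checkpoint_callback Trainer argument and the accelerator='ddp' spelling; the callback is passed via callbacks= and DDP is selected via strategy=. A hedged sketch of the equivalent setup, reusing the objects defined above:

    # Sketch only; assumes model, train_loader, vali_loader, tb_logger and
    # checkpoint_callback from the example above are in scope.
    trainer = pl.Trainer(accelerator='gpu',
                         devices=args.num_gpus,
                         strategy='ddp',
                         max_epochs=args.max_epoch,
                         callbacks=[checkpoint_callback],
                         logger=tb_logger)
    trainer.fit(model, train_loader, vali_loader)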