Example #1
def train_center_net(train_df, oof_df):
    train_dataset = centernet.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True, pin_memory=True)
    oof_dataset = centernet.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4, pin_memory=True)
    model = Resnest50CenterNet(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_map',
                                         patience=10,
                                         mode='max',
                                         verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir),
                                           monitor='val_map',
                                           verbose=True,
                                           mode='max',
                                           save_top_k=1)
    cbs = [
        callbacks.LearningRateLogger()
    ]
    trainer = Trainer(gpus=1,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      callbacks=cbs,
                      benchmark=True,
                      deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader,
                val_dataloaders=oof_dataloader)

    valid_dataset = centernet.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
Example #2
    def run(config="config/base.yml"):
        config = util.load_config(config)
        now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        run_dir = path.join("wandb", now)
        run_dir = path.abspath(run_dir)
        os.environ['WANDB_PROJECT'] = "linear_turing"
        os.environ['TOKENIZERS_PARALLELISM'] = 'true'

        checkpoint_callback = callbacks.ModelCheckpoint(monitor='val_loss',
                                                        mode='min',
                                                        save_weights_only=True,
                                                        save_last=True,
                                                        filename='{epoch}_{val_loss:.2f}')

        other_callbacks = [
            pl.callbacks.LearningRateMonitor(),
            callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=10)
        ]

        experiment = Experiment(config)

        trainer = pl.Trainer(logger=pl.loggers.WandbLogger(log_model=True),
                             checkpoint_callback=checkpoint_callback,
                             callbacks=other_callbacks,
                             **config['trainer'])

        trainer.fit(experiment)
Example #3
def train(args):
    model = RNN()
    data_module = DataModule(args)

    callbacks_list = None
    if args.val_path:
        callbacks_list = []
        callbacks_list.append(callbacks.EarlyStopping(monitor='val_acc', patience=PATIENCE))
        callbacks_list.append(callbacks.ModelCheckpoint(filepath=args.out_path, monitor='val_acc', prefix='rnn'))

    gpus = N_GPU if torch.cuda.is_available() else None
    trainer = pl.Trainer(gpus=gpus, max_epochs=MAX_EPOCHS, callbacks=callbacks_list)

    trainer.fit(model, datamodule=data_module)
Example #4
def get_callbacks(cfg, output_dir):
    checkpoint_path = os.path.join(output_dir, cfg.CHECKPOINT.NAME)
    checkpoint = pl_callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                              save_last=False,
                                              monitor=cfg.CHECKPOINT.MONITOR,
                                              mode=cfg.CHECKPOINT.MONITOR_MODE)
    cs = [
        pl_callbacks.EarlyStopping(monitor=cfg.CHECKPOINT.MONITOR,
                                   mode=cfg.CHECKPOINT.MONITOR_MODE,
                                   **cfg.EARLY_STOPPING),
        pl_callbacks.LearningRateLogger(),
        inspector.AnalysisCallback()
    ]
    return checkpoint, cs
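Since the checkpoint is returned separately from the list, a caller on this older Lightning API would wire the two through different Trainer arguments. A minimal usage sketch under assumed names (cfg, output_dir, model, and data_module are placeholders, not from the original project):

# Hypothetical caller for get_callbacks(); all names here are assumptions.
checkpoint, cs = get_callbacks(cfg, output_dir)
trainer = pl.Trainer(checkpoint_callback=checkpoint,  # old-API keyword argument
                     callbacks=cs)
trainer.fit(model, datamodule=data_module)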
Example #5
def load_callbacks():
    callbacks = []
    callbacks.append(
        plc.EarlyStopping(monitor='val_acc',
                          mode='max',
                          patience=10,
                          min_delta=0.001))

    callbacks.append(
        plc.ModelCheckpoint(monitor='val_acc',
                            filename='best-{epoch:02d}-{val_acc:.3f}',
                            save_top_k=1,
                            mode='max',
                            save_last=True))

    if args.lr_scheduler:
        callbacks.append(plc.LearningRateMonitor(logging_interval='epoch'))
    return callbacks
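A minimal usage sketch for load_callbacks(), assuming a model and data_module defined elsewhere (both placeholders, not from the original project):

# Hypothetical caller: the assembled callback list goes straight to the Trainer.
trainer = pl.Trainer(gpus=1, max_epochs=100, callbacks=load_callbacks())
trainer.fit(model, datamodule=data_module)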
Example #6
def get_loggers_callbacks(args, model=None):

    try:
        # Setup logger(s) params
        csv_logger_params = dict(
            save_dir="./experiments",
            name=os.path.join(*args.save_dir.split("/")[1:-1]),
            version=args.save_dir.split("/")[-1],
        )
        wandb_logger_params = dict(
            log_model=False,
            name=os.path.join(*args.save_dir.split("/")[1:]),
            offline=args.debug,
            project="utime",
            save_dir=args.save_dir,
        )
        loggers = [
            pl_loggers.CSVLogger(**csv_logger_params),
            pl_loggers.WandbLogger(**wandb_logger_params),
        ]
        if model:
            loggers[-1].watch(model)

        # Setup callback(s) params
        checkpoint_monitor_params = dict(
            filepath=os.path.join(args.save_dir,
                                  "{epoch:03d}-{eval_loss:.2f}"),
            monitor=args.checkpoint_monitor,
            save_last=True,
            save_top_k=1,
        )
        earlystopping_parameters = dict(
            monitor=args.earlystopping_monitor,
            patience=args.earlystopping_patience,
        )
        callbacks = [
            pl_callbacks.ModelCheckpoint(**checkpoint_monitor_params),
            pl_callbacks.EarlyStopping(**earlystopping_parameters),
            pl_callbacks.LearningRateMonitor(),
        ]

        return loggers, callbacks
    except AttributeError:
        return None, None
Example #7
def main():
    logger.remove()
    logger.add(sys.stdout,
               colorize=True,
               format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> " +
               "| <level>{level}</level> " +
               "| <light-black>{file.path}:{line}</light-black> | {message}")
    hparams = parse_args()
    if hparams.restore:
        wandb.init(project=hparams.project, tags=hparams.tags)
        model = LevelClassification.load_from_checkpoint(hparams.restore)
        logger.info("Restored model")
    else:
        # wandb.init is called in LevelClassification
        model = LevelClassification(hparams)
        experiment_logger = loggers.WandbLogger(project=hparams.project,
                                                tags=hparams.tags)
        hparams.checkpoint_dir = os.path.join(experiment_logger.experiment.dir,
                                              "checkpoints")
        checkpoint_cb = callbacks.ModelCheckpoint(hparams.checkpoint_dir,
                                                  save_top_k=1)
        trainer = pl.Trainer(logger=experiment_logger,
                             gpus=1 if hparams.device == "cuda" else 0,
                             checkpoint_callback=checkpoint_cb,
                             callbacks=[EmbeddingsCallback()],
                             early_stop_callback=callbacks.EarlyStopping(),
                             fast_dev_run=hparams.debug)
        trainer.fit(model)
    model.freeze()
    baseline_datasets = []
    logger.info("Baselines {}", os.listdir(hparams.baseline_level_dir))
    for i, baseline_level_dir in enumerate(
            sorted(os.listdir(hparams.baseline_level_dir))):
        baseline_dataset = LevelSnippetDataset(
            level_dir=os.path.join(os.getcwd(), hparams.baseline_level_dir,
                                   baseline_level_dir),
            slice_width=model.dataset.slice_width,
            token_list=model.dataset.token_list)
        baseline_datasets.append(baseline_dataset)
    visualize_embeddings(model.dataset, model, "test", hparams, None,
                         baseline_datasets)
Example #8
def run(config):
    if isinstance(config, str):
        with open(config) as f:
            config = yaml.safe_load(f)

    now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    run_dir = path.join("wandb", now)
    run_dir = path.abspath(run_dir)
    os.environ['WANDB_RUN_DIR'] = run_dir

    checkpoint_callback = callbacks.ModelCheckpoint(
        run_dir, monitor=config['early_stopping']['monitor'])
    early_stopping_callback = callbacks.EarlyStopping(
        **config['early_stopping'])

    experiment = Experiment(config)
    trainer = pl.Trainer(logger=False,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         **config['trainer'])
    trainer.fit(experiment)
Example #9
def train_faster_rcnn(train_df, oof_df):
    train_dataset = rcnn.WheatDataset(train_df, transforms=get_train_transforms())
    train_dataloader = DataLoader(train_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=True, num_workers=4, drop_last=True,
                                  collate_fn=collate_fn, pin_memory=True)
    oof_dataset = rcnn.WheatDataset(oof_df, test=True, transforms=get_valid_transforms())
    oof_dataloader = DataLoader(oof_dataset, batch_size=Config.Train.batch_size,
                                shuffle=False, num_workers=4,
                                collate_fn=collate_fn, pin_memory=True)
    # model = FasterRCNNResnet50FPN.load_from_checkpoint('checkpoints\\faster_rcnn\\epoch=9.ckpt', **Config)
    model = FasterRCNNResnet50FPN(conf=Config)
    early_stop = callbacks.EarlyStopping(monitor='val_loss',
                                         patience=20,
                                         mode='min',
                                         verbose=True)
    checkpoint = callbacks.ModelCheckpoint(str(Config.Train.checkpoint_dir),
                                           monitor='val_loss',
                                           verbose=True,
                                           save_top_k=1)
    cbs = [
        callbacks.LearningRateLogger()
    ]
    trainer = Trainer(gpus=1,
                      early_stop_callback=early_stop,
                      checkpoint_callback=checkpoint,
                      callbacks=cbs,
                      benchmark=True,
                      deterministic=True,
                      max_epochs=Config.Train.epochs)
    trainer.fit(model, train_dataloader=train_dataloader,
                val_dataloaders=oof_dataloader)

    valid_dataset = rcnn.WheatDataset(get_data(mode='valid'), test=True, transforms=get_test_transforms())
    valid_dataloader = DataLoader(valid_dataset, batch_size=Config.Train.batch_size,
                                  shuffle=False, num_workers=4,
                                  collate_fn=collate_fn, pin_memory=True)
    trainer.test(model, test_dataloaders=valid_dataloader)
Example #10
        valid_data = DataLoader(valid_dataset,
                                num_workers=8,
                                pin_memory=True,
                                batch_sampler=valid_batch_sampler,
                                collate_fn=_collate_fn)

        test_dataset = LanguageModelingDataset(datasets['test'])
        test_batch_sampler = BPTTBatchSampler(test_dataset, hparams.bptt,
                                              hparams.batch_size)
        test_data = DataLoader(test_dataset,
                               num_workers=8,
                               pin_memory=True,
                               batch_sampler=test_batch_sampler,
                               collate_fn=_collate_fn)

        early_stop_callback = callbacks.EarlyStopping(monitor='val_ppl',
                                                      mode='min')
        model_checkpoint_callback = callbacks.ModelCheckpoint(
            monitor='val_ppl',
            save_last=True,
            save_top_k=5,
            save_weights_only=False,
            mode='min')

        trainer = Trainer.from_argparse_args(
            hparams,
            default_root_dir=os.path.abspath(
                os.path.expanduser("~/data/awd-lstm")),
            callbacks=[
                early_stop_callback, model_checkpoint_callback,
                NNICallback()
            ])
Example #11
def train(config: Config):
    """学習処理の実行スクリプト."""
    pl.seed_everything(config.random_seed)

    # Settings for e.g. resuming training from a previous checkpoint
    cache_dir = pathlib.Path(config.cache_dir)
    cache_dir.mkdir(exist_ok=True)
    trainer_params = dict()
    lastckpt = cache_dir.joinpath("last.ckpt")
    if config.resume:
        trainer_params["resume_from_checkpoint"] = str(lastckpt)
    elif lastckpt.exists():
        lastckpt.unlink()
    for filepath in cache_dir.glob("epoch*.ckpt"):
        filepath.unlink()

    # Logging setup
    pl_logger = pl_loggers.MLFlowLogger(
        experiment_name=config.experiment_name,
        tracking_uri=os.environ.get("MLFLOW_TRACKING_URI", None),
        tags={
            "mlflow.source.name": pathlib.Path(__file__).name,
            "mlflow.source.git.commit": ut.get_commit_id(),
        },
    )

    # Fetch the network and dataset, then train
    network = tv_models.vgg16(pretrained=False)
    params = dc.asdict(config)
    model = Trainer(network, **params)

    callbacks: t.List[t.Any] = list()
    model_checkpoint = pl_callbacks.ModelCheckpoint(
        filepath=str(cache_dir),
        monitor="val_loss",
        save_last=True,
        save_top_k=config.save_top_k,
        save_weights_only=config.save_weights_only,
        mode="min",
        period=1,
    )
    callbacks.append(model_checkpoint)
    if config.early_stop:
        callbacks.append(
            pl_callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=3,
                verbose=False,
                mode="auto",
            ))

    pl_trainer = pl.Trainer(
        default_root_dir=str(cache_dir),
        fast_dev_run=False,
        min_epochs=config.min_epochs,
        max_epochs=config.max_epochs,
        gpus=[0] if config.use_gpu and cuda.is_available() else None,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        profiler=config.profiler,
        callbacks=callbacks,
        logger=pl_logger,
        log_gpu_memory=True,
        **trainer_params,
    )
    datamodule = dataset_food101.Food101WithLableModule(
        batch_size=config.batch_size,
        num_workers=config.num_workers,
    )
    pl_trainer.fit(model, datamodule)

    # Attach additional information to the logged run
    mlf_client = mlflow.tracking.MlflowClient()
    for ckptfile in cache_dir.glob("epoch*.ckpt"):
        model = model.load_from_checkpoint(str(ckptfile), network, **params)
        with tempfile.TemporaryDirectory() as dname:
            mlf_model_path = pathlib.Path(dname).joinpath(ckptfile.stem)
            mlf_pytorch.save_model(model.network, mlf_model_path)
            mlf_client.log_artifact(pl_logger.run_id, mlf_model_path)
Example #12
    if earlystopping_tracking in ['val_loss']:
        earlystopping_mode = 'min'
        earlystopping_min_delta = 0.0001
    elif earlystopping_tracking in ['val_epoch_F1', 'val_epoch_auPRC']:
        earlystopping_mode = 'max'
        earlystopping_min_delta = 0.001
    else:
        raise ValueError(f"unsupported earlystopping_tracking: {earlystopping_tracking!r}")
    checkpoint_callback = pl_callbacks.ModelCheckpoint(dirpath=save_model_folder,
                                                       mode=earlystopping_mode,
                                                       monitor=earlystopping_tracking,
                                                       save_top_k=1, save_last=True)
    earlystop_callback = pl_callbacks.EarlyStopping(earlystopping_tracking,
                                                    verbose=True,
                                                    mode=earlystopping_mode,
                                                    min_delta=earlystopping_min_delta,
                                                    patience=10)

    trainer = Trainer(gpus=[gpus],
                      accelerator=None,
                      max_epochs=200, min_epochs=5,
                      default_root_dir=save_folder,
                      fast_dev_run=False,
                      check_val_every_n_epoch=1,
                      callbacks=[checkpoint_callback, earlystop_callback])
    trainer.fit(model, datamodule=datamodule)

Example #13
from model.trainer import Train_GraphDialogRe
from utils.data_reader import Vocab

if __name__ == "__main__":
    seed_everything(config.seed)
    dgl.random.seed(config.seed)

    model = Train_GraphDialogRe(config)

    logger = loggers.TensorBoardLogger(save_dir=config.save_dir)
    checkpoint_args = dict(
        monitor='eval_f1',
        mode='max',
    )
    early_stopping = callbacks.EarlyStopping(patience=5,
                                             strict=True,
                                             verbose=True,
                                             **checkpoint_args)
    ckpt_callback = callbacks.ModelCheckpoint(
        filepath=os.path.join(
            logger.log_dir,
            '{epoch}-{val_loss:.4f}-{eval_f1:.4f}-{eval_T2:.3f}'
        ),  # same path with logdir
        save_top_k=1,
        verbose=True,
        prefix='',
        **checkpoint_args,
    )

    trainer_args = dict(
        gpus=config.gpus,
        num_nodes=config.num_nodes,