Example no. 1
def main(conf):
    train_set = PodcastMixDataloader(
        csv_dir=conf["data"]["train_dir"],
        sample_rate=conf["data"]["sample_rate"],
        original_sample_rate=conf["data"]["original_sample_rate"],
        segment=conf["data"]["segment"],
        shuffle_tracks=True,
        multi_speakers=conf["training"]["multi_speakers"])
    val_set = PodcastMixDataloader(
        csv_dir=conf["data"]["valid_dir"],
        sample_rate=conf["data"]["sample_rate"],
        original_sample_rate=conf["data"]["original_sample_rate"],
        segment=conf["data"]["segment"],
        shuffle_tracks=True,
        multi_speakers=conf["training"]["multi_speakers"])
    train_loader = DataLoader(train_set,
                              shuffle=True,
                              batch_size=conf["training"]["batch_size"],
                              num_workers=conf["training"]["num_workers"],
                              drop_last=True,
                              pin_memory=True)
    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=conf["training"]["batch_size"],
                            num_workers=conf["training"]["num_workers"],
                            drop_last=True,
                            pin_memory=True)

    if (conf["model"]["name"] == "ConvTasNet"):
        sys.path.append('ConvTasNet_model')
        from conv_tasnet_norm import ConvTasNetNorm
        conf["masknet"].update({"n_src": conf["data"]["n_src"]})
        model = ConvTasNetNorm(**conf["filterbank"],
                               **conf["masknet"],
                               sample_rate=conf["data"]["sample_rate"])
        loss_func = LogL2Time()
        plugins = None
    elif (conf["model"]["name"] == "UNet"):
        # UNet with logl2 time loss and normalization inside model
        sys.path.append('UNet_model')
        from unet_model import UNet
        model = UNet(conf["data"]["sample_rate"], conf["data"]["fft_size"],
                     conf["data"]["hop_size"], conf["data"]["window_size"],
                     conf["convolution"]["kernel_size"],
                     conf["convolution"]["stride"])
        loss_func = LogL2Time()
        plugins = DDPPlugin(find_unused_parameters=False)
    optimizer = make_optimizer(model.parameters(), **conf["optim"])
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                      factor=0.5,
                                      patience=5)

    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["model"]["name"] + "_model/" + conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    system = System(model=model,
                    loss_func=loss_func,
                    optimizer=optimizer,
                    train_loader=train_loader,
                    val_loader=val_loader,
                    scheduler=scheduler,
                    config=conf)

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(checkpoint_dir,
                                 monitor="val_loss",
                                 mode="min",
                                 save_top_k=5,
                                 verbose=True)
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(
            EarlyStopping(monitor="val_loss",
                          mode="min",
                          patience=100,
                          verbose=True))

    # Don't ask for GPUs if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "ddp" if torch.cuda.is_available() else None
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        gradient_clip_val=5.0,
        resume_from_checkpoint=conf["main_args"]["resume_from"],
        precision=32,
        plugins=plugins)
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        print(best_k)
        json.dump(best_k, f, indent=0)
    print(checkpoint.best_model_path)
    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()

    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
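
# A minimal sketch of how the artifacts saved above could be reloaded later; the
# JSON layout ({checkpoint_path: val_loss}) follows the dump a few lines up, and
# the helper name is only illustrative.
def load_best_checkpoint(exp_dir):
    with open(os.path.join(exp_dir, "best_k_models.json")) as f:
        best_k = json.load(f)
    best_path = min(best_k, key=best_k.get)  # lowest val_loss, since mode="min"
    return torch.load(best_path, map_location="cpu")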
Example no. 2
    def __call__(self, trial):
        torch.cuda.empty_cache()

        trial.set_user_attr('fold', self.fold)
        kwargs = self.suggestions(trial)

        start = datetime.now()
        print(f"Training on fold {self.fold}")
        train_ds, val_ds, input_cols, cond_cols = read(self.path, self.exp,
                                                       self.fold, self.subset)
        cols = list(np.concatenate((input_cols, cond_cols, [self.target])))
        train = train_ds.to_table(columns=cols).to_pandas()
        val = val_ds.to_table(columns=cols).to_pandas()
        # DataModule
        dm = CTRPDataModule(train, val, input_cols, cond_cols, self.target,
                            kwargs['batch_size'])
        del train, val
        print(f"Time elapsed loading data: {datetime.now()-start}")
        # Model
        model = ConditionalNetwork(
            n_blocks=kwargs['n_blocks'],
            exp=self.exp,
            inputs_sz=len(dm.input_cols),
            conds_sz=len(dm.cond_cols),
            inputs_emb_layers=kwargs['inputs_emb_layers'],
            conds_emb_layers=kwargs['conds_emb_layers'],
            film_layers=kwargs['film_layers'],
            linear_layers=kwargs['linear_layers'],
            ps_emb=kwargs['ps_emb'],
            ps_film=kwargs['ps_film'],
            ps_linear=kwargs['ps_linear'],
            learning_rate=kwargs['learning_rate'],
            weight_decay=kwargs['weight_decay'],
            batch_size=kwargs['batch_size'])
        # Callbacks
        logger = TensorBoardLogger(
            save_dir=self.logs,
            version=f"trial{trial.number}_{self.exp}_fold_{self.fold}",
            name='model_logs')
        early_stop = EarlyStopping(monitor='val_r2',
                                   min_delta=0.0001,
                                   patience=12,
                                   verbose=False,
                                   mode='max')
        # Trainer
        trainer = Trainer(
            default_root_dir=logger.log_dir,  # in order to avoid lr_find_temp.ckpt conflicts
            auto_lr_find=False,
            auto_scale_batch_size=False,
            max_epochs=self.epochs,
            gpus=self.gpu,
            accelerator=self.accelerator,
            logger=logger,
            callbacks=[
                PyTorchLightningPruningCallback(trial, monitor="val_r2"),
                early_stop
            ],
            flush_logs_every_n_steps=200,
            precision=32,
            profiler="simple",
            deterministic=True)
        trainer.fit(model, dm)

        # save and clean up gpu
        r2 = trainer.callback_metrics["val_r2"].item()
        del dm, model, trainer
        torch.cuda.empty_cache()

        print("Completed fold {} in {}".format(self.fold,
                                               str(datetime.now() - start)))
        print(f'Fold val_r2: {r2}')

        return r2
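
# For context, a hedged sketch of how an Optuna objective like the one above is
# typically driven; the study settings and the `Objective(...)` constructor are
# illustrative assumptions, not taken from the original project.
import optuna

study = optuna.create_study(
    direction="maximize",  # the objective returns val_r2, so higher is better
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
# objective = Objective(path, exp, fold, subset, ...)  # hypothetical constructor
# study.optimize(objective, n_trials=50)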
Example no. 3
def main(hparams):
    """
    Trains the Lightning model as specified in `hparams`
    """
    # make sure every model replica in multi-GPU training starts with the same weights
    seed = 1234567
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    model = Lightning_Unet(hparams)
    if COMPUTECANADA:
        cur_path = Path(__file__).resolve().parent
        default_root_dir = cur_path
        checkpoint_file = cur_path / "checkpoint/{epoch}-{val_dice:.5f}"
        if not os.path.exists(cur_path / "checkpoint"):
            os.mkdir(cur_path / "checkpoint")
    else:
        default_root_dir = "./log"
        if not os.path.exists(default_root_dir):
            os.mkdir(default_root_dir)
        checkpoint_file = "./log/checkpoint"
        if not os.path.exists(checkpoint_file):
            os.mkdir(checkpoint_file)
        checkpoint_file = Path(checkpoint_file) / "{epoch}-{val_dice:.2f}"

    # After training finishes, use best_model_path to retrieve the path to the best
    # checkpoint file and best_model_score to retrieve its score.
    checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_file,
        save_top_k=3,
        verbose=True,
        # monitor='val_dice',
        mode='max',
        prefix='',
        save_weights_only=False,
        # makes it possible to save the checkpoint several times within one epoch
        period=-1,
    )

    early_stop_callback = EarlyStopping(
        # monitor='val_loss',
        min_delta=0.00,
        patience=300,
        strict=True,
        verbose=False,
        mode='max')

    tb_logger = loggers.TensorBoardLogger(hparams.TensorBoardLogger)

    trainer = Trainer(
        gpus=hparams.gpus,
        num_nodes=hparams.nodes,
        distributed_backend='ddp',
        # the next two options can be used together in a straightforward way
        val_check_interval=0.5,
        # check_val_every_n_epoch=3,
        # log every k batches instead
        row_log_interval=10,
        # set the interval at which you want to log using this trainer flag.
        log_save_interval=10,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        callbacks=[LearningRateLogger()],
        # runs 1 train, val, test batch and the program ends
        fast_dev_run=hparams.fast_dev_run,
        default_root_dir=default_root_dir,
        logger=tb_logger,
        max_epochs=10000,
        # this needs to be a string
        # resume_from_checkpoint=str(Path(__file__).resolve().parent / "checkpoint" / hparams.checkpoint_file),
        profiler=True,
        auto_lr_find=False,
        # simulate a larger batch size for gradient descent to provide a good estimate
        # accumulate_grad_batches=4,
    )

    # if COMPUTECANADA:
    #     pickle.dumps(model)
    # lr_finder = trainer.lr_find(model)
    #
    # # Plot with
    # fig = lr_finder.plot(suggest=True)
    # fig.show()
    #
    # # Pick point based on plot, or get suggestion
    # new_lr = lr_finder.suggestion()
    # print(f"recommend learning_rate: {new_lr}")
    # model.hparams.learning_rate = new_lr

    trainer.fit(model)
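
# Sketch only: in recent PyTorch Lightning versions the manual seeding block at
# the top of main() can be collapsed into a single helper call; the cudnn flags
# still need to be set explicitly if full determinism is required.
import pytorch_lightning as pl

pl.seed_everything(1234567)  # seeds Python's random, NumPy and torch (CPU and CUDA)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True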
def test_early_stopping_mode_options():
    with pytest.raises(MisconfigurationException, match="`mode` can be .* got unknown_option"):
        EarlyStopping(mode="unknown_option")
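
# Sketch of valid configurations for the check exercised above: recent
# PyTorch Lightning releases require `mode` to be either "min" or "max"
# (older releases, as in some of the snippets below, also accepted "auto").
stop_on_loss = EarlyStopping(monitor="val_loss", mode="min", patience=5)
stop_on_acc = EarlyStopping(monitor="val_acc", mode="max", patience=5)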
Example no. 5
def main(conf):
    train_set = WhamDataset(
        conf["data"]["train_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        segment=conf["data"]["segment"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )
    val_set = WhamDataset(
        conf["data"]["valid_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )

    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    # Update the number of sources (it depends on the task)
    conf["masknet"].update({"n_src": train_set.n_src})

    model = DPTNet(**conf["filterbank"], **conf["masknet"])
    optimizer = make_optimizer(model.parameters(), **conf["optim"])
    from asteroid.engine.schedulers import DPTNetScheduler

    schedulers = {
        "scheduler": DPTNetScheduler(
            optimizer, len(train_loader) // conf["training"]["batch_size"], 64
        ),
        "interval": "step",
    }

    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    # Define Loss function.
    loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")
    system = System(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        scheduler=schedulers,
        train_loader=train_loader,
        val_loader=val_loader,
        config=conf,
    )

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(
        checkpoint_dir, monitor="val_loss", mode="min", save_top_k=5, verbose=True
    )
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(EarlyStopping(monitor="val_loss", mode="min", patience=30, verbose=True))

    # Don't ask for GPUs if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "ddp" if torch.cuda.is_available() else None
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        gradient_clip_val=conf["training"]["gradient_clipping"],
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)

    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()

    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
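
# The scheduler dict above ({"scheduler": ..., "interval": "step"}) relies on the
# Lightning convention for per-step schedulers. A generic sketch of the same
# convention in a plain LightningModule (class name and hyperparameters are
# illustrative; this is not asteroid's System class):
class SchedulerDemo(pl.LightningModule):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.98)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }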
Example no. 6
def main(conf):
    train_dirs = [
        conf["data"]["train_dir"].format(n_src)
        for n_src in conf["masknet"]["n_srcs"]
    ]
    valid_dirs = [
        conf["data"]["valid_dir"].format(n_src)
        for n_src in conf["masknet"]["n_srcs"]
    ]
    train_set = Wsj0mixVariable(
        json_dirs=train_dirs,
        n_srcs=conf["masknet"]["n_srcs"],
        sample_rate=conf["data"]["sample_rate"],
        seglen=conf["data"]["seglen"],
        minlen=conf["data"]["minlen"],
    )
    val_set = Wsj0mixVariable(
        json_dirs=valid_dirs,
        n_srcs=conf["masknet"]["n_srcs"],
        sample_rate=conf["data"]["sample_rate"],
        seglen=conf["data"]["seglen"],
        minlen=conf["data"]["minlen"],
    )
    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
        collate_fn=_collate_fn,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
        collate_fn=_collate_fn,
    )
    model, optimizer = make_model_and_optimizer(
        conf, sample_rate=conf["data"]["sample_rate"])
    scheduler = []
    if conf["training"]["half_lr"]:
        scheduler.append(
            ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5))
    if conf["training"]["lr_decay"]:
        scheduler.append(ExponentialLR(optimizer=optimizer, gamma=0.99))
    exp_dir = conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)
    loss_func = WeightedPITLoss(n_srcs=conf["masknet"]["n_srcs"],
                                lamb=conf["loss"]["lambda"])
    # Put together in System
    system = VarSpkrSystem(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(
        dirpath=checkpoint_dir,
        filename="{epoch}-{step}",
        monitor="avg_sdr",
        mode="max",
        save_top_k=5,
        verbose=True,
    )
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(
            EarlyStopping(monitor="avg_sdr",
                          mode="max",
                          patience=30,
                          verbose=True))

    # Don't ask for GPUs if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "dp" if torch.cuda.is_available() else None

    # Train model
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        limit_train_batches=1.0,  # Useful for fast experiment
        gradient_clip_val=200,
        resume_from_checkpoint=conf["main_args"]["resume_from"],
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)
    # Save last model for convenience
    torch.save(system.model.state_dict(),
               os.path.join(exp_dir, "final_model.pth"))
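
# `_collate_fn` is provided by the variable-source wsj0-mix recipe. As a rough
# illustration only (not the actual implementation), a collate function for
# variable-length mixtures with a variable number of sources might look like:
def pad_collate(batch):
    # assumes each item is a (mixture_tensor, list_of_source_tensors) pair
    mixtures, sources = zip(*batch)
    max_len = max(m.shape[-1] for m in mixtures)
    padded = torch.stack(
        [torch.nn.functional.pad(m, (0, max_len - m.shape[-1])) for m in mixtures])
    return padded, list(sources)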
            return
        self._epoch_end()

    def on_train_end(self) -> None:
        assert self.trainer.current_epoch == self.expected_end_epoch, 'Early Stopping Failed'


_ES_CHECK = dict(check_on_train_epoch_end=True)
_ES_CHECK_P3 = dict(patience=3, check_on_train_epoch_end=True)
_NO_WIN = dict(marks=RunIf(skip_windows=True))


@pytest.mark.parametrize(
    "callbacks, expected_stop_epoch, check_on_train_epoch_end, accelerator, num_processes",
    [
        ([EarlyStopping('abc'), EarlyStopping('cba', patience=3)], 3, False, None, 1),
        ([EarlyStopping('cba', patience=3), EarlyStopping('abc')], 3, False, None, 1),
        pytest.param([EarlyStopping('abc'), EarlyStopping('cba', patience=3)], 3, False, 'ddp_cpu', 2, **_NO_WIN),
        pytest.param([EarlyStopping('cba', patience=3), EarlyStopping('abc')], 3, False, 'ddp_cpu', 2, **_NO_WIN),
        ([EarlyStopping('abc', **_ES_CHECK), EarlyStopping('cba', **_ES_CHECK_P3)], 3, True, None, 1),
        ([EarlyStopping('cba', **_ES_CHECK_P3), EarlyStopping('abc', **_ES_CHECK)], 3, True, None, 1),
        pytest.param([EarlyStopping('abc', **_ES_CHECK),
                      EarlyStopping('cba', **_ES_CHECK_P3)], 3, True, 'ddp_cpu', 2, **_NO_WIN),
        pytest.param([EarlyStopping('cba', **_ES_CHECK_P3),
                      EarlyStopping('abc', **_ES_CHECK)], 3, True, 'ddp_cpu', 2, **_NO_WIN),
    ],
)
def test_multiple_early_stopping_callbacks(
    tmpdir,
    callbacks: List[EarlyStopping],
    expected_stop_epoch: int,
Example no. 8
def train(cfg, model, transforms, train_data, val_data):
    gc.collect()
    pl.seed_everything(cfg.seed)
    # init logger
    loggers = []
    # init logger
    if cfg.logger == 'csv':
        logger = pl.loggers.CSVLogger(save_dir='logs/', name=cfg.model_type)
        loggers.append(logger)
    elif cfg.logger == 'tensorboard':
        logger = pl.loggers.TensorBoardLogger('tb_logs', name=cfg.model_type)
        loggers.append(logger)
    elif cfg.logger == 'wandb':
        logger = pl.loggers.WandbLogger(project='Plant Pathology 2021 - FGVC8')
        loggers.append(logger)

    elif cfg.logger == 'all':
        logger1 = pl.loggers.CSVLogger(save_dir='logs/', name=cfg.model_type)
        logger2 = pl.loggers.TensorBoardLogger('tb_logs', name=cfg.model_type)
        logger3 = pl.loggers.WandbLogger(
            project='Plant Pathology 2021 - FGVC8')
        loggers.extend([logger1, logger2, logger3])
    else:
        pass
    # init callbacks
    ckpt_min_loss = ModelCheckpoint(
        monitor='total_loss',
        save_top_k=cfg.ckpt_save_top_k,
        mode='min',
        save_last=cfg.ckpt_save_last,
        filename=os.path.join(
                "checkpoint", f"min-loss-fold={cfg.fold_i}" + "-{epoch}-{valid_loss:.4f}-{valid_f1:.4f}"),
    )
    ckpt2_max_f1 = ModelCheckpoint(
        monitor='valid_five_f1',
        save_top_k=cfg.ckpt_save_top_k,
        mode='max',
        save_last=cfg.ckpt_save_last,
        filename=os.path.join(
            "checkpoint", f"best-f1-fold={cfg.fold_i}" + "-{epoch}-{valid_loss:.4f}-{valid_f1:.4f}"),
    )
    early = EarlyStopping(
        monitor='valid_five_f1',
        patience=cfg.early_patience,
        mode='max',
        verbose=True
    )

    lr_monitor = LearningRateMonitor('step')

    swa = pl.callbacks.StochasticWeightAveraging()

    # all callbacks
    callbacks = [ckpt_min_loss, ckpt2_max_f1, early, lr_monitor, swa]

    # init model
    if cfg.two_head:
        model = LitPlantModule2(cfg)
    else:
        model = LitPlantModule(cfg)

    # init dataloaders
    dataloaders = generate_dataloaders(cfg, train_data, val_data, transforms)

    # init trainer
    trainer = pl.Trainer(
        fast_dev_run=False,
        gpus=1,
        callbacks=callbacks,
        logger=loggers,
        min_epochs=cfg.min_epochs,
        max_epochs=cfg.max_epochs,
        val_check_interval=0.25,
        progress_bar_refresh_rate=1,
        weights_summary='top',
        precision=cfg.precision,
        # limit_train_batches=1,
        # limit_val_batches=1,
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=int(64 / cfg.batch_size)
    )
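
    # Quick arithmetic behind accumulate_grad_batches above: the effective batch
    # size per optimizer step is batch_size * accumulate_grad_batches, so
    # int(64 / cfg.batch_size) keeps it close to 64 (e.g. 16 * 4 = 64, 32 * 2 = 64).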

    trainer.fit(model=model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    #    print(self.retriever.predict('I am beautiful lady?', ['You are a pretty girl',
    #                                               'apple is tasty',
    #                                               'He is a handsome boy'], True))


if __name__ == '__main__':
    encoder_question = BertEncoder(bert_question, max_question_len_global)
    encoder_paragarph = BertEncoder(bert_paragraph, max_paragraph_len_global)
    ret = Retriver(encoder_question, encoder_paragarph, tokenizer)
    os.makedirs('out', exist_ok=True)
    checkpoint_callback = ModelCheckpoint(
        filepath='out/largebatch-crossentropy-{epoch}-{val_loss:.2f}-{val_acc:.2f}',
        save_top_k=10,
        verbose=True,
        monitor='val_acc',
        mode='max')

    early_stopping = EarlyStopping('val_acc', mode='max')

    trainer = pl.Trainer(gpus=8,
                         distributed_backend='dp',
                         val_check_interval=0.1,
                         min_epochs=1,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=early_stopping)

    ret_trainee = RetriverTrainer(ret)

    trainer.fit(ret_trainee)
Example no. 10
def main():
    logger = logging.getLogger(__name__)
    start_time = datetime.datetime.now()
    model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs), verbose=True)
    train_orig_df, label_enc = load_train_dataframe(training_args.data_train,
                                                    min_class_samples=training_args.min_class_samples)

    # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size"

    # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume.
    train_df, valid_df = train_test_split(train_orig_df, test_size=training_args.test_size,
                                          stratify=train_orig_df.landmark_id, random_state=SEED)
    num_classes = train_df.landmark_id.nunique() if training_args.min_class_samples is None else len(label_enc.classes_)
    logger.info(f'Num classes train: {num_classes}')
    logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}')

    # save checkpoints
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir)

    logger.info('Initializing the model')
    model = LandmarkModel(model_name=model_args.model_name,
                          n_classes=num_classes,
                          loss_module=model_args.loss_module,
                          pooling_name=model_args.pooling_name,
                          args_pooling=model_args.args_pooling,
                          normalize=model_args.normalize,
                          use_fc=model_args.use_fc,
                          fc_dim=model_args.fc_dim,
                          dropout=model_args.dropout
                          )
    logger.info("Model params:")
    logger.info(pformat(model_args))
    lit_module = LandmarksPLBaseModule(hparams={**model_args.__dict__, **training_args.__dict__},
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)
    # init data
    dm = LandmarksDataModule(train_df, valid_df,
                             hparams=training_args,
                             image_dir=training_args.data_path,
                             batch_size=training_args.batch_size,
                             num_workers=training_args.num_workers,
                             use_weighted_sampler=training_args.use_weighted_sampler
                             )
    # train
    dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M")
    wandb_logger = WandbLogger(name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}',
                               save_dir='logs/',
                               project='landmarks')
    checkpoint_callback = ModelCheckpoint(monitor='val_acc',
                                          mode='max',
                                          save_top_k=2,
                                          save_last=True,
                                          verbose=True)
    # hack to change only the filename, without providing the full path (which is generated by W&B)
    checkpoint_callback.filename = '{epoch}-{val_acc:.3f}'
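    # Note: this post-construction assignment targets an older Lightning API; in
    # more recent releases the same pattern can be passed directly as the
    # `filename` constructor argument of ModelCheckpoint (sketch, same settings):
    #     ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=2,
    #                     save_last=True, verbose=True,
    #                     filename='{epoch}-{val_acc:.3f}')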

    early_stopping_callback = EarlyStopping('val_acc', verbose=True, mode='max')

    trainer = pl.Trainer(gpus=training_args.gpus,
                         logger=wandb_logger,
                         max_epochs=training_args.n_epochs,
                         val_check_interval=training_args.val_check_interval,
                         checkpoint_callback=checkpoint_callback,
                         progress_bar_refresh_rate=100,
                         resume_from_checkpoint=training_args.resume_checkpoint,
                         gradient_clip_val=training_args.gradient_clip_val,
                         accumulate_grad_batches=training_args.accumulate_grad_batches,
                         early_stop_callback=early_stopping_callback,
                         # fast_dev_run=True,
                         # limit_train_batches=5,
                         # limit_val_batches=5
                         )
    trainer.fit(lit_module, datamodule=dm)

    try:
        training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(wandb_logger)
        logger.info(f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}')
    except Exception:
        logger.warning(f'Unable to get current checkpoints directory, using default one: '
                       f'{training_args.checkpoints_dir}')
    # save checkpoints (saved twice - in default directory above and in wandb current run folder)
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
Example no. 11
def main(hparams) -> None:
    """
    Main training routine specific for this project
    :param hparams:
    """
    set_seed(hparams.seed)
    # ------------------------
    # 1 INIT LIGHTNING MODEL AND DATA
    # ------------------------

    model = Classifier(hparams)
    
    # ------------------------
    # 2 INIT EARLY STOPPING
    # ------------------------
    early_stop_callback = EarlyStopping(
        monitor=hparams.monitor,
        min_delta=0.0,
        patience=hparams.patience,
        verbose=True,
        mode=hparams.metric_mode,
    )

    # ------------------------
    # 3 INIT LOGGERS
    # ------------------------
    # Tensorboard Callback
    tb_logger = TensorBoardLogger(
        save_dir="experiments/",
        version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"),
        name="",
    )

    # Model Checkpoint Callback
    ckpt_path = os.path.join(
        "experiments/", tb_logger.version, "checkpoints",
    )

    # --------------------------------
    # 4 INIT MODEL CHECKPOINT CALLBACK
    # -------------------------------
    checkpoint_callback = ModelCheckpoint(
        filepath=ckpt_path,
        save_top_k=hparams.save_top_k,
        verbose=True,
        monitor=hparams.monitor,
        period=1,
        mode=hparams.metric_mode,
        save_weights_only=True
    )

    # ------------------------
    # 5 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        logger=tb_logger,
        checkpoint_callback=checkpoint_callback,
        # callbacks=early_stop_callback,
        gradient_clip_val=1.0,
        gpus=hparams.gpus,
        log_gpu_memory="all",
        deterministic=True,
        check_val_every_n_epoch=1,
        fast_dev_run=hparams.fast_dev_run,
        accumulate_grad_batches=hparams.accumulate_grad_batches,
        max_epochs=hparams.max_epochs,
        min_epochs=hparams.min_epochs,
        # val_check_interval=hparams.val_check_interval,
        # distributed_backend="None",
    )

    # ------------------------
    # 6 START TRAINING
    # ------------------------
    trainer.fit(model, model.data)
def test_callbacks_and_logger_not_called_with_fastdevrun(tmpdir, fast_dev_run):
    """Test that ModelCheckpoint, EarlyStopping and Logger are turned off with fast_dev_run."""
    class FastDevRunModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.training_step_call_count = 0
            self.training_epoch_end_call_count = 0
            self.validation_step_call_count = 0
            self.validation_epoch_end_call_count = 0
            self.test_step_call_count = 0

        def training_step(self, batch, batch_idx):
            self.log("some_metric", torch.tensor(7.0))
            self.logger.experiment.dummy_log("some_distribution",
                                             torch.randn(7) + batch_idx)
            self.training_step_call_count += 1
            return super().training_step(batch, batch_idx)

        def training_epoch_end(self, outputs):
            self.training_epoch_end_call_count += 1
            super().training_epoch_end(outputs)

        def validation_step(self, batch, batch_idx):
            self.validation_step_call_count += 1
            return super().validation_step(batch, batch_idx)

        def validation_epoch_end(self, outputs):
            self.validation_epoch_end_call_count += 1
            super().validation_epoch_end(outputs)

        def test_step(self, batch, batch_idx):
            self.test_step_call_count += 1
            return super().test_step(batch, batch_idx)

    checkpoint_callback = ModelCheckpoint()
    checkpoint_callback.save_checkpoint = Mock()
    early_stopping_callback = EarlyStopping(monitor="foo")
    early_stopping_callback._evaluate_stopping_criteria = Mock()
    trainer_config = dict(
        default_root_dir=tmpdir,
        fast_dev_run=fast_dev_run,
        val_check_interval=2,
        logger=True,
        log_every_n_steps=1,
        callbacks=[checkpoint_callback, early_stopping_callback],
    )

    def _make_fast_dev_run_assertions(trainer, model):
        # check the call count for train/val/test step/epoch
        assert model.training_step_call_count == fast_dev_run
        assert model.training_epoch_end_call_count == 1
        assert model.validation_step_call_count == (0 if model.validation_step is None else fast_dev_run)
        assert model.validation_epoch_end_call_count == (0 if model.validation_step is None else 1)
        assert model.test_step_call_count == fast_dev_run

        # check trainer arguments
        assert trainer.max_steps == fast_dev_run
        assert trainer.num_sanity_val_steps == 0
        assert trainer.max_epochs == 1
        assert trainer.val_check_interval == 1.0
        assert trainer.check_val_every_n_epoch == 1

        # there should be no logger with fast_dev_run
        assert isinstance(trainer.logger, DummyLogger)

        # checkpoint callback should not have been called with fast_dev_run
        assert trainer.checkpoint_callback == checkpoint_callback
        checkpoint_callback.save_checkpoint.assert_not_called()
        assert not os.path.exists(checkpoint_callback.dirpath)

        # early stopping should not have been called with fast_dev_run
        assert trainer.early_stopping_callback == early_stopping_callback
        early_stopping_callback._evaluate_stopping_criteria.assert_not_called()

    train_val_step_model = FastDevRunModel()
    trainer = Trainer(**trainer_config)
    trainer.fit(train_val_step_model)
    trainer.test(train_val_step_model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    _make_fast_dev_run_assertions(trainer, train_val_step_model)

    # -----------------------
    # also called once with no val step
    # -----------------------
    train_step_only_model = FastDevRunModel()
    train_step_only_model.validation_step = None

    trainer = Trainer(**trainer_config)
    trainer.fit(train_step_only_model)
    trainer.test(train_step_only_model)

    assert trainer.state.finished, f"Training failed with {trainer.state}"
    _make_fast_dev_run_assertions(trainer, train_step_only_model)
Example no. 13
            'num_features': training_input.shape[3],
            'num_timesteps_input': num_timesteps_input,
            'num_timesteps_output': num_timesteps_output,
            'gcn_type': gcn_type,
            'gcn_package': gcn_package,
            'gcn_partition': gcn_partition
        })

    net = WrapperNet(hparams)

    net.init_data(training_input, training_target, val_input, val_target,
                  test_input, test_target)

    net.init_graph(A, edge_index, edge_weight)

    early_stop_callback = EarlyStopping(patience=early_stop_rounds)
    logger = TestTubeLogger(save_dir=log_dir, name=log_name)

    trainer = pl.Trainer(gpus=gpus,
                         max_epochs=epochs,
                         distributed_backend=backend,
                         early_stop_callback=early_stop_callback,
                         logger=logger,
                         track_grad_norm=2)
    trainer.fit(net)

    print('Training time {}'.format(time.time() - start_time))

    # # Currently there are some issues with testing under the ddp setting, so switch it to the dp setting
    # # change the below line with your own checkpoint path
    # net = WrapperNet.load_from_checkpoint('logs/ddp_exp/version_1/checkpoints/_ckpt_epoch_2.ckpt')
Example no. 14
def main(conf):
    train_set = WhamDataset(conf['data']['train_dir'],
                            conf['data']['task'],
                            sample_rate=conf['data']['sample_rate'],
                            nondefault_nsrc=conf['data']['nondefault_nsrc'])
    val_set = WhamDataset(conf['data']['valid_dir'],
                          conf['data']['task'],
                          sample_rate=conf['data']['sample_rate'],
                          nondefault_nsrc=conf['data']['nondefault_nsrc'])

    train_loader = DataLoader(train_set,
                              shuffle=True,
                              batch_size=conf['training']['batch_size'],
                              num_workers=conf['training']['num_workers'],
                              drop_last=True)
    val_loader = DataLoader(val_set,
                            shuffle=False,
                            batch_size=conf['training']['batch_size'],
                            num_workers=conf['training']['num_workers'],
                            drop_last=True)
    # Update the number of sources (it depends on the task)
    conf['masknet'].update({'n_src': train_set.n_src})

    # Define model and optimizer
    model = ConvTasNet(**conf['filterbank'], **conf['masknet'])
    optimizer = make_optimizer(model.parameters(), **conf['optim'])
    # Define scheduler
    scheduler = None
    if conf['training']['half_lr']:
        scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                      factor=0.5,
                                      patience=5)
    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf['main_args']['exp_dir']
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, 'conf.yml')
    with open(conf_path, 'w') as outfile:
        yaml.safe_dump(conf, outfile)

    # Define Loss function.
    loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from='pw_mtx')
    system = System(model=model,
                    loss_func=loss_func,
                    optimizer=optimizer,
                    train_loader=train_loader,
                    val_loader=val_loader,
                    scheduler=scheduler,
                    config=conf)

    # Define callbacks
    checkpoint_dir = os.path.join(exp_dir, 'checkpoints/')
    checkpoint = ModelCheckpoint(checkpoint_dir,
                                 monitor='val_loss',
                                 mode='min',
                                 save_top_k=5,
                                 verbose=1)
    early_stopping = False
    if conf['training']['early_stop']:
        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=10,
                                       verbose=1)

    # Don't ask for GPUs if they are not available.
    if not torch.cuda.is_available():
        print('No available GPUs were found, setting gpus to None')
        conf['main_args']['gpus'] = None
    trainer = pl.Trainer(
        max_epochs=conf['training']['epochs'],
        checkpoint_callback=checkpoint,
        early_stop_callback=early_stopping,
        default_save_path=exp_dir,
        gpus=conf['main_args']['gpus'],
        distributed_backend='dp',
        train_percent_check=1.0,  # Useful for fast experiment
        gradient_clip_val=5.)
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)

    # Save best model (next PL version will make this easier)
    best_path = [b for b, v in best_k.items() if v == min(best_k.values())][0]
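    # Note: equivalent to `checkpoint.best_model_path` (as used in Example no. 1),
    # since the checkpoint callback monitors val_loss with mode='min'.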
    state_dict = torch.load(best_path)
    system.load_state_dict(state_dict=state_dict['state_dict'])
    system.cpu()

    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, 'best_model.pth'))
Example no. 15
    def validation_epoch_end(self, outputs):
        losses = [8, 4, 2, 3, 4, 5, 8, 10]
        val_loss = losses[self.current_epoch]
        self.log('abc', torch.tensor(val_loss))
        self.log('cba', torch.tensor(0))

    def on_train_end(self) -> None:
        assert self.trainer.current_epoch == self.expected_end_epoch, 'Early Stopping Failed'


@pytest.mark.parametrize(
    "callbacks, expected_stop_epoch, accelerator, num_processes",
    [
        ([
            EarlyStopping(monitor='abc'),
            EarlyStopping(monitor='cba', patience=3)
        ], 3, None, 1),
        ([
            EarlyStopping(monitor='cba', patience=3),
            EarlyStopping(monitor='abc')
        ], 3, None, 1),
        pytest.param([
            EarlyStopping(monitor='abc'),
            EarlyStopping(monitor='cba', patience=3)
        ],
                     3,
                     'ddp_cpu',
                     2,
                     marks=RunIf(skip_windows=True)),
        pytest.param([
Example no. 16
def main(conf):
    train_set = WhamDataset(conf['data']['train_dir'], conf['data']['task'],
                            sample_rate=conf['data']['sample_rate'],
                            nondefault_nsrc=conf['data']['nondefault_nsrc'])
    val_set = WhamDataset(conf['data']['valid_dir'], conf['data']['task'],
                          sample_rate=conf['data']['sample_rate'],
                          nondefault_nsrc=conf['data']['nondefault_nsrc'])

    train_loader = DataLoader(train_set, shuffle=True,
                              batch_size=conf['data']['batch_size'],
                              num_workers=conf['data']['num_workers'],
                              drop_last=True)
    val_loader = DataLoader(val_set, shuffle=True,
                            batch_size=conf['data']['batch_size'],
                            num_workers=conf['data']['num_workers'],
                            drop_last=True)
    # Update the number of sources (it depends on the task)
    conf['masknet'].update({'n_src': train_set.n_src})

    # Define model and optimizer in a local function (defined in the recipe).
    # Two advantages to this: re-instantiating the model and optimizer for
    # retraining and for evaluation is straightforward.
    model, optimizer = make_model_and_optimizer(conf)
    # Define scheduler
    scheduler = None
    if conf['training']['half_lr']:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5,
                                      patience=5)
    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf['main_args']['exp_dir']
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, 'conf.yml')
    with open(conf_path, 'w') as outfile:
        yaml.safe_dump(conf, outfile)

    # Define Loss function.
    loss_func = PITLossWrapper(pairwise_neg_sisdr, mode='pairwise')
    system = System(model=model, loss_func=loss_func, optimizer=optimizer,
                    train_loader=train_loader, val_loader=val_loader,
                    scheduler=scheduler, config=conf)

    # Define callbacks
    checkpoint_dir = os.path.join(exp_dir, 'checkpoints/')
    checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss',
                                 mode='min', save_top_k=5, verbose=1)
    early_stopping = False
    if conf['training']['early_stop']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                       verbose=1)

    # Don't ask for GPUs if they are not available.
    if not torch.cuda.is_available():
        print('No available GPUs were found, setting gpus to None')
        conf['main_args']['gpus'] = None
    trainer = pl.Trainer(max_nb_epochs=conf['training']['epochs'],
                         checkpoint_callback=checkpoint,
                         early_stop_callback=early_stopping,
                         default_save_path=exp_dir,
                         gpus=conf['main_args']['gpus'],
                         distributed_backend='dp',
                         train_percent_check=1.0,  # Useful for fast experiment
                         gradient_clip_val=5.)
    trainer.fit(system)

    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        # Convert tensor values to floats so the dict is JSON-serializable.
        best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
        json.dump(best_k, f, indent=0)
Example no. 17
)

validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True,
                                          batch_size=batch_size,
                                          num_workers=2)
val_dataloader = validation.to_dataloader(train=False,
                                          batch_size=batch_size,
                                          num_workers=2)

early_stop_callback = EarlyStopping(monitor="val_loss",
                                    min_delta=1e-4,
                                    patience=10,
                                    verbose=False,
                                    mode="min")
trainer = pl.Trainer(
    max_epochs=100,
    gpus=0,
    weights_summary="top",
    gradient_clip_val=0.1,
    early_stop_callback=early_stop_callback,
    limit_train_batches=15,
    # limit_val_batches=1,
    # fast_dev_run=True,
    # logger=logger,
    # profiler=True,
)
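
# The snippet stops after building the Trainer. With pytorch-forecasting the
# usual next step is to build a model from the training dataset and fit it with
# the two dataloaders; a hedged sketch (the model choice and hyperparameters are
# assumptions, not taken from the original script):
from pytorch_forecasting import TemporalFusionTransformer

tft = TemporalFusionTransformer.from_dataset(training, learning_rate=0.03, hidden_size=16)
trainer.fit(tft, train_dataloader=train_dataloader, val_dataloaders=val_dataloader)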
Example no. 18
        return self._val_dataloader


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
            "--data-dir",
            type=str,
            required=True
            )
    args = parser.parse_args()

    early_stop_callback = EarlyStopping(
            monitor="val_loss",
            min_delta=0.0,
            patience=1,
            verbose=True,
            mode="min",
            )

    trainer = pl.Trainer(
             gpus=1,
             early_stop_callback=early_stop_callback,
             # train_percent_check=0.001,
             # val_percent_check=0.001,
             # max_nb_epochs=1
             )

    model = Model(args)

    trainer.fit(model)
Example no. 19
def main(hparams):
    neptune_logger = NeptuneLogger(
        api_key=
        "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIsImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa2V5IjoiN2I2ZWM0NmQtNjg0NS00ZjM5LTkzNTItN2I4Nzc0YTUzMmM0In0=",
        project_name="hirune924/kaggle-PANDA",
        close_after_fit=False,
        upload_source_files=['*.py', '*.ipynb'],
        params=vars(hparams),
        experiment_name=hparams.experiment_name,  # Optional,
        #tags=["pytorch-lightning", "mlp"]  # Optional,
    )
    '''
    comet_logger = CometLogger(
        api_key="QCxbRVX2qhQj1t0ajIZl2nk2c",
        workspace='hirune924',  # Optional
        save_dir='.',  # Optional
        project_name="kaggle-panda",  # Optional
        #rest_api_key=os.environ.get('COMET_REST_API_KEY'),  # Optional
        #experiment_name='default'  # Optional
    )'''
    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.log_dir,
                                          name='default',
                                          version=None)

    logger_list = ([tb_logger, neptune_logger]
                   if hparams.distributed_backend != 'ddp' else tb_logger)

    checkpoint_callback = ModelCheckpoint(filepath=os.path.join(
        hparams.log_dir, '{epoch}-{avg_val_loss}-{val_qwk}'),
                                          save_top_k=10,
                                          verbose=True,
                                          monitor='avg_val_loss',
                                          mode='min',
                                          save_weights_only=True,
                                          period=1)

    # default used by the Trainer
    early_stop_callback = EarlyStopping(monitor='avg_val_loss',
                                        patience=20,
                                        min_delta=0.0,
                                        strict=True,
                                        verbose=True,
                                        mode='min')

    seg_model = get_seg_model_from_name(model_name=hparams.seg_model_name,
                                        in_channels=5,
                                        num_classes=2,
                                        pretrained=True)
    seg_ckpt_pth = glob.glob(
        os.path.join(hparams.seg_ckpt_dir,
                     'fold' + str(hparams.fold) + '*.ckpt'))
    seg_model = load_pytorch_model(seg_ckpt_pth[0], seg_model)
    if hparams.marge_type == 'cat':
        in_channels = 7
    elif hparams.marge_type == 'add':
        in_channels = 3
    cls_model = get_cls_model_from_name(model_name=hparams.cls_model_name,
                                        in_channels=in_channels,
                                        num_classes=1,
                                        pretrained=True)
    pl_model = PLImageSegmentationClassificationSystem(seg_model, cls_model,
                                                       hparams)

    ###
    if hparams.auto_lr_find:
        trainer = Trainer()
        lr_finder = trainer.lr_find(pl_model)
        print(lr_finder.results)
        print(lr_finder.suggestion())
        pl_model.learning_rate = lr_finder.suggestion()


###

    trainer = Trainer(gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      min_epochs=hparams.min_epochs,
                      max_steps=None,
                      min_steps=None,
                      checkpoint_callback=checkpoint_callback,
                      early_stop_callback=early_stop_callback,
                      logger=logger_list,
                      accumulate_grad_batches=1,
                      precision=hparams.precision,
                      amp_level='O1',
                      auto_lr_find=False,
                      benchmark=True,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      distributed_backend=hparams.distributed_backend,
                      num_nodes=1,
                      fast_dev_run=False,
                      gradient_clip_val=0.0,
                      log_gpu_memory=None,
                      log_save_interval=100,
                      num_sanity_val_steps=5,
                      overfit_pct=0.0)

    # fit model !
    trainer.fit(pl_model)
Example no. 20
num_workers = 4

DATA_DIR = "C:\\Users\\ahrn1e19\\multiclass\\image_multiclass"
root_path = 'H:\\MSc-project'
checkpoint_callback = ModelCheckpoint(
    filepath=root_path,
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

early_stop_callback = EarlyStopping(
   monitor='val_loss',
   min_delta=0.00,
   patience=10,
   verbose=False,
   mode='auto'
)



model = torch.hub.load('pytorch/vision:v0.6.0', 'inception_v3', num_classes=5, aux_logits=False, transform_input=False, pretrained=False)
model.train()

# model
pl_model = InceptionV3(model)

# most basic trainer, uses good defaults
trainer = Trainer(default_root_dir=root_path, gpus=1, max_epochs=35,
                  checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback)
def test_min_steps_override_early_stopping_functionality(tmpdir, step_freeze: int, min_steps: int, min_epochs: int):
    """Excepted Behaviour:
    IF `min_steps` was set to a higher value than the `trainer.global_step` when `early_stopping` is being triggered,
    THEN the trainer should continue until reaching `trainer.global_step` == `min_steps`, and stop.

    IF `min_epochs` resulted in a higher number of steps than the `trainer.global_step`
        when `early_stopping` is being triggered,
    THEN the trainer should continue until reaching
        `trainer.global_step` == `min_epochs * len(train_dataloader)`, and stop.
    This test validates this expected behaviour.

    IF both `min_epochs` and `min_steps` are provided and higher than the `trainer.global_step`
        when `early_stopping` is being triggered,
    THEN the highest between `min_epochs * len(train_dataloader)` and `min_steps` would be reached.

    Caveat: IF min_steps is divisible by len(train_dataloader), then it will do min_steps + len(train_dataloader)

    This test validates those expected behaviours.
    """

    _logger.disabled = True

    original_loss_value = 10
    limit_train_batches = 3
    patience = 3

    class Model(BoringModel):

        def __init__(self, step_freeze):
            super(Model, self).__init__()

            self._step_freeze = step_freeze

            self._loss_value = 10.0
            self._eps = 1e-1
            self._count_decrease = 0
            self._values = []

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            return {"loss": loss}

        def validation_step(self, batch, batch_idx):
            return {"test_val_loss": self._loss_value}

        def validation_epoch_end(self, outputs):
            _mean = np.mean([x['test_val_loss'] for x in outputs])
            if self.trainer.global_step <= self._step_freeze:
                self._count_decrease += 1
                self._loss_value -= self._eps
            self._values.append(_mean)
            self.log('test_val_loss', _mean)

    model = Model(step_freeze)
    model.training_step_end = None
    model.test_dataloader = None
    early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[early_stop_callback],
        limit_train_batches=limit_train_batches,
        limit_val_batches=2,
        min_steps=min_steps,
        min_epochs=min_epochs,
    )
    trainer.fit(model)

    # Make sure loss was properly decreased
    assert abs(original_loss_value - (model._count_decrease) * model._eps - model._loss_value) < 1e-6

    pos_diff = (np.diff(model._values) == 0).nonzero()[0][0]

    # Compute when the latest validation epoch end happened
    latest_validation_epoch_end = (pos_diff // limit_train_batches) * limit_train_batches
    if pos_diff % limit_train_batches == 0:
        latest_validation_epoch_end += limit_train_batches

    # Compute early stopping latest step
    by_early_stopping = latest_validation_epoch_end + (1 + limit_train_batches) * patience

    # Compute min_epochs latest step
    by_min_epochs = min_epochs * limit_train_batches

    # Make sure the trainer stops for the max of all minimum requirements
    assert trainer.global_step == max(min_steps, by_early_stopping, by_min_epochs), (
        trainer.global_step,
        max(min_steps, by_early_stopping, by_min_epochs),
        step_freeze,
        min_steps,
        min_epochs,
    )

    _logger.disabled = False
                         download=True,
                         transform=transforms.ToTensor())

    test_loader = DataLoader(test_dataset)

    logger.info(f"Done!"
                f"\n# of train examples: {n_train}"
                f"\n# of val examples: {n_val}"
                f"\n# of test examples: {len(test_dataset)}")

    # init model
    model = LitModel(args)

    if args.patience is not None:
        early_stop_ckpt = EarlyStopping(monitor='val_loss',
                                        verbose=True,
                                        patience=args.patience)
    else:
        early_stop_ckpt = None


    profiler = SimpleProfiler()

    lightning_log_pth = '/lightning_logs'

    if not os.path.isdir(lightning_log_pth):
        logger.warning(f"Unable to find {lightning_log_pth} to log to! "
                       f"If not running Grid then ignore.")
        save_dir = ''
    else:
        save_dir = lightning_log_pth
Example no. 23
    if (not os.path.exists(args.ckpt_path)):
        print('Creating CKPT Dir')
        os.mkdir(args.ckpt_path)
    '''
    checkpoint_callback = ModelCheckpoint(
        filepath=args.ckpt_path,
        save_top_k=True,
        verbose=True,
        monitor='val_loss',
        mode='min',
        prefix=''
    )
    '''
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00005,
                                        patience=3,
                                        verbose=False,
                                        mode='min')

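    # Repeat training for several independent runs; "rand" (presumably a seed
    # or run id consumed by LMFineTuner) gets a different value each trial.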
    for trial in range(args.num_runs):
        args.__dict__["rand"] = trial + 1

        trainer = pl.Trainer(default_save_path=args.ckpt_path,
                             distributed_backend=args.distributed_backend,
                             gpus=len(args.gpus.split(',')),
                             max_epochs=args.e,
                             early_stop_callback=early_stop_callback)

        model = LMFineTuner(args)
        trainer.fit(model)
        trainer.test(model)
Example no. 24
0
                    default=True)
parser.add_argument("-optim", default='adam', type=str)
parser.add_argument("-lr", default=1e-5, type=float)
parser.add_argument("-lr_bert", default=5e-4, type=float)
parser.add_argument("-beta1", default=0.9, type=float)
parser.add_argument("-beta2", default=0.999, type=float)
parser.add_argument("-warmup_steps", default=8000, type=int)
parser.add_argument("-warmup_steps_bert", default=8000, type=int)
parser.add_argument("-max_grad_norm", default=0, type=float)

args = parser.parse_args()
print("\nArguments...\n")
print(args)

print("\nCreating callbacks...\n")
early_stop_callback = EarlyStopping('val_p1', patience=5)
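# Checkpoint filenames embed the epoch number and validation loss; note that
# early stopping tracks val_p1 while checkpointing ranks models by val_p5.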
modelfilepath = '../trainedmodels/' + args.model_type + "_" + args.save_mname + '-{epoch:02d}-{val_loss:.2f}'
checkpoint_callback = ModelCheckpoint(filepath=modelfilepath,
                                      save_top_k=10,
                                      monitor='val_p5')
logger = TensorBoardLogger("../tb_logs",
                           name=args.model_type,
                           version=args.save_mname)

print("\nLoading Model...\n")
model = LitNet(Eval(), args, mode=args.model_type)
trainer = Trainer(gpus=[int(item) for item in args.gpus.split(',')],
                  min_epochs=1,
                  max_epochs=args.epochs,
                  distributed_backend='dp',
                  profiler=True,
Example no. 25
0
def main(argv):
    if not os.path.exists(FLAGS.logs_dir):
        os.makedirs(FLAGS.logs_dir)

    set_seed(FLAGS.seed)
    id2class, intent_examples = read_nlu_data()

    if FLAGS.do_train:
        if not os.path.exists(FLAGS.output_dir):
            os.makedirs(FLAGS.output_dir)

        model = NluClassifier(id2class, intent_examples)

        early_stop_callback = EarlyStopping(
            monitor=FLAGS.monitor,
            min_delta=0.0,
            patience=FLAGS.patience,
            verbose=True,
            mode=FLAGS.metric_mode,
        )

        checkpoint_callback = ModelCheckpoint(filepath=FLAGS.output_dir,
                                              save_top_k=3,
                                              monitor=FLAGS.monitor,
                                              mode=FLAGS.metric_mode,
                                              prefix='nlu_')

        trainer = pl.Trainer(
            default_root_dir='logs',
            gpus=(FLAGS.gpus if torch.cuda.is_available() else 0),
            distributed_backend='dp',
            max_epochs=FLAGS.epochs,
            fast_dev_run=FLAGS.debug,
            logger=pl.loggers.TensorBoardLogger('logs/', name='nlu',
                                                version=0),
            checkpoint_callback=checkpoint_callback,
            early_stop_callback=early_stop_callback)

        trainer.fit(model)

    if FLAGS.do_predict:
        from sanic import Sanic, response
        server = Sanic()

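        # Load the lexicographically last 'nlu_*' checkpoint written during training.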
        checkpoints = list(
            sorted(
                glob(os.path.join(FLAGS.output_dir, "nlu_*.ckpt"),
                     recursive=True)))
        model = NluClassifier.load_from_checkpoint(
            checkpoint_path=checkpoints[-1],
            id2class=id2class,
            intent_examples=intent_examples)
        model.eval()
        model.freeze()

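        # Minimal inference endpoint: the POSTed JSON body is passed straight
        # to model.predict and the prediction is returned as JSON.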
        @server.route("/parse", methods=['POST'])
        async def parse(request):
            texts = request.json
            prediction = model.predict(texts)
            return response.json(prediction)

        server.run(host="0.0.0.0", port=5000, debug=True)
Example no. 26
0
def train(exp_name, gpus):
    print("Start")
    file = open('data/nela-covid-2020/combined/headlines_cnn_bart_split.pkl',
                'rb')
    # file = open('data/nela-covid-2020/combined/headlines_contentmorals_cnn_bart_split.pkl', 'rb')
    data = pickle.load(file)
    file.close()
    print("Data Loaded")

    # create datasets
    # train_dataset = NewsDataset(data['train'][0:1])
    train_dataset = NewsDataset(data['train'])
    val_dataset = NewsDataset(data['val'])
    test_dataset = NewsDataset(data['test'])

    embedding_dataset = EmbeddingDataset()

    train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4)

    # train_loader = DataLoader(train_dataset, batch_size=16, num_workers=4)
    # val_loader = DataLoader(val_dataset, batch_size=16, num_workers=4)

    # train_loader = DataLoader(embedding_dataset, batch_size=32, num_workers=4)
    # train_loader = DataLoader(embedding_dataset, batch_size=512, num_workers=4)
    # val_loader = DataLoader(embedding_dataset, batch_size=64, num_workers=4)

    # ------------
    # training
    # ------------
    LEARNING_RATE = 1e-5
    hparams = {'lr': LEARNING_RATE}
    model = OneHotMoralClassifier(hparams, use_mask=False)
    # model = CustomMoralClassifier(hparams)
    # model = MoralClassifier(hparams)
    # model = PseudoEmbedding(hparams)
    early_stop_callback = EarlyStopping(monitor='val_loss',
                                        min_delta=0.00,
                                        patience=3,
                                        verbose=True,
                                        mode='auto')
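    # Note: early stopping watches val_loss, while checkpointing keeps only
    # the single best model ranked by train_loss.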
    checkpoint_callback = ModelCheckpoint(dirpath=os.path.join(
        "./experiments", exp_name, "checkpoints"),
                                          save_top_k=1,
                                          monitor='train_loss',
                                          mode='min')
    trainer = Trainer(
        gpus=gpus,
        # auto_lr_find=False, # use to explore LRs
        # distributed_backend='dp',
        max_epochs=20,
        callbacks=[early_stop_callback, checkpoint_callback],
    )

    # LR Exploration
    # lr_finder = trainer.tuner.lr_find(model, train_loader, val_loader)
    # print(lr_finder.results)
    # fig = lr_finder.plot(suggest=True)
    # # fig.show()
    # # fig.savefig('lr.png')
    # new_lr = lr_finder.suggestion()
    # print(new_lr)

    trainer.fit(model, train_loader, val_loader)
    print("Training Done")
Example no. 27
0
    val_dataset = create_dataset(val['X'],
                                 val['y'],
                                 model_name,
                                 256,
                                 num_classes=2)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=64,
                                  num_workers=3,
                                  shuffle=True)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=64,
                                num_workers=3,
                                shuffle=False)

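    # Both callbacks track validation accuracy, so both use mode='max'
    # (higher is better).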
    early_stopping = EarlyStopping('val_accuracy', patience=6, mode='max')
    model_checkpoint = ModelCheckpoint(monitor='val_accuracy',
                                       mode='max',
                                       save_top_k=1)

    trainer = pl.Trainer(
        deterministic=True,
        weights_save_path=f'checkpoints/{checkpoint_directory}/',
        logger=wandb_logger,
        early_stop_callback=early_stopping,
        checkpoint_callback=model_checkpoint,
        distributed_backend='dp',
        gpus=None,
        # gradient_clip_val=0.5,
        num_sanity_val_steps=-1,
        min_epochs=100)
Example no. 28
0
def get_early_stop_callback(patience=10):
    return EarlyStopping(monitor='val_loss',
                         patience=patience,
                         verbose=True,
                         mode='min')
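# Typical use, assuming the callbacks-list Trainer API:
#   trainer = Trainer(callbacks=[get_early_stop_callback(patience=10)])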
Example no. 29
0
def get_early_stopping_callback(metric, patience):
    return EarlyStopping(monitor=f"val_{metric}", mode="max", patience=patience, verbose=True,)
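# e.g. get_early_stopping_callback("acc", patience=3) monitors "val_acc" and
# stops training once it stops improving (mode="max").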
Example no. 30
0
        else:
            suggested_lr = float(lr_find_config.loc[args.model_name, 'lr'])
            print(f'Reading LR {suggested_lr} from archive config.')
        model.lr = suggested_lr
        # Need to manually update, similar to doc.
        # Reference: https://pytorch-lightning.readthedocs.io/en/latest/lr_finder.html
        model.hparams.lr = suggested_lr

    if args.logger_platform == 'wandb':
        logger = WandbLogger(project="ptb-xl")
    elif args.logger_platform == 'tensorboard':
        logger = TensorBoardLogger(args.log_dir, name='')
        model.log_dir = args.log_dir

    early_stopping_callback = EarlyStopping(
        verbose=True, monitor='val_epoch_loss', mode='min',
        patience=5) if args.early_stopping else None
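    # checkpoint_callback here is just a bool: checkpointing is enabled only
    # when requested and only on the process with LOCAL_RANK 0.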
    checkpoint_callback = args.checkpoint_models and int(
        os.environ.get('LOCAL_RANK', 0)) == 0
    progress_bar_callback = ProgressBar()
    batch_gradient_verification_callback = BatchGradientVerificationCallback(
        output_mapping=lambda output: output['loss'])
    batch_norm_verification_callback = BatchNormVerificationCallback()
    module_monitor_callback = ModuleDataMonitor(submodules=True)

    # Re-create the trainer here; reusing the previous instance ran into threading issues
    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        deterministic=True,
        logger=logger,