Example #1
@pytest.mark.parametrize('logging_interval', ['step', 'epoch'])  # intervals checked below
def test_lr_logger_multi_lrs(tmpdir, logging_interval):
    """ Test that learning rates are extracted and logged for multi lr schedulers. """
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__multiple_schedulers

    lr_logger = LearningRateLogger(logging_interval=logging_interval)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_logger],
    )
    result = trainer.fit(model)
    assert result

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
        'Names of learning rates not set correctly'

    if logging_interval == 'step':
        expected_number_logged = trainer.global_step
    elif logging_interval == 'epoch':
        expected_number_logged = trainer.max_epochs

    assert all(len(lr) == expected_number_logged for lr in lr_logger.lrs.values()), \
        'Length of logged learning rates does not match the expected number'
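For context, `configure_optimizers__multiple_schedulers` belongs to the test template and is not shown here; a minimal sketch of an equivalent override that would produce the `lr-Adam` and `lr-Adam-1` keys (illustrative only, not the actual EvalModelTemplate code):
import torch

def configure_optimizers__multiple_schedulers(self):
    # Two Adam optimizers, each paired with its own scheduler, so the callback
    # logs two learning-rate series named 'lr-Adam' and 'lr-Adam-1'.
    optimizer1 = torch.optim.Adam(self.parameters(), lr=1e-3)
    optimizer2 = torch.optim.Adam(self.parameters(), lr=1e-2)
    scheduler1 = torch.optim.lr_scheduler.StepLR(optimizer1, step_size=1, gamma=0.1)
    scheduler2 = torch.optim.lr_scheduler.StepLR(optimizer2, step_size=1, gamma=0.1)
    return [optimizer1, optimizer2], [scheduler1, scheduler2]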
Example #2
def main(hparams):
    cifar10_download.main()

    if not th.cuda.is_available():
        hparams.cuda = False

    hparams.gpus = '0,' if hparams.cuda else None
    
    seed_everything(hparams.seed)

    # When training on a single GPU, call set_device; otherwise PyTorch always
    # stores the model on GPU 0 first.
    if isinstance(hparams.gpus, str):
        if len(hparams.gpus) == 2:  # GPU index and a comma, e.g. '0,' or '1,'
            torch.cuda.set_device(int(hparams.gpus[0]))
    
    # Model
    classifier = CIFAR10_Module(hparams)
    
    # Trainer
    lr_logger = LearningRateLogger()
    logger = TensorBoardLogger("logs", name=hparams.classifier)
    trainer = Trainer(callbacks=[lr_logger],
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      deterministic=True,
                      early_stop_callback=False,
                      logger=logger,
                      checkpoint_callback=False,
                      fast_dev_run=hparams.debug)
    if not hparams.eval:
        trainer.fit(classifier)
    else:
        trainer.test(classifier)
    if hparams.save_model:
        model = classifier.student_models[0] if hparams.num_students else classifier.teacher_model
        th.save(model.state_dict(), 'logs/{}.pt'.format(hparams.classifier))
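This snippet relies on a `get_args()`-style helper that is not shown; a hedged sketch of an argparse setup consistent with the attributes accessed above (attribute names taken from the code, every default invented):
from argparse import ArgumentParser

def get_args():
    # Hypothetical reconstruction of the missing argument parser; only the
    # attribute names are grounded in the snippet above, defaults are placeholders.
    parser = ArgumentParser()
    parser.add_argument('--classifier', type=str, default='resnet18')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--max_epochs', type=int, default=100)
    parser.add_argument('--num_students', type=int, default=0)
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--save_model', action='store_true')
    return parser.parse_args()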
Example #3
def test_lr_logger_multi_lrs(tmpdir):
    """ Test that learning rates are extracted and logged for multi lr schedulers. """
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__multiple_schedulers

    lr_logger = LearningRateLogger()
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_val_batches=0.1,
        limit_train_batches=0.5,
        callbacks=[lr_logger]
    )
    result = trainer.fit(model)
    assert result

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of lr schedulers'
    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
        'Names of learning rates not set correctly'
    assert all(len(lr) == trainer.max_epochs for lr in lr_logger.lrs.values()), \
        'Number of logged learning rates does not match the number of epochs'
Example #4
def train():
    config = get_transformer_encoder_config()
    model = TransformerModelEncoderLightning(config)
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()

    tensorlogger = TensorBoardLogger("ts_logger", "transformer_encoder")
    # define early stopping callback
    early_stopping_callback = EarlyStopping(patience=3,
                                            verbose=True,
                                            mode="min")
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(tensorlogger.log_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=1,
        save_top_k=3,
    )
    trainer = pl.Trainer(max_epochs=10,
                         gpus=gpu,
                         gradient_clip_val=0.5,
                         row_log_interval=200,
                         check_val_every_n_epoch=1,
                         reload_dataloaders_every_epoch=True,
                         callbacks=[lr_logger],
                         logger=tensorlogger,
                         checkpoint_callback=model_checkpoint_callback,
                         early_stop_callback=early_stopping_callback,
                         progress_bar_refresh_rate=1)
    trainer.fit(model)
Example #5
def main():
    from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

    args = get_args()
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    model = LNNP(args)
    checkpoint_callback = ModelCheckpoint(
        filepath=args.log_dir,
        monitor="val_loss",
        save_top_k=8,
        period=args.eval_interval,
    )
    lr_logger = LearningRateLogger()
    tb_logger = pl.loggers.TensorBoardLogger(args.log_dir)
    trainer = pl.Trainer(gpus=args.gpus,
                         max_epochs=args.num_epochs,
                         distributed_backend=args.distributed_backend,
                         num_nodes=args.num_nodes,
                         default_root_dir=args.log_dir,
                         auto_lr_find=False,
                         resume_from_checkpoint=args.load_model,
                         checkpoint_callback=checkpoint_callback,
                         callbacks=[lr_logger],
                         logger=tb_logger,
                         reload_dataloaders_every_epoch=False)

    trainer.fit(model)

    # run test set after completing the fit
    trainer.test()
Example #6
def main(hparams):

    checkpoint = ModelCheckpoint(filepath='weights/{epoch}_fold=' +
                                 str(hparams.fold),
                                 save_top_k=1,
                                 monitor='val_loss',
                                 mode='min',
                                 verbose=True,
                                 prefix='')

    logger = TensorBoardLogger(
        'pn_logs', name='fold={fold}'.format(fold=str(hparams.fold)))
    lr_logger = LearningRateLogger(logging_interval='step')
    model = Pneumothorax(hparams)
    trainer = pl.Trainer(
        max_epochs=hparams.epochs,
        gpus=[hparams.gpus],
        checkpoint_callback=checkpoint,
        accumulate_grad_batches=2,
        logger=logger,
        callbacks=[lr_logger],
        precision=16,
    )

    trainer.fit(model)
Example #7
def main(hparams):
    
    seed_everything(0)
    
    # When training on a single GPU, call set_device; otherwise PyTorch always
    # stores the model on GPU 0 first.
    if isinstance(hparams.gpus, str):
        if len(hparams.gpus) == 2:  # GPU index and a comma, e.g. '0,' or '1,'
            torch.cuda.set_device(int(hparams.gpus[0]))
    
    # Model
    classifier = CIFAR10_Module(hparams)
    
    # Trainer
    lr_logger = LearningRateLogger()
    logger = TensorBoardLogger("logs", name=hparams.classifier)
    trainer = Trainer(callbacks=[lr_logger], gpus=hparams.gpus, max_epochs=hparams.max_epochs,
                      deterministic=True, early_stop_callback=False, logger=logger)
    trainer.fit(classifier)

    # Load best checkpoint
    checkpoint_path = os.path.join(os.getcwd(), 'logs', hparams.classifier, 'version_' + str(classifier.logger.version),'checkpoints')
    classifier = CIFAR10_Module.load_from_checkpoint(os.path.join(checkpoint_path, os.listdir(checkpoint_path)[0]))
    
    # Save weights from checkpoint
    statedict_path = os.path.join(os.getcwd(), 'cifar10_models', 'state_dicts', hparams.classifier + '.pt')
    torch.save(classifier.model.state_dict(), statedict_path)
    
    # Test model
    trainer.test(classifier)
Example #8
def get_trainer_kwargs(args):
    version = args.version
    if args.name is not None:
        version = f'{version}-{args.name}'
    if args.resume_from_checkpoint is not None and '/' not in args.resume_from_checkpoint:
        if args.resume_from_checkpoint.endswith(args.name):
            version = f'{args.version}-{args.resume_from_checkpoint}'
        else:
            version = f'{version}-{args.resume_from_checkpoint}'
    args.version = version

    os.makedirs(Path('data') / args.lang / 'runs' / args.version,
                exist_ok=True)
    logger = pl.loggers.TensorBoardLogger(save_dir=str(
        Path('data') / args.lang),
                                          name='runs',
                                          version=args.version)
    lr_logger = LearningRateLogger()
    checkpoint_callback = ModelCheckpoint(filepath=None,
                                          monitor='val_loss',
                                          mode='min',
                                          verbose=True,
                                          save_top_k=5,
                                          save_last=True,
                                          period=0)

    return {
        'logger': logger,
        'default_root_dir': 'data',
        'callbacks': [lr_logger],
        'checkpoint_callback': checkpoint_callback,
        'replace_sampler_ddp': False
    }
Example #9
def test_train_pipeline(fix_seed, config, gpus):
    config = OmegaConf.create(config)

    train_dataloader, test_dataloader = get_data_loaders(config=config)
    lr_logger = LearningRateLogger()
    model = build_model(model_conf=config.model)
    runner = Runner(model=model, config=config.runner)

    trainer = Trainer(
        distributed_backend=config.runner.trainer.distributed_backend,
        fast_dev_run=True,
        gpus=gpus,
        amp_level="O2",
        row_log_interval=10,
        callbacks=[lr_logger],
        max_epochs=1,
        weights_summary="top",
        reload_dataloaders_every_epoch=False,
        resume_from_checkpoint=None,
        benchmark=False,
        deterministic=True,
        num_sanity_val_steps=5,
        overfit_batches=0.0,
        precision=32,
        profiler=True,
    )

    trainer.fit(model=runner,
                train_dataloader=train_dataloader,
                val_dataloaders=test_dataloader)
Example #10
def train(dataset_name: str,
          model_name: str,
          expt_dir: str,
          data_folder: str,
          num_workers: int = 0,
          is_test: bool = False,
          resume_from_checkpoint: str = None):
    seed_everything(SEED)
    dataset_main_folder = data_folder
    vocab = Vocabulary.load(join(dataset_main_folder, "vocabulary.pkl"))

    if model_name == "code2seq":
        config_function = get_code2seq_test_config if is_test else get_code2seq_default_config
        config = config_function(dataset_main_folder)
        model = Code2Seq(config, vocab, num_workers)
        model.half()
    #elif model_name == "code2class":
    #	config_function = get_code2class_test_config if is_test else get_code2class_default_config
    #	config = config_function(dataset_main_folder)
    #	model = Code2Class(config, vocab, num_workers)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project=f"{model_name}-{dataset_name}",
                               log_model=True,
                               offline=True)
    wandb_logger.watch(model)
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(expt_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.hyperparams.save_every_epoch,
        save_top_k=3,
    )
    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyperparams.patience, verbose=True, mode="min")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        max_epochs=20,
        gradient_clip_val=config.hyperparams.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.hyperparams.val_every_epoch,
        row_log_interval=config.hyperparams.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)
    trainer.save_checkpoint(join(expt_dir, 'Latest.ckpt'))

    trainer.test()
Example #11
def generic_train(
        model: BaseTransformer,
        args: argparse.Namespace,
        early_stopping_callback=False,
        logger=True,  # can pass WandbLogger() here
        extra_callbacks=[],
        checkpoint_callback=None,
        logging_callback=None,
        **extra_train_kwargs):
    pl.seed_everything(args.seed)

    # init model
    odir = Path(model.hparams.output_dir)
    odir.mkdir(exist_ok=True)

    # add custom checkpoints
    if checkpoint_callback is None:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            filepath=args.output_dir,
            prefix="checkpoint",
            monitor="val_loss",
            mode="min",
            save_top_k=1)
    if logging_callback is None:
        logging_callback = LoggingCallback()

    train_params = {}

    # TODO: remove with PyTorch 1.6 since pl uses native amp
    if args.fp16:
        train_params["precision"] = 16
        train_params["amp_level"] = args.fp16_opt_level

    if args.gpus > 1:
        train_params["distributed_backend"] = "ddp"

    train_params["accumulate_grad_batches"] = args.accumulate_grad_batches

    lr_logger = LearningRateLogger(logging_interval='step')

    #         deterministic=True,
    trainer = pl.Trainer.from_argparse_args(
        args,
        weights_summary='full',
        callbacks=[logging_callback, lr_logger] + extra_callbacks,
        logger=logger,
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        num_sanity_val_steps=4,
        **train_params,
    )

    trainer.lr_logger = lr_logger

    if args.do_train:
        trainer.fit(model)

    return trainer
Example #12
def init_trainer():
    """ Init a Lightning Trainer using from_argparse_args
    Thus every CLI command (--gpus, distributed_backend, ...) become available.
    """
    parser = ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()
    lr_logger = LearningRateLogger()
    return Trainer.from_argparse_args(args, callbacks=[lr_logger])
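A possible way to use this helper (a sketch only; `LitModel` is a placeholder for any LightningModule, not part of the example):
# Hypothetical usage of init_trainer(); LitModel stands in for a LightningModule
# defined elsewhere.
if __name__ == '__main__':
    trainer = init_trainer()
    model = LitModel()
    trainer.fit(model)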
Example #13
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    setup_seed(cfg.random_seed)

    model = LightningModel(cfg)

    checkpoint_callback = ModelCheckpoint(
        filepath=f"{cfg.checkpoint_path}/{cfg.name}/{cfg.version}/"
        f"{cfg.name}_{cfg.version}_{{epoch}}_{{avg_val_loss:.3f}}_{{ade:.3f}}_{{fde:.3f}}_{{fiou:.3f}}",
        save_last=True,
        save_top_k=8,
        verbose=True,
        monitor='fiou',
        mode='max',
        prefix='')

    lr_logger_callback = LearningRateLogger(logging_interval='step')

    logger = TensorBoardLogger(save_dir=cfg.log_path,
                               name=cfg.name,
                               version=cfg.version)
    logger.log_hyperparams(model.hparams)

    profiler = SimpleProfiler() if cfg.simple_profiler else AdvancedProfiler()
    check_val_every_n_epoch = cfg.check_val_every_n_epoch if hasattr(
        cfg, 'check_val_every_n_epoch') else 1

    trainer = pl.Trainer(
        gpus=cfg.num_gpus,
        max_epochs=cfg.max_epochs,
        logger=logger,
        profiler=profiler,  # this line won't work in multi-gpu setting.
        weights_summary="top",
        gradient_clip_val=cfg.gradient_clip_val,
        callbacks=[lr_logger_callback],
        checkpoint_callback=checkpoint_callback,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        accumulate_grad_batches=cfg.batch_size_times,
        check_val_every_n_epoch=check_val_every_n_epoch)

    if (not (args.train or args.test)) or args.train:
        shutil.copy(
            args.config,
            os.path.join(cfg.log_path, cfg.name, cfg.version,
                         args.config.split('/')[-1]))

        if cfg.load_from_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.load_from_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.fit(model)

    if args.test:
        if cfg.test_checkpoint is not None:
            model_ckpt = partial_state_dict(model, cfg.test_checkpoint)
            model.load_state_dict(model_ckpt)
        trainer.test(model)
Example #14
def setup_callbacks_loggers(args):
    
    log_path = Path('/home/yyousfi1/LogFiles/comma/')
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir)/'checkpoints/{epoch:02d}_{val_loss:.4f}', 
                                    save_top_k=10, save_last=True)
   
    return ckpt_callback, tb_logger, lr_logger
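The calling code is not shown; the three returned objects are presumably wired into the Trainer roughly as below (a sketch only; `args.gpus`, `args.max_epochs`, and `model` are assumptions, not part of the example):
# Hedged sketch of how the returned objects would typically be consumed.
ckpt_callback, tb_logger, lr_logger = setup_callbacks_loggers(args)
trainer = Trainer(logger=tb_logger,
                  checkpoint_callback=ckpt_callback,
                  callbacks=[lr_logger],
                  gpus=args.gpus,
                  max_epochs=args.max_epochs)
trainer.fit(model)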
Example #15
def main(args):
    logger = pl_loggers.WandbLogger(name="example", save_dir=None)
    early_stop = EarlyStopping(monitor="val_loss")
    checkpoint_callback = ModelCheckpoint(dirpath="ckpts/", monitor="val_loss")
    model = ExampleModel(args)
    lr_logger = LearningRateLogger()
    trainer = Trainer.from_argparse_args(
        args,
        logger=logger,
        callbacks=[early_stop, lr_logger],
        checkpoint_callback=checkpoint_callback)
    trainer.fit(model)
Example #16
def setup_callbacks_loggers(args):

    log_path = Path('/home/yyousfi1/LogFiles/OneHotConv/')
    log_path = log_path / args.qf / args.stego_scheme / args.payload
    name = args.backbone
    version = args.version
    tb_logger = TensorBoardLogger(log_path, name=name, version=version)
    lr_logger = LearningRateLogger(logging_interval='epoch')
    ckpt_callback = ModelCheckpoint(filepath=Path(tb_logger.log_dir) /
                                    'checkpoints/{epoch:02d}_{val_FC_acc:.3f}',
                                    save_top_k=5,
                                    save_last=True)

    return ckpt_callback, tb_logger, lr_logger
Example #17
def main(cfg: DictConfig) -> None:
    print(cfg.pretty())
    neptune_logger = CustomNeptuneLogger(params=flatten_dict(
        OmegaConf.to_container(cfg, resolve=True)),
                                         **cfg.logging.neptune_logger)
    tb_logger = loggers.TensorBoardLogger(**cfg.logging.tb_logger)

    lr_logger = LearningRateLogger()

    # TODO change to cyclicLR per epochs
    my_callback = MyCallback(cfg)

    model = get_model(cfg)
    if cfg.model.ckpt_path is not None:
        ckpt_pth = glob.glob(utils.to_absolute_path(cfg.model.ckpt_path))
        model = load_pytorch_model(ckpt_pth[0], model)

    seed_everything(2020)

    # TODO change to enable logging losses
    lit_model = O2UNetSystem(hparams=cfg, model=model)

    checkpoint_callback_conf = OmegaConf.to_container(
        cfg.callbacks.model_checkpoint, resolve=True)
    checkpoint_callback = ModelCheckpoint(**checkpoint_callback_conf)

    early_stop_callback_conf = OmegaConf.to_container(cfg.callbacks.early_stop,
                                                      resolve=True)
    early_stop_callback = EarlyStopping(**early_stop_callback_conf)

    trainer = Trainer(
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        logger=[tb_logger, neptune_logger],
        # logger=[tb_logger],
        callbacks=[lr_logger, my_callback],
        **cfg.trainer)

    # TODO change to train with all data

    datasets = get_datasets(OmegaConf.to_container(cfg, resolve=True))
    train_dataset = datasets["train"]
    valid_dataset = datasets["valid"]
    trainer.fit(
        lit_model,
        train_dataloader=DataLoader(train_dataset,
                                    **cfg["training"]["dataloader"]["train"]),
        val_dataloaders=DataLoader(valid_dataset,
                                   **cfg["training"]["dataloader"]["valid"]))
Example #18
def test_lr_logger_no_lr(tmpdir):
    tutils.reset_seed()

    model = EvalModelTemplate()

    lr_logger = LearningRateLogger()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_val_batches=0.1,
                      train_percent_check=0.5,
                      callbacks=[lr_logger])

    with pytest.warns(RuntimeWarning):
        result = trainer.fit(model)
        assert result
Example #19
def test_tbd_remove_in_v0_11_0_trainer():
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        LearningRateLogger()

    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        trainer = Trainer(row_log_interval=8)
    assert trainer.log_every_n_steps == 8
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        assert trainer.row_log_interval == 8

    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        trainer = Trainer(log_save_interval=9)
    assert trainer.flush_logs_every_n_steps == 9
    with pytest.deprecated_call(match='will be removed in v0.11.0'):
        assert trainer.log_save_interval == 9
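As the deprecation test above indicates, `LearningRateLogger` was scheduled for removal; on later PyTorch Lightning releases the renamed callback is `LearningRateMonitor`, used the same way. A minimal, version-dependent sketch:
# Equivalent setup on PyTorch Lightning >= 1.0, where LearningRateLogger was
# renamed to LearningRateMonitor and row_log_interval became log_every_n_steps.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor

lr_monitor = LearningRateMonitor(logging_interval='step')
trainer = Trainer(callbacks=[lr_monitor], log_every_n_steps=8)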
Example #20
def init_trainer():
    """ Init a Lightning Trainer using from_argparse_args,
    so that every Trainer CLI flag (--gpus, --distributed_backend, ...) becomes available.
    """
    parser = ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()
    lr_logger = LearningRateLogger()
    early_stopping = EarlyStopping(monitor='val_loss',
                                   mode='min',
                                   min_delta=0.001,
                                   patience=10,
                                   verbose=True)
    return Trainer.from_argparse_args(args,
                                      callbacks=[lr_logger, early_stopping])
Example #21
def train(file_path,
          train_ratio=0.8,
          optimizer="AdamW",
          intent_optimizer_lr=1e-4,
          entity_optimizer_lr=2e-4,
          epochs=20,
          batch_size=None,
          gpu_num=0,
          distributed_backend=None,
          checkpoint_prefix='morphine_model_'):
    early_stopping = EarlyStopping('val_loss')
    lr_logger = LearningRateLogger()
    checkpoint_callback = model_checkpoint.ModelCheckpoint(
        prefix=checkpoint_prefix)

    prepare_data_per_node = True
    if 0 <= gpu_num < 2: prepare_data_per_node = False

    if batch_size is None:
        trainer = Trainer(auto_scale_batch_size="power",
                          max_epochs=epochs,
                          gpus=gpu_num,
                          distributed_backend=distributed_backend,
                          early_stop_callback=early_stopping,
                          callbacks=[lr_logger],
                          checkpoint_callback=checkpoint_callback,
                          prepare_data_per_node=prepare_data_per_node)
    else:
        trainer = Trainer(max_epochs=epochs,
                          gpus=gpu_num,
                          distributed_backend=distributed_backend,
                          early_stop_callback=early_stopping,
                          callbacks=[lr_logger],
                          checkpoint_callback=checkpoint_callback,
                          prepare_data_per_node=prepare_data_per_node)

    model_args = {}
    model_args["epochs"] = epochs
    model_args["batch_size"] = batch_size
    model_args["nlu_data"] = open(file_path, encoding="utf-8").readlines()
    model_args["train_ratio"] = train_ratio
    model_args["optimizer"] = optimizer
    model_args["intent_optimizer_lr"] = intent_optimizer_lr
    model_args["entity_optimizer_lr"] = entity_optimizer_lr

    hparams = Namespace(**model_args)
    model = MorphineClassifier(hparams)
    trainer.fit(model)
Example #22
def train(omegaConf: DictConfig) -> LightningModule:
    # Misc part
    if omegaConf['runner']['verbose'] is True:
        print(OmegaConf.to_yaml(omegaConf))

    pl.seed_everything(omegaConf['runner']['seed'])

    # Runner part
    runner = make_runner(omegaConf['runner'])

    if "auto_lr_find" in omegaConf['trainer'] and omegaConf['trainer']['auto_lr_find'] is True:
        runner = custom_lr_finder(runner, omegaConf)

    # When we are here, the omegaConf has already been checked by OmegaConf
    # so we can extract primitives to use with other libs
    config = OmegaConf.to_container(omegaConf)
    assert isinstance(config, dict)

    config['trainer']['default_root_dir'] = check_default_root_dir(config)

    config['trainer']['checkpoint_callback'] = build_checkpoint_callback(config)

    if 'logger' in config['trainer']:
        config['trainer']['logger'] = build_logger(config)

    if 'deterministic' in config['trainer']:
        config['trainer']['deterministic'] = True

    if 'profiler' in config['trainer'] and config['trainer']['profiler'] is True:
        config['trainer']['profiler'] = AdvancedProfiler()

    if 'scheduler' in config['runner'] and config['runner']['scheduler'] is not None:
        lr_monitor = LearningRateLogger(logging_interval='step')
        config['trainer']['callbacks'] = [lr_monitor]

    # ###
    # # Early stopping
    # # It is breaking neptune logging somehow, it seems that it overrides by 1 the current timestep
    # ###
    # early_stop_callback = EarlyStopping(
    #     monitor='val_accuracy', min_delta=0.00, patience=10, verbose=False, mode='max'
    # )
    # config['trainer']['early_stop_callback'] = early_stop_callback

    trainer = pl.Trainer(**config['trainer'])
    trainer.fit(runner)

    return runner
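The keys read by train() above suggest a config shaped roughly as follows; every value is an illustrative placeholder, and the helper functions (make_runner, build_logger, build_checkpoint_callback, ...) may require additional keys not shown:
from omegaconf import OmegaConf

# Illustrative config covering only the keys accessed in train(); all values
# are placeholders.
omegaConf = OmegaConf.create({
    'runner': {
        'verbose': True,
        'seed': 42,
        'scheduler': {'name': 'step'},  # non-None, so the LearningRateLogger is attached
    },
    'trainer': {
        'max_epochs': 10,
        'deterministic': True,
        'auto_lr_find': False,
        'logger': True,
        'profiler': False,
    },
})
runner = train(omegaConf)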
Example #23
def train_regression(hparams):
    if hparams.model == "UNetDS_Attention":
        net = unet_regr.UNetDS_Attention(hparams=hparams)
    elif hparams.model == "UNet_Attention":
        net = unet_regr.UNet_Attention(hparams=hparams)
    elif hparams.model == "UNet":
        net = unet_regr.UNet(hparams=hparams)
    elif hparams.model == "UNetDS":
        net = unet_regr.UNetDS(hparams=hparams)
    else:
        raise NotImplementedError(f"Model '{hparams.model}' not implemented")

    torchsummary.summary(net, (12, 288, 288), device="cpu")
    # return
    default_save_path = "lightning/precip_regression"

    checkpoint_callback = ModelCheckpoint(
        filepath=os.getcwd() + "/" + default_save_path + "/" +
        net.__class__.__name__ + "/{epoch}-{val_loss:.6f}",
        save_top_k=-1,
        verbose=False,
        monitor='val_loss',
        mode='min',
        prefix=net.__class__.__name__ + "_rain_threshhold_50_")
    lr_logger = LearningRateLogger()
    tb_logger = loggers.TensorBoardLogger(save_dir=default_save_path,
                                          name=net.__class__.__name__)

    earlystopping_callback = EarlyStopping(
        monitor='val_loss',
        mode='min',
        # patience is effectively halved due to a bug in pytorch-lightning
        patience=hparams.es_patience,
    )
    trainer = pl.Trainer(fast_dev_run=hparams.fast_dev_run,
                         gpus=hparams.gpus,
                         weights_summary=None,
                         max_epochs=hparams.epochs,
                         default_save_path=default_save_path,
                         checkpoint_callback=checkpoint_callback,
                         early_stop_callback=earlystopping_callback,
                         logger=tb_logger,
                         callbacks=[lr_logger],
                         resume_from_checkpoint=hparams.resume_from_checkpoint,
                         val_check_interval=hparams.val_check_interval,
                         overfit_pct=hparams.overfit_pct)
    trainer.fit(net)
Example #24
def train(model_name: str,
          n_cr: int,
          num_workers: int = 0,
          is_test: bool = False,
          resume_from_checkpoint: str = None):
    seed_everything(SEED)

    if model_name == "improved_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=True)
    elif model_name == "default_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=False)
    else:
        raise ValueError(f"Model {model_name} is not supported")
    # define logger
    wandb_logger = WandbLogger(project="GAN", log_model=True, offline=is_test)
    wandb_logger.watch(model, log="all")
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(wandb.run.dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.save_every_epoch,
        save_top_k=3,
    )
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        max_epochs=config.n_epochs,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        row_log_interval=config.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )

    trainer.fit(model)

    trainer.test()
Example #25
def main(cfg: DictConfig):
    LOG.info("Config:\n" + OmegaConf.to_yaml(cfg))
    seed_everything(cfg.seed)
    system = EpicActionRecognitionSystem(cfg)
    if not cfg.get("log_graph", True):
        # MTRN can't be traced due to the model's stochasticity, which causes a JIT
        # tracer error; this flag lets you skip tracing the graph when the summary
        # writer is created.
        system.example_input_array = None  # type: ignore
    data_module = EpicActionRecogintionDataModule(cfg)
    lr_logger = LearningRateLogger(logging_interval="step")
    checkpoint_callback = ModelCheckpoint(save_last=True)
    # with ipdb.launch_ipdb_on_exception():
    trainer = Trainer(callbacks=[lr_logger],
                      checkpoint_callback=checkpoint_callback,
                      **cfg.trainer)
    trainer.fit(system, datamodule=data_module)
Example #26
def main(hparams):
    pl.seed_everything(42)
    checkpoint_callback = ModelCheckpoint(save_top_k=-1)

    wandb_logger = WandbLogger(project='video-colorization',
                               tags=["colornet"],
                               name='SLURM',
                               log_model=True)
    hparams.logger = wandb_logger
    lr_logger = LearningRateLogger()

    colornet = model.ColorNet(hparams)
    trainer = pl.Trainer.from_argparse_args(
        hparams,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger])
    trainer.fit(colornet)
Example #27
def main(hparams):
    hparams.data_path += "x"
    experiments = {"0": [1], "1": [1], "2": [1], "3": [1], "4": [1]}

    loss_weights = {"1": 1.0, "5": 3.25, "21": 5.5}

    for seed, labels in experiments.items():
        for amount_labels in labels:
            splitted_path = hparams.data_path.split("/")
            splitted_path[-1] = str(seed)
            hparams.data_path = "/".join(splitted_path)
            hparams.amount_labels = amount_labels
            hparams.loss_weight = loss_weights[str(amount_labels)]

            model = RecRob(hparams)

            name = "recrob"
            logger = loggers.TensorBoardLogger(save_dir=hparams.output_dir,
                                               name=name,
                                               version="{}-{}".format(
                                                   str(amount_labels),
                                                   str(seed)),
                                               log_graph=True)

            lr_logger = LearningRateLogger(logging_interval="step")
            trainer = Trainer(
                default_root_dir=logger.log_dir + "/checkpoints/",
                logger=logger,
                log_save_interval=10,
                callbacks=[lr_logger],
                gpus=hparams.gpus,
                tpu_cores=hparams.tpu_cores,
                fast_dev_run=hparams.fast_dev_run,
                max_epochs=hparams.max_epochs,
                auto_lr_find=hparams.auto_lr_find,
                gradient_clip_val=hparams.gradient_clip_val,
                check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                amp_level=hparams.amp_level,
                accumulate_grad_batches=hparams.accumulate_grad_batches)

            print("Hyperparameter:")
            print("_______________")
            print(json.dumps(vars(hparams), indent=4))
            trainer.fit(model)
            test_result = trainer.test(model)
            trainer.logger.save()
Example #28
    def run(self, args=None):
        args = self.parse_args(args)
        seed_everything(args.seed)

        pprint.pprint('args')
        pprint.pprint(args.__dict__)
        pprint.pprint('*********************')

        checkpoint_callback = ModelCheckpoint(monitor='valid_loss',
                                              verbose=True,
                                              save_last=True)

        logger.info(args)

        lr_logger = LearningRateLogger()

        trainer = Trainer(
            default_root_dir=args.default_root_dir,
            progress_bar_refresh_rate=args.progress_bar_refresh_rate,
            min_epochs=args.min_epochs,
            max_epochs=args.max_epochs,
            val_check_interval=args.val_check_interval,
            limit_val_batches=args.limit_val_batches,
            gpus=args.gpus,
            distributed_backend=args.distributed_backend,
            row_log_interval=1,
            amp_level=args.amp_level,
            precision=args.precision,
            num_nodes=args.num_nodes,
            tpu_cores=args.tpu_cores,
            accumulate_grad_batches=args.accumulate_grad_batches,
            checkpoint_callback=checkpoint_callback,
            resume_from_checkpoint=args.resume_from_checkpoint,
            fast_dev_run=args.fast_dev_run,
            callbacks=[lr_logger],
        )

        model = self.get_model(args)
        logger.info(f'Start Training model {model}')

        logger.info('')
        trainer.fit(model)
        logger.info('Training loop finished.')

        return trainer
Example #29
def main(args: Namespace, model_cls) -> None:
    if args.seed is not None:
        pl.seed_everything(args.seed)

    if args.distributed_backend == 'ddp':
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / max(1, args.gpus))
        args.workers = int(args.workers / max(1, args.gpus))

    model = model_cls(**vars(args))
    lr_logger = LearningRateLogger(logging_interval='step')
    trainer = pl.Trainer.from_argparse_args(args, callbacks=[lr_logger])

    if args.evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)
Example #30
def test_lr_logger_param_groups(tmpdir):
    """ Test that learning rates are extracted and logged for single lr scheduler"""
    tutils.reset_seed()

    model = EvalModelTemplate()
    model.configure_optimizers = model.configure_optimizers__param_groups

    lr_logger = LearningRateLogger()
    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=5,
                      val_percent_check=0.1,
                      train_percent_check=0.5,
                      callbacks=[lr_logger])
    results = trainer.fit(model)

    assert lr_logger.lrs, 'No learning rates logged'
    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
        'Number of learning rates logged does not match number of param groups'
    assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()]), \
        'Names of learning rates not set correctly'
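The 'lr-Adam/pg1' and 'lr-Adam/pg2' names come from a single optimizer with two parameter groups; an equivalent override might look like this sketch (illustrative only, not the actual template code):
import torch

def configure_optimizers__param_groups(self):
    # One Adam optimizer with two parameter groups at different learning rates;
    # LearningRateLogger reports them as 'lr-Adam/pg1' and 'lr-Adam/pg2'.
    params = list(self.parameters())
    optimizer = torch.optim.Adam([
        {'params': params[:len(params) // 2], 'lr': 1e-3},
        {'params': params[len(params) // 2:], 'lr': 1e-2},
    ])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    return [optimizer], [scheduler]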