Code Example #1
File: main.py  Project: denix56/TADGAN
def main(dataset, gpus):
    batch_size = 64

    data = NABTraf(batch_size=batch_size, data_path=dataset)
    net = TadGAN(in_size=1,
                 weight_decay=1e-6,
                 iterations_critic=5,
                 lr=0.0005,
                 use_gru=True)
    net.example_input_array = torch.ones(batch_size, 100, 1, dtype=torch.float)
    logger = TensorBoardLogger('logs', name='tadgan', log_graph=True)

    #     early_stop_callback = EarlyStopping(
    #        monitor='F1',
    #        min_delta=0.00,
    #        patience=3,
    #        verbose=True,
    #        mode='max'
    #     )

    trainer = pl.Trainer(
        plugins=[DDPPlugin(find_unused_parameters=True)],
        fast_dev_run=False,
        weights_summary='full',
        log_gpu_memory=True,
        gpus=gpus,
        accelerator='ddp',
        logger=logger,
        check_val_every_n_epoch=5,
        max_epochs=100,
        callbacks=[
            GPUStatsMonitor(),
            # early_stop_callback
        ])
    trainer.fit(net, datamodule=data)
Code Example #2
def test_gpu_stats_monitor_no_queries(tmpdir):
    """
    Test GPU logger doesn't fail if no "nvidia-smi" queries are to be performed.
    """
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(
        memory_utilization=False,
        gpu_utilization=False,
        intra_step_time=True,
        inter_step_time=True,
    )
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[gpu_stats],
    )
    with mock.patch(
            "pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics"
    ) as log_metrics_mock:
        trainer.fit(model)

    assert log_metrics_mock.mock_calls[2:] == [
        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=0),
        mock.call({"batch_time/inter_step (ms)": mock.ANY}, step=1),
        mock.call({"batch_time/intra_step (ms)": mock.ANY}, step=1),
    ]
Code Example #3
def get_trainer(args):
    pl.seed_everything(args.seed)

    # loggers
    root_dir = Path(args.default_root_dir).expanduser().resolve()
    root_dir.mkdir(parents=True, exist_ok=True)
    tb_save_dir = root_dir / "tb"
    tb_logger = TensorBoardLogger(save_dir=tb_save_dir)
    loggers = [tb_logger]
    logger.info(f"Run tensorboard --logdir {tb_save_dir}")

    # callbacks
    ckpt_cb = ModelCheckpoint(verbose=True)
    lr_cb = LearningRateMonitor(logging_interval="step")
    pb_cb = ProgressBar(refresh_rate=args.progress_bar_refresh_rate)
    callbacks = [lr_cb, pb_cb]

    callbacks.append(ckpt_cb)

    gpu_cb = GPUStatsMonitor()
    callbacks.append(gpu_cb)

    plugins = []
    trainer = pl.Trainer.from_argparse_args(args,
                                            logger=loggers,
                                            callbacks=callbacks,
                                            plugins=plugins)

    return trainer
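
The helper above expects an argparse namespace that carries both its own options (seed, default_root_dir, progress_bar_refresh_rate) and the standard Lightning Trainer flags. A minimal, hypothetical wiring (argument defaults here are illustrative, not taken from the original project) could look like this:

from argparse import ArgumentParser
import pytorch_lightning as pl

parser = ArgumentParser()
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--progress_bar_refresh_rate", type=int, default=20)
parser = pl.Trainer.add_argparse_args(parser)  # contributes default_root_dir, gpus, max_epochs, ...
args = parser.parse_args()

trainer = get_trainer(args)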
Code Example #4
def test_gpu_stats_monitor(tmpdir):
    """
    Test GPU stats are logged using a logger.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()
    logger = CSVLogger(tmpdir)

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger
    )

    results = trainer.fit(model)
    assert results

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    with open(path_csv, 'r') as fp:
        lines = fp.readlines()

    header = lines[0].split()

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory'
    ]

    for f in fields:
        assert any([f in h for h in header])
Code Example #5
def test_gpu_stats_monitor(tmpdir):
    """Test GPU stats are logged using a logger."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=",", names=True, deletechars="", replace_space=" ")

    batch_time_data = met_data["batch_time/intra_step (ms)"]
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = ["utilization.gpu", "memory.used", "memory.free", "utilization.memory"]

    for f in fields:
        assert any(f in h for h in met_data.dtype.names)
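
Outside the test, the metrics file written by CSVLogger can be inspected directly. A small sketch, assuming pandas is available and path_csv points at the same file as above:

import pandas as pd

df = pd.read_csv(path_csv)
# Columns include the GPU fields asserted above plus the intra-step timings.
print(df.filter(like="utilization").describe())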
Code Example #6
def test_gpu_stats_monitor_cpu_machine(tmpdir):
    """Test GPUStatsMonitor on CPU machine."""
    with pytest.raises(
            MisconfigurationException,
            match="NVIDIA driver is not installed"), pytest.deprecated_call(
                match="GPUStatsMonitor` callback was deprecated in v1.5"):
        GPUStatsMonitor()
Code Example #7
def test_gpu_stats_monitor_cpu_machine(tmpdir):
    """
    Test GPUStatsMonitor on CPU machine.
    """
    with pytest.raises(MisconfigurationException,
                       match='NVIDIA driver is not installed'):
        GPUStatsMonitor()
Code Example #8
def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    parser = LitClassifier.add_model_specific_args(parser)
    args = parser.parse_args()

    # ------------
    # data
    # ------------
    dataset = MNIST(_DATASETS_PATH, train=True, download=True, transform=transforms.ToTensor())
    mnist_test = MNIST(_DATASETS_PATH, train=False, download=True, transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(dataset, [55000, 5000])

    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------
    # model
    # ------------
    model = LitClassifier(Backbone(hidden_dim=args.hidden_dim), args.learning_rate)

    # ------------
    # training
    # ------------
    experiment_dir = Path.cwd()
    checkpoint_callback = ModelCheckpoint(dirpath=experiment_dir)
    trainer = pl.Trainer.from_argparse_args(
        args,
        # gradient_clip_val, benchmark and callbacks are Trainer arguments,
        # not fit()/test() arguments, so they belong here.
        gradient_clip_val=0.5,
        benchmark=True,
        callbacks=[GPUStatsMonitor(), checkpoint_callback],
    )
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # testing
    # ------------
    result = trainer.test(test_dataloaders=test_loader)
    print(result)
Code Example #9
def test_gpu_stats_monitor_parse_gpu_stats():
    logs = GPUStatsMonitor._parse_gpu_stats([1, 2], [[3, 4, 5], [6, 7]], [("gpu", "a"), ("memory", "b")])
    expected = {
        "device_id: 1/gpu (a)": 3,
        "device_id: 1/memory (b)": 4,
        "device_id: 2/gpu (a)": 6,
        "device_id: 2/memory (b)": 7,
    }
    assert logs == expected
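
For reference, the mapping this test exercises can be reproduced with a small standalone helper. This is only an illustrative sketch of the expected behaviour, not the library's internal implementation:

def parse_gpu_stats(gpu_ids, stats, keys):
    # Flatten per-GPU readings into {"device_id: <id>/<name> (<unit>)": value};
    # zip() drops extra readings (the 5 above) that have no matching key.
    logs = {}
    for gpu_id, gpu_stat in zip(gpu_ids, stats):
        for (name, unit), value in zip(keys, gpu_stat):
            logs[f"device_id: {gpu_id}/{name} ({unit})"] = value
    return logs

assert parse_gpu_stats([1, 2], [[3, 4, 5], [6, 7]], [("gpu", "a"), ("memory", "b")]) == {
    "device_id: 1/gpu (a)": 3,
    "device_id: 1/memory (b)": 4,
    "device_id: 2/gpu (a)": 6,
    "device_id: 2/memory (b)": 7,
}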
Code Example #10
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """Test GPUStatsMonitor raises a warning when not training on GPU device."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_steps=1, gpus=None)

    with pytest.raises(MisconfigurationException, match="not running on GPU"):
        trainer.fit(model)
Code Example #11
def test_gpu_stats_monitor_no_logger(tmpdir):
    """Test GPUStatsMonitor with no logger in Trainer."""
    model = BoringModel()
    with pytest.deprecated_call(match="GPUStatsMonitor` callback was deprecated in v1.5"):
        gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir, callbacks=[gpu_stats], max_epochs=1, gpus=1, logger=False)

    with pytest.raises(MisconfigurationException, match="Trainer that has no logger."):
        trainer.fit(model)
Code Example #12
def test_gpu_stats_monitor_parse_gpu_stats():
    logs = GPUStatsMonitor._parse_gpu_stats('1,2', [[3, 4, 5], [6, 7]],
                                            [('gpu', 'a'), ('memory', 'b')])
    expected = {
        'gpu_id: 1/gpu (a)': 3,
        'gpu_id: 1/memory (b)': 4,
        'gpu_id: 2/gpu (a)': 6,
        'gpu_id: 2/memory (b)': 7
    }
    assert logs == expected
Code Example #13
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """
    Test GPUStatsMonitor raises a warning when not training on GPU device.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir,
                      callbacks=[gpu_stats],
                      max_steps=1,
                      gpus=None)

    with pytest.raises(MisconfigurationException, match='not running on GPU'):
        trainer.fit(model)
Code Example #14
def test_gpu_stats_monitor_no_logger(tmpdir):
    """
    Test GPUStatsMonitor with no logger in Trainer.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir,
                      callbacks=[gpu_stats],
                      max_epochs=1,
                      gpus=1,
                      logger=False)

    with pytest.raises(MisconfigurationException,
                       match='Trainer that has no logger.'):
        trainer.fit(model)
Code Example #15
def test_gpu_stats_monitor_no_gpu_warning(tmpdir):
    """
    Test GPUStatsMonitor raises a warning when not training on GPU device.
    """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor()

    trainer = Trainer(default_root_dir=tmpdir,
                      callbacks=[gpu_stats],
                      max_steps=1,
                      gpus=None)

    with pytest.warns(
            RuntimeWarning,
            match='not running on GPU. Logged utilization will be independent'
    ):
        trainer.fit(model)
Code Example #16
def test_gpu_stats_monitor(tmpdir):
    """
    Test GPU stats are logged using a logger.
    """
    model = BoringModel()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(default_root_dir=tmpdir,
                      max_epochs=2,
                      limit_train_batches=7,
                      log_every_n_steps=log_every_n_steps,
                      gpus=1,
                      callbacks=[gpu_stats],
                      logger=logger)

    trainer.fit(model)
    assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv,
                             delimiter=',',
                             names=True,
                             deletechars='',
                             replace_space=' ')

    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory',
    ]

    for f in fields:
        assert any([f in h for h in met_data.dtype.names])
Code Example #17
File: main.py  Project: robahall/sentProject
def main(hparams):
    exp_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    tb_logger = TensorBoardLogger(save_dir='logs/', version=f'v_{exp_time}')

    gpu_stats = GPUStatsMonitor()

    data = IMDBDataModule(hparams=hparams)
    data.prepare_data()
    data.setup()

    model = RNN(input_dim=data.dims, hparams=hparams)

    trainer = Trainer(logger=tb_logger,
                      gpus=hparams.gpus,
                      max_epochs=hparams.max_epochs,
                      callbacks=[gpu_stats])

    trainer.fit(model, data)

    trainer.test(datamodule=data)

    torch.save(trainer.model.state_dict(), f'models/model_{exp_time}.pth')
Code Example #18
    def objective(trial):

        if hparams.version is None:
            hparams.version = str(uuid1())

        # main LightningModule
        pretrain_system = PreTrainSystem(
            learning_rate=trial.suggest_loguniform("learning_rate", 1e-5, 1e-2),
            beta_1=hparams.beta_1,
            beta_2=hparams.beta_2,
            weight_decay=trial.suggest_uniform("weight_decay", 1e-5, 1e-2),
            optimizer=hparams.optimizer,
            batch_size=hparams.batch_size,
            multiplier=hparams.multiplier,
            scheduler_patience=hparams.scheduler_patience,
        )

        pretrain_checkpoints = ModelCheckpoint(
            dirpath=MODEL_CHECKPOINTS_DIR,
            monitor="Val/loss_epoch",
            verbose=True,
            mode="min",
            save_top_k=hparams.save_top_k,
        )

        pretrain_early_stopping = EarlyStopping(
            monitor="Val/loss_epoch",
            min_delta=0.00,
            patience=hparams.patience,
            verbose=False,
            mode="min",
        )

        pretrain_gpu_stats_monitor = GPUStatsMonitor(temperature=True)

        log_recoloring_to_tensorboard = LogPairRecoloringToTensorboard()

        optuna_pruning = PyTorchLightningPruningCallback(monitor="Val/loss_epoch", trial=trial)

        logger = TensorBoardLogger(
            S3_LIGHTNING_LOGS_DIR,
            name=hparams.name,
            version=hparams.version,
            log_graph=True,
            default_hp_metric=False,
        )

        trainer = Trainer.from_argparse_args(
            hparams,
            logger=logger,
            checkpoint_callback=pretrain_checkpoints,
            callbacks=[
                pretrain_early_stopping,
                log_recoloring_to_tensorboard,
                pretrain_gpu_stats_monitor,
                optuna_pruning,
            ],
            profiler="simple",
        )

        datamodule = PreTrainDataModule(
            batch_size=pretrain_system.hparams.batch_size,
            multiplier=pretrain_system.hparams.multiplier,
            shuffle=hparams.shuffle,
            num_workers=hparams.num_workers,
            size=hparams.size,
            pin_memory=hparams.pin_memory,
            train_batch_from_same_image=hparams.train_batch_from_same_image,
            val_batch_from_same_image=hparams.val_batch_from_same_image,
            test_batch_from_same_image=hparams.test_batch_from_same_image,
        )

        # trainer.tune(pretrain_system, datamodule=datamodule)

        trainer.fit(pretrain_system, datamodule=datamodule)

        # get best checkpoint
        best_model_path = pretrain_checkpoints.best_model_path

        pretrain_system = PreTrainSystem.load_from_checkpoint(best_model_path)

        test_result = trainer.test(pretrain_system, datamodule=datamodule)

        pretrain_system.hparams.test_metric_name = test_result[0]["Test/loss_epoch"]
        logger.log_hyperparams(pretrain_system.hparams)
        logger.finalize(status="success")

        # upload best model to S3
        S3_best_model_path = os.path.join(
            S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
            hparams.name,
            ".".join([hparams.version, best_model_path.split(".")[-1]]),
        )
        upload_to_s3(best_model_path, S3_best_model_path)

        return test_result[0]["Test/loss_epoch"]
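
The objective above follows Optuna's study interface; a hypothetical driver (study settings assumed here, not taken from the original project) would be along these lines:

import optuna

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)
print("best value:", study.best_value)
print("best params:", study.best_params)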
Code Example #19
def main(hparams):
    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # main LightningModule
    if hparams.checkpoint_path is not None:
        pretrain_system = PreTrainSystem.load_from_checkpoint(
            hparams.checkpoint_path)
    else:
        pretrain_system = PreTrainSystem(**vars(hparams))

    pretrain_checkpoints = ModelCheckpoint(
        dirpath=os.path.join(MODEL_CHECKPOINTS_DIR, hparams.version),
        monitor="Val/loss",
        verbose=True,
        mode="min",
        save_top_k=hparams.save_top_k,
    )

    pretrain_early_stopping = EarlyStopping(
        monitor="Val/loss",
        min_delta=0.00,
        patience=hparams.patience,
        verbose=False,
        mode="min",
    )

    gpu_stats = GPUStatsMonitor(temperature=True)

    log_recolored_to_tensorboard = LogPairRecoloringToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(
        hp_metric="Test/loss")

    notify = Notify(test_metric_name="Test/loss")

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=pretrain_checkpoints,
        callbacks=[
            pretrain_early_stopping,
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
    )

    datamodule = PreTrainDataModule(**vars(hparams))

    trainer.fit(pretrain_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(pretrain_system, datamodule=datamodule)

    if hparams.upload_model_to_s3:
        # upload best model to S3
        best_model_path = pretrain_checkpoints.best_model_path
        S3_best_model_path = os.path.join(
            S3_MODEL_CHECKPOINTS_RELATIVE_DIR,
            hparams.name,
            ".".join([hparams.version,
                      best_model_path.split(".")[-1]]),
        )
        upload_to_s3(best_model_path, S3_best_model_path)
Code Example #20
def train(cfg: DictConfig) -> None:
    """
    Train a model for image classification

    Args:
        cfg: hydra configuration
    """
    # Load pre-existing config file
    if os.path.exists("config.yaml"):
        logging.info("Loading pre-existing config file")
        cfg = OmegaConf.load("config.yaml")
    else:
        # copy initial config to a separate file to avoid overwriting it
        # when hydra resumes training and initializes again
        shutil.copy2(".hydra/config.yaml", "config.yaml")

    # Check for checkpoint
    ckpt_path = os.path.join(os.getcwd(), cfg.checkpoint.params.dirpath, "last.ckpt")
    if os.path.exists(ckpt_path):
        logging.info(f"Loading existing checkpoint @ {ckpt_path}")
    else:
        logging.info("No existing ckpt found. Training from scratch")
        ckpt_path = None

    # Display configuration
    logger.info(OmegaConf.to_yaml(cfg))
    # Seed everything
    seed_everything(cfg.training.seed)
    # Load datamodule
    data = DataModule(cfg)
    # Callbacks
    callbacks = [
        CustomModelCheckpoint(**cfg.checkpoint.params),
        LearningRateMonitor(),
        LitProgressBar(),
    ]
    if cfg.trainer.params.gpus:
        callbacks.append(GPUStatsMonitor())
    # Logger
    trainer_logger = load_obj(cfg.logger.class_name)(**cfg.logger.params)
    # Load model
    model = load_obj(cfg.model.class_name)(cfg)

    # Save model id
    with open("id", "w") as f:
        f.write(cfg.id)

    # Instantiate trainer
    trainer = Trainer(
        resume_from_checkpoint=ckpt_path,
        callbacks=callbacks,
        logger=trainer_logger,
        **cfg.trainer.params,
    )

    # Display model architecture alongside parameters and data
    logger.info(model)
    logger.info(data)
    logger.info(f"random seed: {cfg.training.seed}")
    # Fit trainer
    trainer.fit(model, datamodule=data)
Code Example #21
def main(
    experiment_type: str,
    affordance_type: AffordanceType,
    ngrams: int,
    return_words: int,
    definition_length: int,
    model_type: str,
    model_path: Optional[str],
    save_path: Optional[str],
    gpus: int,
    batch_size: int,
    learning_rate: float,
    fix_valid_set: bool,
    evaluation_only: bool,
    notebook: bool,
):
    # Device
    if gpus > 0 and not torch.cuda.is_available():
        gpus = 0
        warn('GPU is not available on this machine. Using CPU instead.')
    device = torch.device('cuda') if gpus > 0 else torch.device('cpu')

    # Training data
    train_set = PiqaDataset("train", fix=fix_valid_set)
    valid_set = PiqaDataset("valid", fix=fix_valid_set)
    test_set = PiqaDataset("test", fix=fix_valid_set)

    # Model & Tokenizer
    try:
        model = PIQAModel.get(model_type)(learning_rate=learning_rate,
                                          model_type=model_type)
        tokenizer = PIQATokenizer.get(model_type)(experiment_type,
                                                  ngrams,
                                                  return_words,
                                                  definition_length,
                                                  affordance_type,
                                                  model_type,
                                                  tqdm_arg=notebook)
    except TypeError:
        raise RuntimeError(f'{model_type} has not been implemented.')

    # Load finetuned weights
    if model_path is not None:
        model.load_state_dict(torch.load(model_path))

    # Just evaluation
    if evaluation_only:
        model.eval()
    else:
        model.train()

    # Pre-tokenize data sets
    collate_fn = lambda x: tokenizer.collate_fn(
        x, pad_token=tokenizer.pad_token_id)
    if tokenizer._type == 'affordance':
        all_sets_path = Path(
            f'./data/{tokenizer._type}_{ngrams}_{return_words}_{definition_length}_{affordance_type}.pkl'
        )
    elif tokenizer._type == 'definition':
        all_sets_path = Path(
            f'./data/{tokenizer._type}_{ngrams}_{return_words}_{definition_length}.pkl'
        )
    else:
        all_sets_path = Path(f'./data/{tokenizer._type}.pkl')

    if all_sets_path.exists():
        with open(all_sets_path, 'rb') as f:
            all_sets = pickle.load(f)
        train_set = all_sets['train']
        test_set = all_sets['test']
        valid_set = all_sets['valid']
    else:
        train_set = tokenizer.pretokenize_data_set(train_set)
        valid_set = tokenizer.pretokenize_data_set(valid_set)
        test_set = tokenizer.pretokenize_data_set(test_set)
        with open(all_sets_path, 'wb') as f:
            pickle.dump(
                {
                    'train': train_set,
                    'test': test_set,
                    'valid': valid_set
                }, f)

    valid_set = tokenizer.tokenize_data_set(valid_set)
    test_set = tokenizer.tokenize_data_set(test_set)
    train_set = tokenizer.tokenize_data_set(train_set)
    trainloader = DataLoader(train_set,
                             shuffle=True,
                             collate_fn=collate_fn,
                             batch_size=batch_size)
    validloader = DataLoader(valid_set,
                             shuffle=False,
                             collate_fn=collate_fn,
                             batch_size=batch_size)
    testloader = DataLoader(test_set, shuffle=False, collate_fn=collate_fn)

    # Load callbacks
    callbacks = []
    callbacks.append(
        EarlyStopping('val_accuracy',
                      min_delta=0.001,
                      patience=5,
                      mode='max',
                      verbose=True))
    if save_path is not None:
        callbacks.append(
            ModelCheckpoint(save_path, filename='{epoch}-{val_loss:.2f}'))
    if gpus > 0:
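        # Positional flags correspond to GPUStatsMonitor(memory_utilization, gpu_utilization,
        # intra_step_time, inter_step_time, fan_speed, temperature): only memory_utilization is kept on here.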
        callbacks.append(
            GPUStatsMonitor(True, False, False, False, False, False))

    # Training
    trainer = pl.Trainer(gpus=gpus,
                         auto_scale_batch_size=False,
                         callbacks=callbacks)
    trainer.fit(model, trainloader, validloader)

    print("Finished Training")
Code Example #22
def main(hparams):
    if hparams.checkpoints_location == "s3":
        checkpoints_dir = os.path.join(S3_MODEL_CHECKPOINTS_DIR, hparams.name,
                                       hparams.version)
    else:
        checkpoints_dir = os.path.join(MODEL_CHECKPOINTS_DIR, hparams.name,
                                       hparams.version)

    if hparams.logging_location == "s3":
        logging_dir = os.path.join(S3_LIGHTNING_LOGS_DIR, hparams.name)
    else:
        logging_dir = os.path.join(LIGHTNING_LOGS_DIR, hparams.name)

    # load generator pretrained with PreTrainSystem
    generator = PreTrainSystem.load_from_checkpoint(
        PRETRAINED_MODEL_CHECKPOINT_PATH).generator

    # main LightningModule
    if hparams.checkpoint_path is not None:
        adversarial_system = AdversarialMSESystem.load_from_checkpoint(
            hparams.checkpoint_path)
    else:
        adversarial_system = AdversarialMSESystem(**vars(hparams),
                                                  generator=generator)

    adversarial_checkpoints = ModelCheckpoint(
        dirpath=checkpoints_dir,
        monitor="Val/adv_loss",
        verbose=True,
        mode="min",
        save_top_k=-1,
    )

    gpu_stats = GPUStatsMonitor(temperature=True)

    log_recolored_to_tensorboard = LogAdversarialMSEToTensorboard()
    log_hyperparams_to_tensorboard = LogHyperparamsToTensorboard(
        hp_metric=None)

    notify = Notify()

    logger = TensorBoardLogger(
        logging_dir,
        name=hparams.name,
        version=hparams.version,
        log_graph=True,
        default_hp_metric=False,
    )

    # trainer
    trainer = Trainer.from_argparse_args(
        hparams,
        resume_from_checkpoint=hparams.checkpoint_path,
        logger=logger,
        checkpoint_callback=adversarial_checkpoints,
        callbacks=[
            log_recolored_to_tensorboard,
            log_hyperparams_to_tensorboard,
            gpu_stats,
            notify,
        ],
        profiler="simple",
        benchmark=True,
        enable_pl_optimizer=True,
    )

    datamodule = GANDataModule(**vars(hparams))

    trainer.fit(adversarial_system, datamodule=datamodule)

    # lightning automatically uses the best model checkpoint for testing
    trainer.test(adversarial_system, datamodule=datamodule)
Code Example #23
    def build(self, **kwargs):
        """
        Responsible for creating the class attributes.
        """
        # Track that build() has been called
        self.build_called = True

        # Retrieving paths
        self.data_dirpath = self.config['dirpaths']['data_dirpath']
        self.log_dirpath = self.config['dirpaths']['log_dirpath']
        self.cwd_dirpath = self.config['dirpaths']['cwd_dirpath']

        # Retrieving parameters
        self.hparams = self.config['params']['hparams']
        self.lightning_params = self.config['params']['lightning_params']
        self.early_stop_callback_params = self.config['params']['early_stop_callback_params']
        self.prepare_data_params = self.config['params']['prepare_data_params']
        #-
        self.test_size_from_dev = self.prepare_data_params['test_size_from_dev']
        #-
        self.model_name = self.hparams['model_name']
        self.num_gen_sentences = self.hparams['num_gen_sentences']
        self.no_repeat_ngram_size = self.hparams['no_repeat_ngram_size']
        self.train_batch_size = self.hparams['train_batch_size']
        self.eval_batch_size = self.hparams['eval_batch_size']
        self.source_max_length = self.hparams['source_max_length']
        self.target_max_length = self.hparams['target_max_length']
        self.temperature = self.hparams['temperature']
        self.top_p = self.hparams['top_p']
        self.learning_rate = self.hparams['learning_rate']
        self.eps = self.hparams['eps']
        self.seed = self.hparams['seed']
        #-
        self.num_gpus = self.lightning_params['num_gpus'] if torch.cuda.is_available() else 0
        self.profiler = self.lightning_params['profiler']
        self.max_epochs = self.lightning_params['max_epochs']
        self.accumulate_grad_batches = self.lightning_params['accumulate_grad_batches']
        self.check_val_every_n_epoch = self.lightning_params['check_val_every_n_epoch']
        self.progress_bar_refresh_rate = self.lightning_params['progress_bar_refresh_rate']
        self.gradient_clip_val = self.lightning_params['gradient_clip_val']
        self.fast_dev_run = self.lightning_params['fast_dev_run']
        #-
        self.monitor = self.early_stop_callback_params['monitor']
        self.min_delta = self.early_stop_callback_params['min_delta']
        self.patience = self.early_stop_callback_params['patience']
        self.verbose = self.early_stop_callback_params['verbose']
        self.mode = self.early_stop_callback_params['mode']

        # Creating additional attributes
        self.tokenizer = T5Tokenizer.from_pretrained(self.config['params']['hparams']['model_name'])
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.MODEL = None

        # Trainer
        if self.fast_dev_run:
            self.TRAINER = pl.Trainer(
                gpus=self.num_gpus,
                checkpoint_callback=False,  # Disable checkpoint saving.
                fast_dev_run=True
            )
        else:
            checkpoint_callback = ModelCheckpoint(
                dirpath=self.data_dirpath, save_top_k=-1
            )

            early_stop_callback = EarlyStopping(
                monitor=self.early_stop_callback_params['monitor'],
                min_delta=self.early_stop_callback_params['min_delta'],
                patience=self.early_stop_callback_params['patience'],
                verbose=self.early_stop_callback_params['verbose'],
                mode=self.early_stop_callback_params['mode']
            )

            callbacks = [early_stop_callback, checkpoint_callback]
            if self.num_gpus > 0:
                gpu_stats = GPUStatsMonitor()
                callbacks.append(gpu_stats)
                tb_logger = pl.loggers.TensorBoardLogger(f"{self.log_dirpath}")
            else:
                tb_logger = None

            self.TRAINER = pl.Trainer(
                gpus=self.lightning_params['num_gpus'],
                profiler=self.lightning_params['profiler'],
                max_epochs=self.lightning_params['max_epochs'],
                accumulate_grad_batches=self.lightning_params['accumulate_grad_batches'],
                check_val_every_n_epoch=self.lightning_params['check_val_every_n_epoch'],
                progress_bar_refresh_rate=self.lightning_params['progress_bar_refresh_rate'],
                callbacks=callbacks,
                resume_from_checkpoint=None,
                logger=tb_logger
            )
Code Example #24
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_integers(
        device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
    expected = ["2", "4"]
    assert gpu_ids == expected
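
The device_count_mock / is_available_mock parameters imply patch decorators that are not included in this snippet (the related tests in Code Example #25 and #27 presumably carry analogous decorators). A plausible reconstruction, with a CUDA_VISIBLE_DEVICES value assumed only so that indices 1 and 2 map to "2" and "4":

@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "3,2,4"})
@mock.patch("torch.cuda.is_available", return_value=True)
@mock.patch("torch.cuda.device_count", return_value=3)
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_integers(device_count_mock, is_available_mock):
    ...  # body as in the example above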
Code Example #25
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_unset(
        device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 0])
    expected = ["1", "0"]
    assert gpu_ids == expected
Code Example #26
def create_lightning_trainer(container: LightningContainer,
                             resume_from_checkpoint: Optional[Path] = None,
                             num_nodes: int = 1,
                             multiple_trainloader_mode: str = "max_size_cycle") -> \
        Tuple[Trainer, StoringLogger]:
    """
    Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
    and loggers. That includes a diagnostic logger for use in unit tests, that is also returned as the second
    return value.
    :param container: The container with model and data.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
    :param num_nodes: The number of nodes to use in distributed training.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
    num_gpus = container.num_gpus_per_node()
    effective_num_gpus = num_gpus * num_nodes
    strategy = None
    if effective_num_gpus == 0:
        accelerator = "cpu"
        devices = 1
        message = "CPU"
    else:
        accelerator = "gpu"
        devices = num_gpus
        message = f"{devices} GPU"
        if effective_num_gpus > 1:
            # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
            # GPU memory).
            # Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin
            # prints out lengthy warnings about the performance impact of find_unused_parameters.
            strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
            message += "s per node with DDP"
    logging.info(f"Using {message}")
    tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
    loggers = [tensorboard_logger, AzureMLLogger(False)]
    storing_logger = StoringLogger()
    loggers.append(storing_logger)
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # Note that switching to deterministic models can have large performance downside.
    if container.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True

    # The last checkpoint is considered the "best" checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    # Note that "last" is somehow a misnomer, it should rather be "latest". There is a "last" checkpoint written in
    # every epoch. We could use that for recovery too, but it could happen that the job gets preempted right during
    # writing that file, and we would end up with an invalid file.
    last_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                               save_last=True,
                                               save_top_k=0)
    recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                                   filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
                                                   every_n_val_epochs=container.autosave_every_n_val_epochs,
                                                   save_last=False)
    callbacks: List[Callback] = [
        last_checkpoint_callback,
        recovery_checkpoint_callback,
    ]
    if container.monitor_loading:
        # TODO antonsc: Remove after fixing the callback.
        raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
        # callbacks.append(BatchTimeCallback())
    if num_gpus > 0 and container.monitor_gpu:
        logging.info("Adding monitoring for GPU utilization")
        callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))
    # Add the additional callbacks that were specified in get_trainer_arguments for LightningContainers
    additional_args = container.get_trainer_arguments()
    # Callbacks can be specified via the "callbacks" argument (the legacy behaviour) or the new get_callbacks method
    if "callbacks" in additional_args:
        more_callbacks = additional_args.pop("callbacks")
        if isinstance(more_callbacks, list):
            callbacks.extend(more_callbacks)  # type: ignore
        else:
            callbacks.append(more_callbacks)  # type: ignore
    callbacks.extend(container.get_callbacks())
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
    if progress_bar_refresh_rate is None:
        progress_bar_refresh_rate = 50
        logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
                     f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
    if is_azureml_run:
        callbacks.append(AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
                                            write_to_logging_info=True,
                                            print_timestamp=False))
    else:
        callbacks.append(TQDMProgressBar(refresh_rate=progress_bar_refresh_rate))
    # Read out additional model-specific args here.
    # We probably want to keep essential ones like num_gpus and logging.
    trainer = Trainer(default_root_dir=str(container.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      strategy=strategy,
                      max_epochs=container.num_epochs,
                      # Both these arguments can be integers or floats. If integers, it is the number of batches.
                      # If float, it's the fraction of batches. We default to 1.0 (processing all batches).
                      limit_train_batches=container.pl_limit_train_batches or 1.0,
                      limit_val_batches=container.pl_limit_val_batches or 1.0,
                      num_sanity_val_steps=container.pl_num_sanity_val_steps,
                      check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
                      callbacks=callbacks,
                      logger=loggers,
                      num_nodes=num_nodes,
                      devices=devices,
                      precision=precision,
                      sync_batchnorm=True,
                      detect_anomaly=container.detect_anomaly,
                      profiler=container.pl_profiler,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
                      multiple_trainloader_mode=multiple_trainloader_mode,
                      **additional_args)
    return trainer, storing_logger
Code Example #27
def test_gpu_stats_monitor_get_gpu_ids_cuda_visible_devices_uuids(
        device_count_mock, is_available_mock):
    gpu_ids = GPUStatsMonitor._get_gpu_ids([1, 2])
    expected = ["GPU-56d78e9f", "GPU-02a46c8e"]
    assert gpu_ids == expected
Code Example #28
def training_loop(
        run_dir='.',  # Output directory.
        training_set_kwargs={},  # Options for training set.
        data_loader_kwargs={},  # Options for torch.utils.data.DataLoader.
        G_kwargs={},  # Options for generator network.
        D_kwargs={},  # Options for discriminator network.
        G_opt_kwargs={},  # Options for generator optimizer.
        D_opt_kwargs={},  # Options for discriminator optimizer.
        augment_kwargs=None,  # Options for augmentation pipeline. None = disable.
        loss_kwargs={},  # Options for loss function.
        metrics=[],  # Metrics to evaluate during training.
        random_seed=0,  # Global random seed.
        num_gpus=1,  # Number of GPUs participating in the training.
        # rank                  = 0,        # Rank of the current process in [0, num_gpus).
        batch_size=4,  # Total batch size for one training iteration. Can be larger than batch_gpu * num_gpus.
        batch_gpu=4,  # Number of samples processed at a time by one GPU.
        ema_kimg=10,  # Half-life of the exponential moving average (EMA) of generator weights.
        ema_rampup=None,  # EMA ramp-up coefficient.
        G_reg_interval=4,  # How often to perform regularization for G? None = disable lazy regularization.
        D_reg_interval=16,  # How often to perform regularization for D? None = disable lazy regularization.
        augment_p=0,  # Initial value of augmentation probability.
        ada_target=None,  # ADA target value. None = fixed p.
        ada_interval=4,  # How often to perform ADA adjustment?
        ada_kimg=500,  # ADA adjustment speed, measured in how many kimg it takes for p to increase/decrease by one unit.
        total_kimg=25000,  # Total length of the training, measured in thousands of real images.
        kimg_per_tick=4,  # Progress snapshot interval.
        image_snapshot_ticks=50,  # How often to save image snapshots? None = disable.
        network_snapshot_ticks=50,  # How often to save network snapshots? None = disable.
        resume_pkl=None,  # Network pickle to resume training from.
        cudnn_benchmark=True,  # Enable torch.backends.cudnn.benchmark?
        allow_tf32=False,  # Enable torch.backends.cuda.matmul.allow_tf32 and torch.backends.cudnn.allow_tf32?
        abort_fn=None,  # Callback function for determining whether to abort training. Must return consistent results across ranks.
        progress_fn=None,  # Callback function for updating training progress. Called for all ranks.
):
    # Initialize.
    start_time = time.time()
    #device = torch.device('cuda', rank)
    #np.random.seed(random_seed * num_gpus + rank)
    #torch.manual_seed(random_seed * num_gpus + rank)
    #torch.backends.cudnn.benchmark = cudnn_benchmark    # Improves training speed.
    seed_everything(random_seed)
    torch.backends.cuda.matmul.allow_tf32 = allow_tf32  # Allow PyTorch to internally use tf32 for matmul
    torch.backends.cudnn.allow_tf32 = allow_tf32  # Allow PyTorch to internally use tf32 for convolutions
    conv2d_gradfix.enabled = True  # Improves training speed.
    grid_sample_gradfix.enabled = True  # Avoids errors with the augmentation pipe.

    # Load training set.
    # if rank == 0:
    #     print('Loading training set...')
    # training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs) # subclass of training.dataset.Dataset
    # training_set_sampler = misc.InfiniteSampler(dataset=training_set, rank=rank, num_replicas=num_gpus, seed=random_seed)
    # training_set_iterator = iter(torch.utils.data.DataLoader(dataset=training_set, sampler=training_set_sampler, batch_size=batch_size//num_gpus, **data_loader_kwargs))
    # if rank == 0:
    #     print()
    #     print('Num images: ', len(training_set))
    #     print('Image shape:', training_set.image_shape)
    #     print('Label shape:', training_set.label_shape)
    #     print()

    # Construct networks.
    # if rank == 0:
    #     print('Constructing networks...')
    training_set_pl = StyleGANDataModule(batch_gpu, training_set_kwargs,
                                         data_loader_kwargs)
    training_set = training_set_pl.training_set

    common_kwargs = dict(c_dim=training_set.label_dim,
                         img_resolution=training_set.resolution,
                         img_channels=training_set.num_channels)
    G = dnnlib.util.construct_class_by_name(
        **G_kwargs, **common_kwargs)  # subclass of torch.nn.Module
    D = dnnlib.util.construct_class_by_name(
        **D_kwargs, **common_kwargs)  # subclass of torch.nn.Module
    # # Resume from existing pickle.
    # if (resume_pkl is not None) and (rank == 0):
    #     print(f'Resuming from "{resume_pkl}"')
    #     with dnnlib.util.open_url(resume_pkl) as f:
    #         resume_data = legacy.load_network_pkl(f)
    #     for name, module in [('G', G), ('D', D), ('G_ema', G_ema)]:
    #         misc.copy_params_and_buffers(resume_data[name], module, require_all=False)

    # # Print network summary tables.
    # if rank == 0:
    #     z = torch.empty([batch_gpu, G.z_dim], device=device)
    #     c = torch.empty([batch_gpu, G.c_dim], device=device)
    #     img = misc.print_module_summary(G, [z, c])
    #     misc.print_module_summary(D, [img, c])

    # Setup augmentation.
    # if rank == 0:
    #     print('Setting up augmentation...')
    augment_pipe = None
    ada_stats = None
    if (augment_kwargs is not None) and (augment_p > 0
                                         or ada_target is not None):
        augment_pipe = dnnlib.util.construct_class_by_name(
            **augment_kwargs)  # subclass of torch.nn.Module
        augment_pipe.p.copy_(torch.as_tensor(augment_p))
        # if ada_target is not None:
        #     ada_stats = training_stats.Collector(regex='Loss/signs/real')

    fid50k = FID(max_real=None, num_gen=50000)
    ema_kimg /= num_gpus
    ada_kimg /= num_gpus
    kimg_per_tick /= num_gpus

    gpu_stats = GPUStatsMonitor(intra_step_time=True)

    net = StyleGAN2(G=G,
                    D=D,
                    G_opt_kwargs=G_opt_kwargs,
                    D_opt_kwargs=D_opt_kwargs,
                    augment_pipe=augment_pipe,
                    datamodule=training_set_pl,
                    G_reg_interval=G_reg_interval,
                    D_reg_interval=D_reg_interval,
                    ema_kimg=ema_kimg,
                    ema_rampup=ema_rampup,
                    ada_target=ada_target,
                    ada_interval=ada_interval,
                    ada_kimg=ada_kimg,
                    metrics=[fid50k],
                    kimg_per_tick=kimg_per_tick,
                    random_seed=random_seed,
                    **loss_kwargs)

    trainer = pl.Trainer(gpus=num_gpus,
                         accelerator='ddp',
                         weights_summary='full',
                         fast_dev_run=10,
                         benchmark=cudnn_benchmark,
                         max_steps=total_kimg // (batch_size) * 1000,
                         plugins=[
                             DDPPlugin(broadcast_buffers=False,
                                       find_unused_parameters=True)
                         ],
                         callbacks=[gpu_stats],
                         accumulate_grad_batches=num_gpus)
    trainer.fit(net, datamodule=training_set_pl)