Example #1
def test_device_stats_monitor_warning_when_psutil_not_available(monkeypatch):
    """Test that warning is raised when psutil is not available."""
    import pytorch_lightning.callbacks.device_stats_monitor as imports

    monkeypatch.setattr(imports, "_PSUTIL_AVAILABLE", False)
    monitor = DeviceStatsMonitor()
    trainer = Trainer()
    assert trainer.strategy.root_device == torch.device("cpu")
    # TODO: raise an exception from v1.9
    with pytest.warns(UserWarning, match="psutil` is not installed"):
        monitor.setup(trainer, Mock(), "fit")


def test_device_stats_monitor_tpu(tmpdir):
    """Test TPU stats are logged using a logger."""

    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self,
                        metrics: Dict[str, float],
                        step: Optional[int] = None) -> None:
            fields = ["avg. free memory (MB)", "avg. peak memory (MB)"]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=1,
        tpu_cores=8,
        log_every_n_steps=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)


def test_device_stats_gpu_from_nvidia(tmpdir):
    """Test GPU stats are logged using a logger with PyTorch < 1.8.0."""
    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self,
                        metrics: Dict[str, float],
                        step: Optional[int] = None) -> None:
            fields = [
                "utilization.gpu", "memory.used", "memory.free",
                "utilization.memory"
            ]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)


def test_device_stats_gpu_from_torch(tmpdir):
    """Test GPU stats are logged using a logger with PyTorch >= 1.8.0."""
    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    class DebugLogger(CSVLogger):
        @rank_zero_only
        def log_metrics(self,
                        metrics: Dict[str, float],
                        step: Optional[int] = None) -> None:
            fields = [
                "allocated_bytes.all.freed", "inactive_split.all.peak",
                "reserved_bytes.large_pool.peak"
            ]
            for f in fields:
                assert any(f in h for h in metrics.keys())

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=1,
        gpus=1,
        callbacks=[device_stats],
        logger=DebugLogger(tmpdir),
        enable_checkpointing=False,
        enable_progress_bar=False,
    )

    trainer.fit(model)
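
Note: the TPU and GPU tests above depend on specific hardware and PyTorch versions. In the Lightning test suite such tests are normally gated with the ``RunIf`` helper; the decorators below are only a minimal sketch of that gating, and the import path ``tests.helpers.runif`` plus the exact arguments are assumptions rather than part of the examples above.

# Hedged sketch: version/hardware gating for the three tests above (assumed RunIf helper).
from tests.helpers.runif import RunIf

@RunIf(tpu=True)
def test_device_stats_monitor_tpu(tmpdir):
    ...

@RunIf(min_gpus=1, max_torch="1.8")
def test_device_stats_gpu_from_nvidia(tmpdir):
    ...

@RunIf(min_gpus=1, min_torch="1.8")
def test_device_stats_gpu_from_torch(tmpdir):
    ...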
Example #5
def test_device_stats_cpu(cpu_stats_mock, tmpdir, cpu_stats):
    """Test CPU stats are logged when no accelerator is used."""
    model = BoringModel()
    CPU_METRIC_KEYS = (_CPU_VM_PERCENT, _CPU_SWAP_PERCENT, _CPU_PERCENT)

    class DebugLogger(CSVLogger):
        def log_metrics(self,
                        metrics: Dict[str, float],
                        step: Optional[int] = None) -> None:
            enabled = cpu_stats is not False
            for f in CPU_METRIC_KEYS:
                has_cpu_metrics = any(f in h for h in metrics)
                assert has_cpu_metrics if enabled else not has_cpu_metrics

    device_stats = DeviceStatsMonitor(cpu_stats=cpu_stats)
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=2,
        limit_val_batches=0,
        log_every_n_steps=1,
        callbacks=device_stats,
        logger=DebugLogger(tmpdir),
        enable_checkpointing=False,
        enable_progress_bar=False,
        accelerator="cpu",
    )
    trainer.fit(model)

    expected = 4 if cpu_stats is not False else 0  # (batch_start + batch_end) * train_batches
    assert cpu_stats_mock.call_count == expected
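
The ``cpu_stats`` parameter and the ``cpu_stats_mock`` argument in the signature above imply that this test is parametrized and that the psutil-backed stats helper is patched; those decorators are not shown in the example. A minimal sketch of that setup, assuming the helper lives at ``pytorch_lightning.accelerators.cpu.get_cpu_stats`` (an assumption; adjust to your version):

from unittest import mock

import pytest
from pytorch_lightning.accelerators.cpu import get_cpu_stats

# Run once per cpu_stats value and wrap the real stats helper so call counts can be asserted.
@pytest.mark.parametrize("cpu_stats", (None, True, False))
@mock.patch("pytorch_lightning.accelerators.cpu.get_cpu_stats", side_effect=get_cpu_stats)
def test_device_stats_cpu(cpu_stats_mock, tmpdir, cpu_stats):
    ...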
Example #6
def test_device_stats_monitor_no_logger(tmpdir):
    """Test DeviceStatsMonitor with no logger in Trainer."""

    model = BoringModel()
    device_stats = DeviceStatsMonitor()

    trainer = Trainer(
        default_root_dir=tmpdir,
        callbacks=[device_stats],
        max_epochs=1,
        logger=False,
        enable_checkpointing=False,
        enable_progress_bar=False,
    )

    with pytest.raises(MisconfigurationException, match="Trainer that has no logger."):
        trainer.fit(model)


def main(args: Namespace) -> None:
    """
    The executable logic for this controller.

    For the training loop:

    - Instantiates a data object using ``cp_latent_data.QuackLatentDataModule``.
    - Instantiates a model using ``cp_latent_classifier.QuackLatentClassifier``.
    - Instantiates a strategy plugin using ``ray_lightning.ray_ddp.RayPlugin``.
    - Instantiates callback objects:

        - A logger using ``pytorch_lightning.loggers.comet.CometLogger``
        - A learning rate monitor using ``pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor``
        - A checkpoint creator using ``pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint``
        - An early stopping monitor using ``pytorch_lightning.callbacks.early_stopping.EarlyStopping``

    Then, using these objects, instantiates a training control object using
    ``pytorch_lightning.trainer.trainer.Trainer``.

    For inference with a trained model, only the logger and the Ray strategy are used, along with an instance of
    ``densenet.CensoredDataWriter``, which, when composed with the Trainer, prepares the prediction loop to write
    its results to file on each iteration.

    Parameters
    ----------
    args: Namespace
         Command line arguments (a sample parser wiring is sketched after this function).  Possible arguments are:

         --data_dir
            *str* default='./data'  The top directory of the data storage tree.
         --batch_size
            *int* default=4 The batch size used for processing data.
         --num_workers
            *int* default=0 The number of worker processes used by the data loader.
         --evaluate
            *bool* Flag to output undetermined data from the inference loop. True when present, otherwise False
         --checkpoint_path
            *str* A checkpoint used for manual restart. Only the weights are used.
         --storage_path
            *str* default='./data/encoded' A path for storing the outputs from inference.
         --l_rate
            *float* default=1e-1 Hyperparameter passed to QuackLatentClassifier.
         --l_rate_min
            *float* default=1e-3 Hyperparameter passed to QuackLatentClassifier.
         --l_rate_max_epoch
            *int* default=-1 Hyperparameter passed to QuackLatentClassifier.
         --exp_label
            *str* default='autoencoder-train' Label passed to the logger.
         --ray_nodes
            *int* default=4 Number of parallel nodes passed to the Ray plugin.
         --freeze
            *bool* Flag to construct the model so that the image-analyzing layers of the pre-trained DenseNet are
            frozen for training.

    Returns
    -------
    void

    """
    data = QuackLatentDataModule(
        args.data_dir,
        batch_size=args.batch_size,
        workers=args.num_workers
    )
    model = QuackLatentClassifier(
            initial_size=256,
            learning_rate=args.l_rate,
            learning_rate_min=args.l_rate_min,
            lr_max_epochs=args.l_rate_max_epoch
    )
    if args.checkpoint_path is not None:
        model = QuackLatentClassifier.load_from_checkpoint(
            args.checkpoint_path,
            initial_size=256,
        )
    ray_plugin = RayPlugin(
        num_workers=args.ray_nodes,
        num_cpus_per_worker=1,
        use_gpu=False,
        find_unused_parameters=False
    )
    date_time = strftime("%d %b %Y %H:%M", gmtime())
    device_logger = DeviceStatsMonitor()
    checkpoint_storage = Path(args.storage_path)
    checkpoint_storage.mkdir(parents=True, exist_ok=True)
    # API configuration for comet: https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
    comet_logger = CometLogger(
        project_name="censored-planet",
        experiment_name=f'{args.exp_label}: {date_time}',
    )
    if args.predict:
        writer_callback = CensoredDataWriter(
            write_interval='batch',
            storage_path=args.storage_path
        )
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            callbacks=[writer_callback, device_logger],
            strategy=ray_plugin,
        )
        model.freeze()
        print('Ready for inference...')
        trainer.predict(model, datamodule=data, return_predictions=False)
        return
    else:
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        checkpoint_callback = ModelCheckpoint(
            monitor="val_loss",
            mode='min',
            save_top_k=3,
            save_last=True,
            auto_insert_metric_name=True,
            filename='latent_checkpoint-{step}-{val_loss:02.2f}',
            dirpath=checkpoint_storage,
        )
        early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            mode='min',
            patience=10,
            stopping_threshold=0.05,
            check_finite=True,  # Stops training if the monitored metric becomes NaN or infinite.
        )
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            strategy=ray_plugin,
            callbacks=[early_stopping_callback, checkpoint_callback, device_logger, lr_monitor],
            weights_save_path=checkpoint_storage
        )
        print('Ready for training...')
        trainer.fit(model, datamodule=data)
        print('Post fit testing...')
        trainer.test(model, datamodule=data)
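
The docstring above lists the flags this controller accepts; below is a minimal sketch of how the corresponding ``Namespace`` could be assembled and handed to ``main``. The parser wiring is an assumption (defaults follow the docstring where given), and ``--predict`` is included because the code reads ``args.predict`` even though it is not in the documented list.

# Hedged sketch of a driver for main(); not the author's code.
from argparse import ArgumentParser

from pytorch_lightning import Trainer


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data')
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--num_workers', type=int, default=0)
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--checkpoint_path', type=str, default=None)
    parser.add_argument('--storage_path', type=str, default='./data/encoded')
    parser.add_argument('--l_rate', type=float, default=1e-1)
    parser.add_argument('--l_rate_min', type=float, default=1e-3)
    parser.add_argument('--l_rate_max_epoch', type=int, default=-1)
    parser.add_argument('--exp_label', type=str, default='autoencoder-train')
    parser.add_argument('--ray_nodes', type=int, default=4)
    parser.add_argument('--freeze', action='store_true')
    parser.add_argument('--predict', action='store_true')  # read by main() above, though not documented
    parser = Trainer.add_argparse_args(parser)  # adds the Trainer flags consumed by Trainer.from_argparse_args
    main(parser.parse_args())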


def main(args: Namespace) -> None:
    """
    The executable logic for this controller.

    For the training loop:

    - Instantiates a data object using ``cp_tokenized_data.QuackTokenizedDataModule``.
    - Instantiates a model using ``autoencoder.QuackAutoEncoder``.
    - Instantiates a strategy plugin using ``ray_lightning.ray_ddp.RayPlugin``.
    - Instantiates callback objects:

        - A logger using ``pytorch_lightning.loggers.comet.CometLogger``
        - A learning rate monitor using ``pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor``
        - A checkpoint creator using ``pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint``
        - An early stopping monitor using ``pytorch_lightning.callbacks.early_stopping.EarlyStopping``

    Then, using these objects, instantiates a training control object using
    ``pytorch_lightning.trainer.trainer.Trainer``.

    For inference with a trained model, only the logger and the Ray strategy are used, along with an instance of
    ``autoencoder.AutoencoderWriter``, which, when composed with the Trainer, prepares the prediction loop to write
    its results to file on each iteration.

    Parameters
    ----------
    args: Namespace
         Command line arguments.  Possible arguments are:

         --data_dir
            *str* default='./data'  The top directory of the data storage tree.

         --batch_size
            *int* default=4 The batch size used for processing data.

         --num_workers
            *int* default=0 The number of worker processes used by the data loader.

         --embed_size
            *int* default=128 Hyperparameter passed to QuackAutoEncoder.

         --hidden_size
            *int* default=512 Hyperparameter passed to QuackAutoEncoder.

         --encode
            *bool* Flag to run the inference loop instead of train. True when present, otherwise False

         --filtered
            *bool* Flag to output labeled data from the inference loop. True when present, otherwise False

         --evaluate
            *bool* Flag to output undetermined data from the inference loop. True when present, otherwise False

         --checkpoint_path
            *str* A checkpoint used for manual restart. Only the weights are used.

         --storage_path
            *str* default='./data/encoded' A path for storing the outputs from inference.

         --l_rate
            *float* default=1e-1 Hyperparameter passed to QuackAutoEncoder.

         --l_rate_min
            *float* default=1e-3 Hyperparameter passed to QuackAutoEncoder.

         --l_rate_max_epoch
            *int* default=-1 Hyperparameter passed to QuackAutoEncoder.

         --exp_label
            *str* default='autoencoder-train' Label passed to the logger.

         --ray_nodes
            *int* default=4 Number of parallel nodes passed to the Ray plugin.

    Returns
    -------
    void

    """
    data = QuackTokenizedDataModule(args.data_dir,
                                    batch_size=args.batch_size,
                                    workers=args.num_workers)
    # Max value of the static portion comes from the IPv4 segments (0-255).
    max_index = 256 + QuackConstants.VOCAB.value
    model = QuackAutoEncoder(num_embeddings=max_index,
                             embed_size=args.embed_size,
                             hidden_size=args.hidden_size,
                             max_decode_length=data.get_width(),
                             learning_rate=args.l_rate,
                             learning_rate_min=args.l_rate_min,
                             lr_max_epochs=args.l_rate_max_epoch)
    if args.checkpoint_path is not None:
        model = QuackAutoEncoder.load_from_checkpoint(
            args.checkpoint_path,
            learning_rate=args.l_rate,
            learning_rate_min=args.l_rate_min,
            lr_max_epochs=args.l_rate_max_epoch)
    ray_plugin = RayPlugin(num_workers=args.ray_nodes,
                           num_cpus_per_worker=1,
                           use_gpu=False,
                           find_unused_parameters=False)
    date_time = strftime("%d %b %Y %H:%M", gmtime())
    device_logger = DeviceStatsMonitor()
    checkpoint_storage = Path(args.storage_path)
    checkpoint_storage.mkdir(parents=True, exist_ok=True)
    # API configuration for comet: https://www.comet.ml/docs/python-sdk/advanced/#python-configuration
    comet_logger = CometLogger(
        project_name="censored-planet",
        experiment_name=f'{args.exp_label}: {date_time}',
    )
    if args.encode:
        source_meta = Path(args.data_dir + '/metadata.pyc')
        try:
            with source_meta.open(mode='rb') as retrieved_dict:
                source_metadata = pickle.load(retrieved_dict)
            reduction_factor = (source_metadata['censored']
                                / source_metadata['uncensored'])
        except (OSError, KeyError):
            reduction_factor = 1
        writer_callback = AutoencoderWriter(
            write_interval='batch',
            storage_path=args.storage_path,
            filtered=args.filtered,
            evaluate=args.evaluate,
            reduction_threshold=reduction_factor)
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            strategy=ray_plugin,
            callbacks=[writer_callback, device_logger])
        model.freeze()
        print('Ready for inference...')
        trainer.predict(model, datamodule=data, return_predictions=False)
        return
    else:
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        checkpoint_callback = ModelCheckpoint(
            monitor="val_loss",
            save_top_k=3,
            save_last=True,
            mode='min',
            every_n_train_steps=2000,
            auto_insert_metric_name=True,
            filename='autoenc_checkpoint_{epoch:02d}-{step}-{val_loss:02.2f}',
            dirpath=checkpoint_storage)
        early_stopping_callback = EarlyStopping(
            monitor="val_loss",
            patience=10,
            stopping_threshold=200,
            check_finite=True,  # Stops training if the monitored metric becomes NaN or infinite.
        )
        trainer = Trainer.from_argparse_args(
            args,
            logger=comet_logger,
            callbacks=[
                early_stopping_callback, checkpoint_callback, device_logger,
                lr_monitor
            ],
            plugins=[ray_plugin],
            weights_save_path=checkpoint_storage)
        print('Ready for training...')
        trainer.fit(model, datamodule=data)
        print('Post fit testing...')
        trainer.test(model, datamodule=data)
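
The encode branch above expects ``<data_dir>/metadata.pyc`` to be a pickled dict containing ``censored`` and ``uncensored`` counts, from which it derives ``reduction_factor`` (falling back to 1 when the file or keys are missing). A minimal sketch of producing such a file; the counts are placeholders, not real data.

import pickle
from pathlib import Path

# Placeholder counts only; reduction_factor would evaluate to 1000 / 50000 = 0.02.
metadata = {'censored': 1000, 'uncensored': 50000}
with Path('./data/metadata.pyc').open(mode='wb') as handle:
    pickle.dump(metadata, handle)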