Code example #1
        covar_module = covar_module_dict[key]
        # Put them all together
        full_train_i = torch.cat([
            torch.full_like(torch.tensor(data[d]["X"]),
                            dtype=torch.long,
                            fill_value=i) for i, d in enumerate(data)
        ])
        full_train_x = torch.cat(
            [torch.tensor(data[d]["X"]).to(torch.float) for d in data])
        full_train_y = torch.cat(
            [torch.tensor(data[d]["Y"]).to(torch.float) for d in data])

        batch_size = 32
        training_data = DataModule(full_train_x, full_train_y, full_train_i)

        logger = TensorBoardLogger("tb_logs", name="ocean_forcing")
        num_tasks = len(data)
        model = PLMultitaskGPModel(
            full_train_x,
            full_train_y,
            full_train_i,
            num_tasks,
            covar_module,
        )
        lr_monitor = LearningRateMonitor(logging_interval="step")
        early_stop_callback = EarlyStopping(
            monitor="loss",
            min_delta=0.00,
            patience=100,
            verbose=False,
            mode="min",
Code example #2
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from imipnet.lightning_module import IMIPLightning

parser = ArgumentParser()
parser = IMIPLightning.add_model_specific_args(parser)
args = parser.parse_args([
    "--loss", "outlier-balanced-bce-bce-uml", "--n_top_patches", "16",
    "--eval_set", "kitti-gray-0.5", "--preprocess", "center"
])

imip_module = IMIPLightning(args)
name = imip_module.get_new_run_name()

logger = TensorBoardLogger("./runs", name)

checkpoint_dir = os.path.join(".", "checkpoints", "simple-conv", name)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_dir,
    save_last=True,
    verbose=True,
    monitor="eval_true_inliers",
    mode='max',
    period=0  # don't wait for a new epoch to save a better model
)

overfit_val = args.overfit_n

trainer = Trainer(logger=logger,
Code example #3
File: finetune.py  Project: jialanxin/megnetorch
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss_weighed',
        save_top_k=3,
        mode='min',
    )

    train_set = torch.load("materials/JVASP/Train_raman_set_25_uneq_yolov1.pt")
    validate_set = torch.load(
        "materials/JVASP/Valid_raman_set_25_uneq_yolov1.pt")
    train_dataloader = DataLoader(
        dataset=train_set, batch_size=batch_size, num_workers=4, shuffle=True)
    validate_dataloader = DataLoader(
        dataset=validate_set, batch_size=batch_size, num_workers=4)

    experiment = Experiment(**model_hpparams)

    logger = TensorBoardLogger(prefix)
    if trainer_config == "tune":
        trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0, logger=logger,
                             callbacks=[checkpoint_callback], auto_lr_find=True)
        trainer.tune(experiment, train_dataloader, validate_dataloader)
    else:
        if checkpoint_path is not None:
            trainer = pl.Trainer(resume_from_checkpoint=checkpoint_path,
                                 gpus=1 if torch.cuda.is_available() else 0,
                                 logger=logger, callbacks=[checkpoint_callback],
                                 max_epochs=epochs)
        else:
            trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0, logger=logger,
                                 callbacks=[checkpoint_callback],  max_epochs=epochs)
        trainer.fit(experiment, train_dataloader, validate_dataloader)
Code example #4
    # set validation data loader
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,
                                              sampler=test_sampler, **kwargs)

    epochs = args.epochs
    with tempfile.TemporaryDirectory() as run_output_dir:
        ckpt_path = os.path.join(run_output_dir, "checkpoint")
        os.makedirs(ckpt_path, exist_ok=True)
        checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path)

        logs_path = os.path.join(run_output_dir, "logger")
        os.makedirs(logs_path, exist_ok=True)
        logger = TensorBoardLogger(logs_path)

        train_percent = 1.0
        val_percent = 1.0

        model = Net()
        setattr(model, 'train_dataloader', lambda: train_loader)
        setattr(model, 'val_dataloader', lambda: test_loader)

        trainer = Trainer(accelerator='horovod',
                        gpus=0,
                        callbacks=None,
                        max_epochs=epochs,
                        limit_train_batches=train_percent,
                        limit_val_batches=val_percent,
                        logger=logger,
Code example #5
def get_default_logger(save_dir, version=None):
    # set up logger object without actually saving logs
    logger = TensorBoardLogger(save_dir,
                               name='lightning_logs',
                               version=version)
    return logger
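
A hedged usage sketch (the path and Trainer arguments below are illustrative, not part of the original snippet): the returned logger is handed to a Trainer, and event files are written under <save_dir>/lightning_logs/version_<n>.

from pytorch_lightning import Trainer

logger = get_default_logger("/tmp/logs", version=0)
trainer = Trainer(logger=logger, max_epochs=1)
print(logger.log_dir)  # e.g. /tmp/logs/lightning_logs/version_0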
Code example #6
def test_tensorboard_no_name(tmpdir, name):
    """Verify that None or empty name works."""
    logger = TensorBoardLogger(save_dir=tmpdir, name=name)
    logger.log_hyperparams({"a": 1, "b": 2, 123: 3, 3.5: 4, 5j: 5})  # Force data to be written
    assert logger.root_dir == tmpdir
    assert os.listdir(tmpdir / "version_0")
Code example #7
def test_tensorboard_finalize(summary_writer, tmpdir):
    """Test that the SummaryWriter closes in finalize."""
    logger = TensorBoardLogger(save_dir=tmpdir)
    logger.finalize("any")
    summary_writer().flush.assert_called()
    summary_writer().close.assert_called()
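
The summary_writer argument is presumably a fixture that replaces the underlying SummaryWriter with a mock so that the flush/close calls can be asserted. A hedged sketch of such a fixture follows; the patch target is an assumption:

from unittest import mock

import pytest


@pytest.fixture
def summary_writer():
    # Assumed patch target: the SummaryWriter symbol used inside TensorBoardLogger.
    with mock.patch("pytorch_lightning.loggers.tensorboard.SummaryWriter") as mocked:
        yield mocked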
Code example #8
parser.add_argument(
    "--pod_template_spec",
    type=str,
    default=None,
    help="Pod template spec",
)

parser = pl.Trainer.add_argparse_args(parent_parser=parser)

args = vars(parser.parse_args())

# Enable the TensorBoard logger, ModelCheckpoint, and EarlyStopping callbacks

lr_logger = LearningRateMonitor()
tboard = TensorBoardLogger(args["tensorboard_root"])
early_stopping = EarlyStopping(monitor="val_loss",
                               mode="min",
                               patience=5,
                               verbose=True)
checkpoint_callback = ModelCheckpoint(
    dirpath=args["checkpoint_dir"],
    filename="cifar10_{epoch:02d}",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

if not args["max_epochs"]:
    args["max_epochs"] = 1
Code example #9
    def train(serialized_model):
        import horovod.torch as hvd

        # Horovod: initialize library.
        hvd.init()

        if verbose:
            import horovod as _horovod
            print(
                f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}"
            )

        _checkpoint_callback = None
        require_checkpoint = False

        with remote_store.get_local_output_dir() as run_output_dir:
            logs_path = os.path.join(run_output_dir, remote_store.logs_subdir)
            os.makedirs(logs_path, exist_ok=True)
            print(f"Made directory {logs_path} for horovod rank {hvd.rank()}")
            ckpt_dir = run_output_dir
            ckpt_filename = remote_store.checkpoint_filename

            if logger is None:
                # Use default logger if no logger is supplied
                train_logger = TensorBoardLogger(logs_path)
                print(f"Setup logger: Using TensorBoardLogger: {train_logger}")

            elif isinstance(logger,
                            CometLogger) and logger._experiment_key is None:
                # Resume logger experiment key if passed correctly from CPU.
                train_logger = CometLogger(
                    save_dir=logs_path,
                    api_key=logger.api_key,
                    experiment_key=logger_experiment_key,
                )

                print(
                    f"Setup logger: Resume comet logger: {vars(train_logger)}")
            else:
                # use logger passed in.
                train_logger = logger
                train_logger.save_dir = logs_path
                print(
                    f"Setup logger: Using logger passed from estimator: {train_logger}"
                )

            # Lightning requires adding checkpoint callbacks on all ranks;
            # otherwise we have seen training hang.
            for cb in callbacks:
                if isinstance(cb, ModelCheckpoint):
                    cb.dirpath = ckpt_dir
                    cb.filename = ckpt_filename
                    _checkpoint_callback = cb
                    require_checkpoint = True
                    break
            if not _checkpoint_callback:
                # By default, 'monitor' is None, which saves a checkpoint only for the last epoch.
                _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir,
                                                       filename=ckpt_filename,
                                                       verbose=True)
                callbacks.append(_checkpoint_callback)

            if remote_store.saving_runs and hvd.rank() == 0:
                # Horovod: sync checkpoint and logging files only on rank 0 to
                # prevent other ranks from corrupting them.
                class _SyncCallback(Callback):
                    def on_epoch_end(self, trainer: "pl.Trainer",
                                     pl_module: "pl.LightningModule") -> None:
                        remote_store.sync(run_output_dir)

                callbacks.append(_SyncCallback())

            model = deserialize(serialized_model)

            _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \
                int(math.floor(float(train_rows) / batch_size / hvd.size()))

            _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \
                int(math.floor(float(val_rows) / val_batch_size / hvd.size()))

            if verbose:
                print(
                    f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}\n"
                    f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {_train_steps_per_epoch}\n"
                    f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n"
                    f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n"
                )

            cuda_available = torch.cuda.is_available()
            # We need to check that all ranks have the same device type for training.
            # Horovod doesn't support heterogeneous allreduce for gradients.
            cuda_avail_list = hvd.allgather_object(cuda_available,
                                                   name='device type')
            if cuda_avail_list.count(cuda_available) != hvd.size():
                raise RuntimeError("All ranks don't have same device type!")

            if cuda_available:
                # Horovod: pin GPU to local rank or the assigned GPU from spark.
                torch.cuda.set_device(
                    _get_assigned_gpu_or_default(default=hvd.local_rank()))
                # Move model to GPU.
                model.cuda()

            _num_gpus = num_gpus
            if _num_gpus is None:
                _num_gpus = 1 if cuda_available else 0

            # Set the bar refresh rate to once per epoch; detailed loss and metrics are available in the logger,
            # so there is no need to print them to the screen here. Users can still override this in trainer_args.
            progress_bar_refresh_rate = _train_steps_per_epoch

            kwargs = {
                'accelerator': 'horovod',
                'gpus': _num_gpus,
                'callbacks': callbacks,
                'max_epochs': epochs,
                'logger': train_logger,
                'log_every_n_steps': log_every_n_steps,
                'num_sanity_val_steps': 0,
                'reload_dataloaders_every_epoch': False,
                'progress_bar_refresh_rate': progress_bar_refresh_rate,
                'terminate_on_nan': terminate_on_nan,
                'profiler': profiler
            }
            if trainer_args:
                kwargs.update(trainer_args)

            if verbose and hvd.rank() == 0:
                print("Creating trainer with: \n ", kwargs)

            trainer = Trainer(**kwargs)

            if profiler != 'simple' and trainer.profiler:
                print(
                    f"Set profiler's logs_path for {hvd.rank()} to {logs_path}"
                )
                trainer.profiler.dirpath = logs_path
                # filename where the profiler results will be saved instead of
                # printing to stdout. The .txt extension will be used automatically.
                trainer.profiler.filename = "profile"

            if verbose and hvd.rank() == 0:
                print(f"pytorch_lightning version={pl.__version__}")

            data_module_kwargs = {
                'train_dir': remote_store.train_data_path,
                'val_dir': remote_store.val_data_path,
                'num_train_epochs': epochs,
                'has_val': should_validate is not None,
                'train_batch_size': batch_size,
                'val_batch_size': val_batch_size,
                'shuffle_size': calculate_shuffle_buffer_size(),
                'num_reader_epochs': loader_num_epochs,
                'reader_pool_type': reader_pool_type,
                'reader_worker_count': train_reader_worker_count,
                'transform_spec': transformation,
                'inmemory_cache_all': inmemory_cache_all,
                'cur_shard': hvd.rank(),
                'shard_count': hvd.size(),
                'schema_fields': schema_fields,
                'storage_options': storage_options,
                'steps_per_epoch_train': _train_steps_per_epoch,
                'steps_per_epoch_val': _val_steps_per_epoch,
                'verbose': verbose,
                'debug_data_loader': debug_data_loader,
                'train_async_data_loader_queue_size': train_async_data_loader_queue_size,
                'val_async_data_loader_queue_size': val_async_data_loader_queue_size,
            }
            if debug_data_loader and hvd.rank() == 0:
                print(
                    f"Creating data module with args:\n {data_module_kwargs}")

            dataset = data_module(**data_module_kwargs)

            trainer.fit(model, dataset)

            if hvd.rank() == 0:
                if remote_store.saving_runs and trainer.profiler:
                    # One more file sync to push profiler result.
                    remote_store.sync(logs_path)

                # rank 0 overwrites model with best checkpoint and returns.
                if require_checkpoint:
                    if verbose:
                        print("load from checkpoint best model path:",
                              _checkpoint_callback.best_model_path)
                    best_model = model.load_from_checkpoint(
                        _checkpoint_callback.best_model_path)
                else:
                    best_model = model
                serialized_checkpoint = io.BytesIO()
                module = best_model if not is_legacy else best_model._model

                output = {
                    'model': module.state_dict(),
                    'logged_metrics': trainer.logged_metrics
                }

                torch.save(output, serialized_checkpoint)

                return serialized_checkpoint
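
To make the steps-per-epoch arithmetic above concrete, here is a small illustrative computation (the numbers are invented): with 10,000 training rows, a batch size of 32, and 4 Horovod workers, the derived value is 78 steps, i.e. the progress bar refreshes roughly once per epoch.

import math

# Illustrative values only; they mirror the formula used above for _train_steps_per_epoch.
train_rows, batch_size, world_size = 10_000, 32, 4
train_steps_per_epoch = int(math.floor(float(train_rows) / batch_size / world_size))
print(train_steps_per_epoch)  # 78 -> progress_bar_refresh_rate of about once per epoch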
Code example #10
File: main.py  Project: vietnamican/tinyimagenet
                                 transform=transformer['val'])
valloader = DataLoader(valdataset,
                       batch_size=2,
                       pin_memory=True,
                       num_workers=1)

device = 'cpu'

if __name__ == '__main__':

    mode = 'training'
    if mode == 'training':
        log_name = 'tiny_imagenet_logs/{}'.format(mode)
        logger = TensorBoardLogger(
            save_dir=os.getcwd(),
            name=log_name,
            # log_graph=True,
            # version=0
        )
        loss_callback = ModelCheckpoint(
            monitor='val_loss',
            dirpath='',
            filename='checkpoint-{epoch:02d}-{val_loss:.4f}',
            save_top_k=-1,
            mode='min',
        )
        lr_monitor = LearningRateMonitor(logging_interval='epoch')
        callbacks = [loss_callback, lr_monitor]
        model = Model(backbone=backbone.MobileNetV2(num_classes=10),
                      num_classes=10)
        # state_dict = model_zoo.load_url(
        #     model_urls['mobilenet_v2'], progress=True)
Code example #11
num_classes = 3
img_size = 64

dm = ImDataModule(
    batch_size=batch_size,
    num_classes=num_classes,
    img_size=img_size,
    data_dir="/media/hdd/Datasets/DenseHaze/",
)
class_ids = dm.setup()

# # Logs

model = LitModel(num_classes)

logger = TensorBoardLogger(save_dir="logs")

trainer = pl.Trainer(
    auto_select_gpus=True,
    gpus=1,
    precision=16,
    profiler=False,
    max_epochs=100,
    callbacks=[pl.callbacks.ProgressBar()],
    enable_pl_optimizer=True,
    logger=logger,
    accumulate_grad_batches=16,
    accelerator="ddp",
    plugins="ddp_sharded",
)
Code example #12
File: lightning_train_net.py  Project: ShahaRaz/d2go
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        eval_only: True if run evaluation only.
    """
    assert (num_processes == 1 or num_gpus
            == 0), "Only set num_processes > 1 when training on CPUs"

    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)

    trainer_params = {
        # training loop is bounded by max steps, use a large max_epochs to make
        # sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": (cfg.TEST.EVAL_PERIOD
                               if cfg.TEST.EVAL_PERIOD > 0 else cfg.SOLVER.MAX_ITER),
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
Code example #13
if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    parser = pl.Trainer.add_argparse_args(ArgumentParser())
    parser.add_argument('--batch_size', default=512, type=int)
    parser.add_argument('--buffer_maxlen', default=1000000, type=int)
    #parser.add_argument('--max_episode', default=2000, type=int)
    parser.add_argument('--max_episode_len', default=200, type=int)
    parser.add_argument('--warm_start_steps', default=10000, type=int)
    parser.add_argument('--actor_lr', default=1e-4, type=float)
    parser.add_argument('--critic_lr', default=1e-3, type=float)
    parser.add_argument('--gamma', default=0.99, type=float)
    parser.add_argument('--sync_rate', default=8, type=int)
    parser.add_argument('--loading_weights', default=True)

    cfg = parser.parse_args()
    logger = TensorBoardLogger(save_dir=os.getcwd(), name='MADDPG_logs')
    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(os.getcwd(), 'saved_checkpoints/'),
        save_top_k=3,
        verbose=False,
        monitor="loss",
        save_weights_only=True,
    )
    trainer = pl.Trainer.from_argparse_args(
        cfg,
        gpus=1,
        #fast_dev_run=True,
        max_epochs=30001,
        profiler=True,
        logger=logger)
    #checkpoint_callback=checkpoint_callback)
Code example #14
def get_logger(logdir: Path) -> TensorBoardLogger:
    return TensorBoardLogger(str(logdir), name="unet")
Code example #15
File: main.py  Project: rjean/selfsupmotion
def main():
    """Main entry point of the program.

    Note:
        This main.py file is meant to be called using the cli,
        see the `examples/local/run.sh` file to see how to use it.

    """
    parser = argparse.ArgumentParser()
    # __TODO__ check you need all the following CLI parameters
    parser.add_argument('--config',
                        help='config file with generic hyper-parameters,  such as optimizer, '
                             'batch_size, ... -  in yaml format')
    parser.add_argument('--data', help='path to data', required=True)
    parser.add_argument('--data-module', default="hdf5", help="Data module to use. file or hdf5")
    parser.add_argument('--tmp-folder',
                        help='will use this folder as working folder - it will copy the input data '
                             'here, generate results here, and then copy them back to the output '
                             'folder')
    parser.add_argument('--output', help='path to outputs - will store files here', required=True)
    parser.add_argument('--disable-progressbar', action='store_true',
                        help='will disable the progressbar while going over the mini-batch')
    parser.add_argument('--start-from-scratch', action='store_true',
                        help='will not load any existing saved model - even if present')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--embeddings-device",type=str,default="cuda",help="Which device to use for embeddings generation.")
    parser.add_argument('--embeddings', action='store_true',help="Skip training and generate embeddings for evaluation.")
    parser.add_argument('--embeddings-test', action='store_true',help="Skip training and generate test embeddings for evaluation.")
    parser.add_argument('--embeddings-ckpt', type=str, default=None, help="Checkpoint to load when generating embeddings.")
    #parser.add_argument("--embeddings")
    parser.add_argument("--dryrun", action="store_true", help="Dry-run by training on the validtion set. Use only to test loop code.")

    mlflow_save_dir = "./mlruns"  # make into arg?
    tbx_save_dir = "./tensorboard"  # make into arg?

    parser = pl.Trainer.add_argparse_args(parser)

    args = parser.parse_args()

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if args.tmp_folder is not None:
        data_folder_name = os.path.basename(os.path.normpath(args.data))
        rsync_folder(args.data, args.tmp_folder)
        data_dir = os.path.join(args.tmp_folder, data_folder_name)
        output_dir = os.path.join(args.tmp_folder, 'output')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
    else:
        data_dir = args.data
        output_dir = args.output

    # to intercept any print statement:
    sys.stdout = LoggerWriter(logger.info)
    sys.stderr = LoggerWriter(logger.warning)

    assert args.config is not None
    with open(args.config, 'r') as stream:
        hyper_params = load(stream, Loader=yaml.FullLoader)
    exp_name = hyper_params["exp_name"]
    output_dir = os.path.join(output_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)
    shutil.copyfile(args.config, os.path.join(output_dir, "config.backup"))
    assert "output_dir" not in hyper_params
    hyper_params["output_dir"] = output_dir
    os.makedirs(mlflow_save_dir, exist_ok=True)
    mlf_logger = MLFlowLogger(
        experiment_name=exp_name,
        save_dir=mlflow_save_dir,
    )
    if os.path.exists(os.path.join(output_dir, STAT_FILE_NAME)):
        mlf_logger._run_id = load_mlflow(output_dir)
        logger.warning(f"WILL CONTINUE LOGGING IN MLFLOW RUN ID: {mlf_logger._run_id}")
    os.makedirs(tbx_save_dir, exist_ok=True)
    tbx_logger = TensorBoardLogger(
        save_dir=tbx_save_dir,
        name=exp_name,
        default_hp_metric=False,
    )

    log_path = os.path.join(output_dir, "console.log")
    handler = logging.handlers.WatchedFileHandler(log_path)
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)

    mlflow.set_experiment(exp_name)
    mlflow.start_run(run_id=mlf_logger.run_id)
    run(args, data_dir, output_dir, hyper_params, mlf_logger, tbx_logger)
    mlflow.end_run()
    if args.tmp_folder is not None:
        rsync_folder(output_dir + os.path.sep, args.output)
Code example #16
File: model.py  Project: wellecks/naturalproofs
def cli_main():
    torch.multiprocessing.set_sharing_strategy('file_system')

    parser = argparse.ArgumentParser()
    parser.add_argument('--expr-name', default='autoregressive')
    parser.add_argument('--datapath',
                        default='/data/tokenized__bert-base-cased.pkl')
    parser.add_argument('--encs-file', default='/path/to/encs.pt')
    parser.add_argument('--pretrained-retrieval-checkpoint',
                        default='/path/to/best.ckpt')
    parser.add_argument('--default-root-dir', default='/output')
    parser.add_argument('--checkpoint-path', default=None)
    parser.add_argument('--token-limit', type=int, default=16384)
    parser.add_argument('--buffer-size', type=int, default=10000)
    parser.add_argument('--log-every', type=int, default=100)
    parser.add_argument('--max-epochs', type=int, default=50)
    parser.add_argument('--dataloader-workers', type=int, default=0)
    parser.add_argument('--model-type', default='bert-base-cased')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--mode', type=str, default='train', choices=['train'])
    parser.add_argument('--check-val-every-n-epoch', type=int, default=5)
    parser.add_argument('--decode-max-length', type=int, default=20)

    parser.add_argument('--set-mode', type=int, default=1, choices=[0, 1])
    parser.add_argument('--order',
                        default='ground-truth',
                        choices=['ground-truth'])

    parser.add_argument('--freeze-enc', type=int, default=0, choices=[0, 1])
    parser.add_argument('--freeze-dec-emb',
                        type=int,
                        default=1,
                        choices=[0, 1])
    parser.add_argument('--freeze-dec', type=int, default=0, choices=[0, 1])

    parser.add_argument('--init-enc', type=int, default=1, choices=[0, 1])
    parser.add_argument('--init-dec-emb', type=int, default=1, choices=[0, 1])
    parser.add_argument('--init-dec', type=int, default=1, choices=[0, 1])

    parser.add_argument('--parallel', type=int, default=0, choices=[0, 1])

    # --- Trainer/lightning
    parser.add_argument('--accumulate-grad-batches', type=int, default=1)
    parser.add_argument('--gradient-clip-val', type=float, default=1.0)
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--accelerator', default='ddp')
    parser.add_argument('--precision', type=int, default=16)

    parser = SequenceRetriever.add_model_specific_args(parser)
    args = parser.parse_args()

    print(args)
    pl.seed_everything(args.seed)

    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model_type)

    print("Loading data (%s)" % args.datapath)
    ds_raw = pickle.load(open(args.datapath, 'rb'))
    rid2tok = ds_raw['rid2tok']
    dls = utils.get_dataloaders(ds_raw['tokenized'],
                                xpad=tokenizer.pad_token_id,
                                ypad=rid2tok['<pad>'],
                                token_limit=args.token_limit,
                                buffer_size=args.buffer_size,
                                workers=args.dataloader_workers,
                                set_mode=bool(args.set_mode),
                                order=args.order)

    if args.parallel:
        from naturalproofs.encoder_decoder.model_joint import ParallelSequenceRetriever
        model = ParallelSequenceRetriever(
            vocab_size=len(rid2tok),
            bos=rid2tok['<bos>'],
            eos=rid2tok['<eos>'],
            xpad=tokenizer.pad_token_id,
            ypad=rid2tok['<pad>'],
            lr=args.lr,
            log_every=args.log_every,
            model_type=args.model_type,
        )
    else:
        model = SequenceRetriever(vocab_size=len(rid2tok),
                                  bos=rid2tok['<bos>'],
                                  eos=rid2tok['<eos>'],
                                  xpad=tokenizer.pad_token_id,
                                  ypad=rid2tok['<pad>'],
                                  lr=args.lr,
                                  log_every=args.log_every,
                                  model_type=args.model_type,
                                  decode_max_length=args.decode_max_length)

    if args.checkpoint_path is not None:
        print("Resuming from checkpoint (%s)" % args.checkpoint_path)
        model.load_from_checkpoint(args.checkpoint_path)

    else:
        print("Initializing model using pretrained retrieval models")
        model.initialize(
            encs_file=args.encs_file,
            ckpt_file=args.pretrained_retrieval_checkpoint,
            dataset_rid2tok=rid2tok,
            init_enc=args.init_enc,
            init_dec_emb=args.init_dec_emb,
            init_dec=args.init_dec,
        )

    model.freeze_parts(
        freeze_enc=args.freeze_enc,
        freeze_dec_emb=args.freeze_dec_emb,
        freeze_dec=args.freeze_dec,
    )

    checkpoint_callback = ModelCheckpoint(
        save_last=True,
        save_top_k=1,
        monitor='val/mAP',
        mode='max',
        dirpath='%s/%s' % (args.default_root_dir, args.expr_name),
        filename='best-{val/mAP:.4f}')

    logger = TensorBoardLogger(save_dir='%s/tb_logs' % (args.default_root_dir),
                               name=args.expr_name)

    trainer = pl.Trainer(check_val_every_n_epoch=args.check_val_every_n_epoch,
                         callbacks=[checkpoint_callback],
                         default_root_dir=args.default_root_dir,
                         reload_dataloaders_every_epoch=True,
                         move_metrics_to_cpu=True,
                         gradient_clip_val=args.gradient_clip_val,
                         gpus=args.gpus,
                         accelerator=args.accelerator,
                         precision=args.precision,
                         resume_from_checkpoint=args.checkpoint_path,
                         max_epochs=args.max_epochs,
                         accumulate_grad_batches=args.accumulate_grad_batches,
                         logger=logger)

    if args.mode == 'train':
        trainer.fit(model, dls['train'], dls['valid'])
Code example #17
    datamodule = ChangeDetectionDataModule(args.data_dir)

    if args.backbone_type == 'random':
        backbone = resnet.resnet18(pretrained=False)
    elif args.backbone_type == 'imagenet':
        backbone = resnet.resnet18(pretrained=True)
    elif args.backbone_type == 'pretrain':
        model = MocoV2.load_from_checkpoint(args.ckpt_path)
        backbone = deepcopy(model.encoder_q)
    else:
        raise ValueError()

    model = SiamSegment(backbone,
                        feature_indices=(0, 4, 5, 6, 7),
                        feature_channels=(64, 64, 128, 256, 512))
    model.example_input_array = (torch.zeros(
        (1, 3, 96, 96)), torch.zeros((1, 3, 96, 96)))

    experiment_name = args.backbone_type
    logger = TensorBoardLogger(save_dir=str(Path.cwd() / 'logs' / 'oscd'),
                               name=experiment_name)
    checkpoint_callback = ModelCheckpoint(filename='{epoch}',
                                          save_weights_only=True)
    trainer = Trainer(gpus=args.gpus,
                      logger=logger,
                      callbacks=[checkpoint_callback],
                      max_epochs=100,
                      weights_summary='full')
    trainer.fit(model, datamodule=datamodule)
Code example #18
            self.logger.experiment.add_scalar(f"Loss/Val{i}", loss,
                                              self.current_epoch)

            self.logger.experiment.add_scalar(f"WER/Val{i}", wer_metric,
                                              self.current_epoch)

            self._log_wers(outputs, f"Val{i}")

        return {'loss': final_loss}


data_module = CVDataModule(batch_size=48)

model = DSModule({})

logger = TensorBoardLogger('logs', name='DeepSpeech2')

trainer = pl.Trainer(
    #fast_dev_run=True,
    logger=logger,
    gpus=(1 if torch.cuda.is_available() else 0))

trainer.fit(model, data_module)
"""
dataset_dev = CommonVoiceDataset(vocab=VocabEsp())

loader = DataLoader(dataset_dev, batch_size=2, collate_fn=PadCollate())

model = DeepSpeech2()

features, sentences, fl, sl = next(iter(loader))
Code example #19
def test_tensorboard_log_metrics(tmpdir, step_idx):
    logger = TensorBoardLogger(tmpdir)
    metrics = {"float": 0.3, "int": 1, "FloatTensor": torch.tensor(0.1), "IntTensor": torch.tensor(1)}
    logger.log_metrics(metrics, step_idx)
Code example #20
def create_lightning_trainer(config: ModelConfigBase,
                             resume_from_checkpoint: Optional[Path] = None) -> Tuple[Trainer, StoringLogger]:
    """
    Creates a PyTorch Lightning Trainer object for the given model configuration. It also creates checkpoint
    handlers and loggers, including a diagnostic logger for use in unit tests, which is returned as the second
    return value.
    :param config: The model configuration.
    :param resume_from_checkpoint: If provided, training resumes from this checkpoint.
    :return: A tuple [Trainer object, diagnostic logger]
    """
    # For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
    # models, this still appears to be the best way of choosing them because validation loss on the relatively small
    # training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
    # not for the HeadAndNeck model.
    best_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
                                               # filename=BEST_CHECKPOINT_FILE_NAME,
                                               # monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
                                               # save_top_k=1,
                                               save_last=True)
    # Recovery checkpoints: {epoch} will turn into a string like "epoch=1"
    # Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs. Due to a bug in Lightning, this
    # will still write alternate files recovery.ckpt and recovery-v0.ckpt, which are cleaned up later in
    # cleanup_checkpoint_folder
    recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
                                                   filename=RECOVERY_CHECKPOINT_FILE_NAME,
                                                   period=config.recovery_checkpoint_save_interval
                                                   )

    num_gpus = torch.cuda.device_count() if config.use_gpu else 0
    logging.info(f"Number of available GPUs: {num_gpus}")
    if config.max_num_gpus >= 0 and config.max_num_gpus < num_gpus:
        num_gpus = config.max_num_gpus
        logging.info(f"Restricting the number of GPUs to {num_gpus}")
    # Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
    # For unit tests, only "ddp_spawn" works
    accelerator = "ddp" if num_gpus > 1 else None
    logging.info(f"Using {num_gpus} GPUs with accelerator '{accelerator}'")
    storing_logger = StoringLogger()
    tensorboard_logger = TensorBoardLogger(save_dir=str(config.logs_folder), name="Lightning", version="")
    loggers = [storing_logger, tensorboard_logger, AzureMLLogger()]
    # This leads to problems with run termination.
    # if not is_offline_run_context(RUN_CONTEXT):
    #     mlflow_logger = MLFlowLogger(experiment_name=RUN_CONTEXT.experiment.name,
    #                                  tracking_uri=RUN_CONTEXT.experiment.workspace.get_mlflow_tracking_uri())
    #     # The MLFlow logger needs to get its ID from the AzureML run context, otherwise there will be two sets of
    #     # results for each run, one from native AzureML and one from the MLFlow logger.
    #     mlflow_logger._run_id = RUN_CONTEXT.id
    #     loggers.append(mlflow_logger)
    # Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
    precision = 32 if num_gpus == 0 else 16 if config.use_mixed_precision else 32
    # The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
    # https://pytorch.org/docs/stable/notes/randomness.html
    # For the classification models, we observed only a small performance deterioration (increase in 10sec on total
    # training time of 22min) when switching to deterministic.
    if config.pl_deterministic:
        deterministic = True
        benchmark = False
    else:
        deterministic = False
        benchmark = True
    trainer = Trainer(default_root_dir=str(config.outputs_folder),
                      deterministic=deterministic,
                      benchmark=benchmark,
                      accelerator=accelerator,
                      max_epochs=config.num_epochs,
                      num_sanity_val_steps=config.pl_num_sanity_val_steps,
                      callbacks=[best_checkpoint_callback, recovery_checkpoint_callback],
                      logger=loggers,
                      progress_bar_refresh_rate=0,  # Disable the progress bar,
                      gpus=num_gpus,
                      precision=precision,
                      sync_batchnorm=True,
                      terminate_on_nan=config.detect_anomaly,
                      resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None
                      )
    return trainer, storing_logger
Code example #21
File: train.py  Project: mcbuehler/VariTex
    pipeline = PipelineModule(opt)
    gpus = torch.cuda.device_count()
    print("Using {} GPU".format(gpus))
    print("Writing results to {}".format(opt.path_out))
    mkdir(opt.path_out)

    if opt.logger == "wandb":
        wandb.login()
        logger = pl.loggers.WandbLogger(save_dir=opt.path_out,
                                        name=opt.experiment_name,
                                        project=opt.project)
        logger.log_hyperparams(opt)
        logger.watch(pipeline)
    elif opt.logger == "tensorboard":
        logger = TensorBoardLogger(save_dir=opt.path_out,
                                   name=opt.experiment_name)

    trainer = pl.Trainer(
        logger,
        gpus=gpus,
        max_epochs=opt.max_epochs,
        default_root_dir=opt.path_out,
        terminate_on_nan=False,  # Terminate on nan is expensive
        limit_val_batches=0.25,
        callbacks=[ImageLogCallback(opt),
                   ModelCheckpoint()],
        fast_dev_run=opt.debug,
        resume_from_checkpoint=opt.checkpoint,
        weights_summary='top')

    if not opt.debug:
Code example #22
def test_validation_step_log_with_tensorboard(mock_log_metrics, tmpdir):
    """
    This test makes sure we properly pass metrics to the loggers via log_metrics.
    """
    class ExtendedModel(BoringModel):

        val_losses = []

        def training_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('train_loss', loss)
            return {"loss": loss}

        def validation_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.val_losses.append(loss)
            self.log('valid_loss_0', loss, on_step=True, on_epoch=True)
            self.log('valid_loss_1', loss, on_step=False, on_epoch=True)
            self.log('valid_loss_2', loss, on_step=True, on_epoch=False)
            self.log('valid_loss_3', loss, on_step=False, on_epoch=False)
            return {"val_loss": loss}

        def test_step(self, batch, batch_idx):
            output = self.layer(batch)
            loss = self.loss(batch, output)
            self.log('test_loss', loss)
            return {"y": loss}

    model = ExtendedModel()
    model.validation_epoch_end = None

    # Initialize a trainer
    trainer = Trainer(
        default_root_dir=tmpdir,
        logger=TensorBoardLogger(tmpdir),
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        max_epochs=2,
        progress_bar_refresh_rate=1,
    )

    # Train the model ⚡
    trainer.fit(model)

    # hp_metric + 2 steps + epoch + 2 steps + epoch
    expected_num_calls = 1 + 2 + 1 + 2 + 1

    assert len(mock_log_metrics.mock_calls) == expected_num_calls

    assert mock_log_metrics.mock_calls[0] == call({'hp_metric': -1}, 0)

    def get_metrics_at_idx(idx):
        mock_calls = list(mock_log_metrics.mock_calls)
        if isinstance(mock_calls[idx].kwargs, dict):
            return mock_calls[idx].kwargs["metrics"]
        else:
            return mock_calls[idx][2]["metrics"]

    expected = [
        'valid_loss_0_step/epoch_0', 'valid_loss_2/epoch_0', 'global_step'
    ]
    assert sorted(get_metrics_at_idx(1)) == sorted(expected)
    assert sorted(get_metrics_at_idx(2)) == sorted(expected)

    expected = model.val_losses[2]
    assert get_metrics_at_idx(1)["valid_loss_0_step/epoch_0"] == expected
    expected = model.val_losses[3]
    assert get_metrics_at_idx(2)["valid_loss_0_step/epoch_0"] == expected

    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
    assert sorted(get_metrics_at_idx(3)) == sorted(expected)

    expected = torch.stack(model.val_losses[2:4]).mean()
    assert get_metrics_at_idx(3)["valid_loss_1"] == expected
    expected = [
        'valid_loss_0_step/epoch_1', 'valid_loss_2/epoch_1', 'global_step'
    ]

    assert sorted(get_metrics_at_idx(4)) == sorted(expected)
    assert sorted(get_metrics_at_idx(5)) == sorted(expected)

    expected = model.val_losses[4]
    assert get_metrics_at_idx(4)["valid_loss_0_step/epoch_1"] == expected
    expected = model.val_losses[5]
    assert get_metrics_at_idx(5)["valid_loss_0_step/epoch_1"] == expected

    expected = ['valid_loss_0_epoch', 'valid_loss_1', 'epoch', 'global_step']
    assert sorted(get_metrics_at_idx(6)) == sorted(expected)

    expected = torch.stack(model.val_losses[4:]).mean()
    assert get_metrics_at_idx(6)["valid_loss_1"] == expected

    results = trainer.test(model)
    expected_callback_metrics = {
        'train_loss', 'valid_loss_0_epoch', 'valid_loss_0', 'debug_epoch',
        'valid_loss_1', 'test_loss', 'val_loss'
    }
    assert set(trainer.callback_metrics) == expected_callback_metrics
    assert set(results[0]) == {'test_loss', 'debug_epoch'}
Code example #23
def main(args):
    utils.set_seed_everywhere(args.seed)

    cfg = hyperparameters.get_config(args)
    cfg.seed = args.seed
    cfg.base_dir = cfg.base_dir + "_s_" + str(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()

    time_str = datetime.now(
        timezone('US/Eastern')).strftime("%Y-%m-%d-%H-%M-%S")
    exp_dir = os.path.join(cfg.base_dir, time_str)
    checkpoint_dir = os.path.join(exp_dir, cfg.checkpoint_dir)
    log_dir = os.path.join(exp_dir, cfg.log_dir)

    cfg.log_training = args.log_training
    cfg.log_training_path = os.path.join(exp_dir, args.log_training_path)
    cfg.num_steps = args.num_steps
    cfg.device = str(torch.device("cuda" if args.cuda else "cpu"))

    save_config(cfg, exp_dir, "config.json")

    print("Log path: ", log_dir, "Checkpoint Dir: ", checkpoint_dir)

    num_timsteps = cfg.observed_steps + cfg.predicted_steps
    data_shape = {'image': (None, num_timsteps, 3, 64, 64)}
    cfg.data_shapes = data_shape

    model = KeypointModel(cfg)

    cp_callback = ModelCheckpoint(filepath=os.path.join(checkpoint_dir, "model_"),
                                  period=25,
                                  save_top_k=-1)

    logger = TensorBoardLogger(log_dir, name="", version=None)

    gpus = 1 if args.cuda else None

    if args.pretrained_path:
        checkpoint_path = get_latest_checkpoint(args.pretrained_path)
        import json
        model = KeypointModel.load_from_checkpoint(checkpoint_path)
        print(json.dumps(model.cfg, indent=4))

    print("On GPU Device: ", gpus)
    trainer = Trainer(
        max_epochs=10000,
        max_steps=args.num_steps,
        logger=logger,
        checkpoint_callback=cp_callback,
        gpus=gpus,
        #distributed_backend='dp',
        progress_bar_refresh_rate=1,
        #gradient_clip_val=cfg.clipnorm,
        fast_dev_run=False,
        #train_percent_check=0.1,val_percent_check=0.0,
        val_percent_check=0.3,
        track_grad_norm=2,
        num_sanity_val_steps=0,
        show_progress_bar=True)
    trainer.fit(model)
    save_path = os.path.join(checkpoint_dir,
                             "model_final_" + str(args.num_steps) + ".ckpt")
    print("Saving model finally:")
    trainer.save_checkpoint(save_path)
Code example #24
File: main.py  Project: simeneide/vae-multi
#%% Load model
import models
vae = models.VAE(input_height=dataset.dims[2], num_labels=10, lr=0.001)

#%%
import utils
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
cb_imageplot = utils.PlotImage(batch)
cb_checkpoint = ModelCheckpoint(monitor='val/loss',
                                verbose=True,
                                save_last=True)
trainer = pl.Trainer(
    gpus=1,
    #auto_lr_find=True,
    logger=TensorBoardLogger('lightning_logs', name='coco-withtext'),
    callbacks=[cb_imageplot],
    checkpoint_callback=cb_checkpoint)

#trainer.tune(vae, train_dataloader=dataset)
#%%
trainer.fit(vae, dataset)

# %% Analysis
#vae = models.VAE.load_from_checkpoint("lightning_logs/version_14/checkpoints/epoch=348.ckpt")

# %%
#%% Test batch

out = vae(x, y)
out['image'].min()
Code example #25
    cfg = parse_config(args.cfg_file)
    data_cfg = get_data_info(cfg['data'])
    cfg['data'] = data_cfg
    args.cfg = cfg
    ckpt_fd = "{}".format(
        args.output_directory) + "/{epoch:02d}_{train_loss:.3f}_{val_acc:.3f}"
    ckpt_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
        filepath=ckpt_fd, verbose=True, save_top_k=-1)
    es_cb = pl.callbacks.EarlyStopping("val_acc",
                                       mode="max",
                                       verbose=True,
                                       patience=10)
    # can't get it to work with val_mAP for now, loss should do too
    prog = pl.loggers.CSVLogger(args.log_directory)
    tb_prog = TensorBoardLogger(args.log_directory)
    loggers = [prog, tb_prog]
    if args.use_mixers:
        raise DeprecationWarning(
            "running with --use_mixers is deprecated. Not doing anything")
        exit(1)
        mixer = mixers.BackgroundAddMixer()
        mixer = mixers.UseMixerWithProb(mixer, args.mixer_prob)
        args.tr_mixer = mixer  # mixers.UseMixerWithProb(mixer, args.mixer_prob)
    else:
        args.tr_mixer = None
    tr_tfs = get_transforms_classifier(True, args.random_clip_size)
    val_tfs = get_transforms_classifier(False,
                                        args.val_clip_size,
                                        center_crop_val=True)
Code example #26
File: trainer.py  Project: vishalbelsare/deepscm
        args.gpus = 1

    # TODO: push to lightning
    args.gradient_clip_val = float(args.gradient_clip_val)

    groups = {}
    for group in parser._action_groups:
        group_dict = {
            a.dest: getattr(args, a.dest, None)
            for a in group._group_actions
        }
        groups[group.title] = argparse.Namespace(**group_dict)

    lightning_args = groups['lightning_options']

    logger = TensorBoardLogger(lightning_args.default_root_dir,
                               name=f'{exp_args.experiment}/{exp_args.model}')
    lightning_args.logger = logger

    hparams = groups['experiment']
    model_params = groups['model']

    for k, v in vars(model_params).items():
        setattr(hparams, k, v)

    trainer = Trainer.from_argparse_args(lightning_args)

    model = model_class(**vars(model_params))
    experiment = exp_class(hparams, model)

    if not args.validate:
        warnings.filterwarnings(
Code example #27
    # loss_functions = {'mse': F.mse_loss, 'bce': F.binary_cross_entropy, 'mae': F.l1_loss}
    # loss_functions = {'mse': F.mse_loss, 'bce': F.binary_cross_entropy}
    loss_functions = {'bce': F.binary_cross_entropy}

    test_jet_indices = random.sample(range(0, len(jetsPL_test)), 50)

    train_loader = jet_dataloader_train
    for hs, channels, loss_k in itertools.product(hs_s, channels_s, loss_functions):
        autoencoder = lit_conv_ae_paper.LitAE(hidden_size=hs, channels=channels, loss=loss_functions[loss_k])
        print('hidden_size', hs)
        print('channels', channels)
        print('loss', loss_k)
        #print('loss val', loss_functions[loss_k])
        model_name = f'bin_{utils.N_IMAGE_BINS}_hs_{hs}_chan_{channels}_loss_{loss_k}_ss_{utils.SAMPLE_SIZE}_ne_{utils.NUM_EPOCHS}'
        logdir = f'tb_logs_{loss_k}'
        logger = TensorBoardLogger(logdir, name=model_name)
        trainer = pl.Trainer(max_epochs = utils.NUM_EPOCHS, logger=logger)
        trainer.fit(autoencoder, train_loader)

        output_jets_train = inference_jets(jetsDL_train, autoencoder)
        output_jets_test = inference_jets(jetsDL_test, autoencoder)

        #jet observables
        #pl-dl-dlue

        #scale back, move to numpy arrays
        # jetsPL_test = [unscale_hist(j.detach().numpy()) for j in jetsPL_test]
        # jetsDL_test = [unscale_hist(j.detach().numpy()) for j in jetsDL_test]
        # output_jets = [unscale_hist(j) for j in output_jets]

        # for j in jetsPL_test:
Code example #28
def main(cfg):
    # for subject in [25]:
    # cfg = load_cfg()
    run_pd = cfg['run_pd']
    subjects_list = cfg['pd_subjects'] if run_pd else cfg['healthy_subjects']
    if run_pd:
        med_str = '-on-med' if cfg['model_on_med'] else '-off-med'
    else:
        med_str = ''

    for subject in subjects_list:
        print('------------------------------------\nSubject', subject,
              '\n------------------------------------')
        # subject = 35, healthy
        # subject = 55, PD
        input_data, targets, long_labels = subject_nn_data(
            subject,
            healthy_subjects=cfg['healthy_subjects'],
            pd_subjects=cfg['pd_subjects'],
            feature_name=cfg['pred_feature'],
            data_path=cfg['data_path'],
            pd_dir=cfg['pd_dir'],
            healthy_dir=cfg['healthy_dir'],
            on_med=cfg['model_on_med'],
            use_silent_channels=cfg['use_silent_channels'],
            mask_value=cfg['mask_value'])

        freqs = ['alpha', 'beta', 'gamma']
        freqs_idx = [0, 1, 2]

        split_idx_path = osp.join(cfg['outputs_path'], cfg['splits_path'],
                                  f'{subject}{med_str}-mlp.npy')

        indices_shuf = []
        if cfg['subject_crossval']:
            indices = np.arange(input_data.shape[1])
            for i in [1, 2, 3, 4, 5]:
                np.random.shuffle(indices)
                indices_shuf.append(indices.copy())
        else:
            if osp.exists(split_idx_path):
                indices = np.load(split_idx_path)
            else:
                indices = np.arange(input_data.shape[1])
                np.random.shuffle(indices)
                np.save(split_idx_path, indices)
            indices_shuf.append(indices)

        split_idx = int(input_data.shape[1] * 0.9)

        for shuf, indices in enumerate(indices_shuf):
            for freq in freqs_idx:
                # train-val split

                train_data = FlatEEGDataset(
                    np_input=input_data[freq, indices[:split_idx], :],
                    np_targets=targets[indices[:split_idx]])
                val_data = FlatEEGDataset(
                    np_input=input_data[freq, indices[split_idx:], :],
                    np_targets=targets[indices[split_idx:]])

                # data loaders
                train_loader = DataLoader(train_data,
                                          batch_size=cfg['batch_size'],
                                          shuffle=True,
                                          num_workers=0)
                val_loader = DataLoader(val_data,
                                        batch_size=len(val_data),
                                        shuffle=False,
                                        num_workers=0)

                # model
                idx_hparams = {
                    'n_features': input_data.shape[2],
                    'n_states': len(np.unique(targets)),
                    'n_hidden_nodes': cfg['n_hidden_nodes'],
                    'n_hidden_layers': cfg['n_hidden_layers'],
                    'lr': cfg['lr'],
                    'epochs': cfg['epochs'],
                    'freq_name': freqs[freq],
                    'pred_feature': cfg['pred_feature'],
                    'input_dropout': cfg['input_dropout'],
                    'mlp_dropout': cfg['mlp_dropout'],
                    'weight_decay': cfg['weight_decay'],
                    'num_classes': 3
                }

                model = LitMlpClassifier(hparams=idx_hparams)

                # training
                cv = f'split{shuf}' if cfg['subject_crossval'] else ''
                prefix = 'pow-mean' if (cfg['mat_dict']
                                        == 'dataSorted') else 'IC-MEAN'
                hparams_str = f"bs{cfg['batch_size']}_hn{cfg['n_hidden_nodes']}_lr{cfg['lr']}"
                logger = TensorBoardLogger(
                    save_dir=osp.join(cfg['experiments_dir'],
                                      f"subject-{subject}"),
                    name=f"freq-{freqs[freq]}-single_subject",
                    version=f"MLP{cv}{med_str}-{prefix}_{hparams_str}_"
                    f"{datetime.now().strftime('%Y-%m-%d_%H%M')}")
                trainer = pl.Trainer(max_epochs=cfg['epochs'], logger=logger)
                trainer.fit(model, train_loader, val_loader)
Code example #29
        supervised=args.supervised,
        out_dim=train_dataset.n_classes,
    )

    # ------------
    # model
    # ------------
    args.accelerator = "dp"
    if args.supervised:
        module = SupervisedLearning(args,
                                    encoder,
                                    output_dim=train_dataset.n_classes)
    else:
        module = ContrastiveLearning(args, encoder)

    logger = TensorBoardLogger("runs", name="CLMRv2-{}".format(args.dataset))
    if args.checkpoint_path:
        module = module.load_from_checkpoint(
            args.checkpoint_path,
            encoder=encoder,
            output_dim=train_dataset.n_classes)

    else:
        # ------------
        # training
        # ------------

        if args.supervised:
            early_stopping = EarlyStopping(monitor='Valid/loss', patience=20)
        else:
            early_stopping = None
Code example #30
def get_log_dir(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
) -> (Path, str, str, str):
    """
    Obtains the log_dir used for exp_manager.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        LoggerMisconfigurationError: If trainer is incompatible with arguments
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True and more than one checkpoint was found.
    """
    if explicit_log_dir:  # If explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, explicit_log_dir, exp_dir, name,
                                      version)

    # Default exp_dir to ./nemo_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / 'nemo_experiments')

    # If the user has already defined a logger for the trainer, use the logger defaults for logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None.")
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None."
            )
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None)

        if version is None:
            if trainer.is_slurm_managing_tasks:
                logging.warning(
                    "Running on a slurm cluster. exp_manager will not add a version number."
                )
                version = ""
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime('%Y-%m-%d_%H-%M-%S')
                else:
                    tensorboard_logger = TensorBoardLogger(
                        save_dir=Path(_exp_dir), name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
                os.environ[NEMO_ENV_VARNAME_VERSION] = version

    log_dir = Path(_exp_dir) / Path(str(name)) / Path(str(version))
    return log_dir, str(_exp_dir), name, version