Example #1
def _setup_seed(self, _=None, iter_counter=None, iteration=None):
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)
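Note: the _setup_seed pattern above reseeds the global RNGs with seed + iteration // epoch_length, so every iteration of one epoch sees the same seed and the next epoch gets a new one. A minimal standalone sketch of that idea, with hypothetical names (base_seed, epoch_length, setup_seed) that are not taken from the example:

import torch
from ignite.utils import manual_seed

base_seed = 12
epoch_length = 100  # iterations per epoch

def setup_seed(iteration: int) -> None:
    # Same seed for every iteration of one epoch, a new seed for the next epoch.
    manual_seed(base_seed + iteration // epoch_length)

setup_seed(0)    # any iteration of epoch 0 -> seed 12
epoch0_sample = torch.rand(3)
setup_seed(150)  # any iteration of epoch 1 -> seed 13
epoch1_sample = torch.rand(3)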
Example #2
def user_handler():
    manual_seed(22)
    _ = [
        random.random(),
        torch.rand(2),
    ]
    if with_numpy:
        _ = np.random.rand(2)
Example #3
def _setup_seed(self, _: Any = None, iter_counter: Optional[int] = None, iteration: Optional[int] = None) -> None:
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)  # type: ignore[operator]
Example #4
    def _init_distribution(self):
        self.rank = idist.get_rank()
        manual_seed(42 + self.rank)
        self.device = idist.device()

        if self.train_ds:
            if self.train_ds.sampler is not None:
                sampler = self.train_ds.sampler(self.train_ds,
                                                self.train_ds.get_label)
                isShuffle = False
            else:
                sampler = None
                isShuffle = True
            self.train_loader = idist.auto_dataloader(
                self.train_ds,
                batch_size=self.hparams.train_bs,
                num_workers=self.hparams.train_num_workers,
                shuffle=isShuffle,
                drop_last=True,
                sampler=sampler,
                **self.train_ds.additional_loader_params)

        if self.valid_ds:
            self.valid_loader = idist.auto_dataloader(
                self.valid_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.valid_ds.additional_loader_params)

        if self.test_ds:
            self.test_loader = idist.auto_dataloader(
                self.test_ds,
                batch_size=self.hparams.valid_bs,
                num_workers=self.hparams.valid_num_workers,
                shuffle=False,
                drop_last=False,
                **self.test_ds.additional_loader_params)

        if USE_AMP:
            self._init_optimizer()
            self.model = idist.auto_model(self.model)
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level="O1")
        else:
            self.model = idist.auto_model(self.model)

        if not USE_AMP:
            self._init_optimizer()

        self.optimizer = idist.auto_optim(self.optimizer)

        self._init_scheduler()

        self.criterion = self.criterion.to(self.device)
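Note: the example above combines manual_seed(42 + rank) with the idist.auto_* helpers, which adapt the model, optimizer and dataloaders to whatever distributed configuration is currently active. A minimal sketch of that combination with a toy dataset and model (all names below are illustrative, not taken from the example):

import torch
import torch.nn as nn
import ignite.distributed as idist
from ignite.utils import manual_seed
from torch.utils.data import TensorDataset

def training(local_rank, config):
    manual_seed(config["seed"] + idist.get_rank())  # per-process seed
    model = idist.auto_model(nn.Linear(10, 2))      # wrapped in DDP if needed
    optimizer = idist.auto_optim(torch.optim.SGD(model.parameters(), lr=0.1))
    loader = idist.auto_dataloader(
        TensorDataset(torch.rand(64, 10), torch.randint(0, 2, (64,))),
        batch_size=config["batch_size"],
        shuffle=True,
    )
    # ... build the trainer here and run it on `loader` ...

if __name__ == "__main__":
    with idist.Parallel(backend=None) as parallel:  # serial fallback, no distributed backend
        parallel.run(training, {"seed": 42, "batch_size": 16})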
Example #5
def evaluation(local_rank, config, logger, with_clearml):

    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model,
                                 val_metrics,
                                 config,
                                 with_clearml,
                                 tag="val")

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.TensorboardLogger(
            log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm,
                                    cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation",
                      state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
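Note: the evaluator above derives IoU and mIoU from a single shared ConfusionMatrix metric. A small sketch of how such derived metrics can be attached to an evaluator, using a toy engine and a fake batch (both are assumptions made for illustration):

import torch
from ignite.engine import Engine
from ignite.metrics import ConfusionMatrix, IoU, mIoU

num_classes = 3

def eval_step(engine, batch):
    # Pretend the batch already contains per-pixel logits and integer masks.
    y_pred, y = batch
    return y_pred, y

evaluator = Engine(eval_step)
cm = ConfusionMatrix(num_classes=num_classes)
IoU(cm).attach(evaluator, "IoU")    # per-class IoU tensor
mIoU(cm).attach(evaluator, "mIoU")  # scalar mean IoU

# One fake batch: logits of shape (N, C, H, W), labels of shape (N, H, W)
batch = (torch.rand(2, num_classes, 4, 4), torch.randint(0, num_classes, (2, 4, 4)))
state = evaluator.run([batch])
print(state.metrics["IoU"], state.metrics["mIoU"])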
Example #6
def _setup_seed(self, _: Any = None, iter_counter: Optional[int] = None, iteration: Optional[int] = None) -> None:
    if iter_counter is None:
        le = self._dataloader_len if self._dataloader_len is not None else 1
    elif not iter_counter > 0:
        raise ValueError("iter_counter should be positive value")
    else:
        le = iter_counter
    if iteration is None:
        iteration = self.state.iteration
    manual_seed(self.state.seed + iteration // le)  # type: ignore[operator]
Example #7
def test_psnr():
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # test for float
    manual_seed(42)
    y_pred = torch.rand(8, 3, 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test for YCbCr
    manual_seed(42)
    y_pred = torch.randint(16, 236, (4, 1, 12, 12), dtype=torch.uint8, device=device)
    y = torch.randint(16, 236, (4, 1, 12, 12), dtype=torch.uint8, device=device)
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test for uint8
    manual_seed(42)
    y_pred = torch.randint(0, 256, (4, 3, 16, 16), dtype=torch.uint8, device=device)
    y = (y_pred * 0.8).to(torch.uint8)
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)

    # test with NHW shape
    manual_seed(42)
    y_pred = torch.rand(8, 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test_psnr(y_pred, y, data_range, device)
Example #8
def _test_keep_random_state(with_numpy):

    manual_seed(54)
    true_values = []
    for _ in range(5):
        t = [
            torch.tensor([random.random()]),
            torch.rand(2),
        ]
        if with_numpy:
            t.append(torch.from_numpy(np.random.rand(2)))
        true_values.append(t)

    @keep_random_state
    def user_handler():
        manual_seed(22)
        _ = [
            random.random(),
            torch.rand(2),
        ]
        if with_numpy:
            _ = np.random.rand(2)

    manual_seed(54)
    res_values = []
    for _ in range(5):
        r = [
            torch.tensor([random.random()]),
            torch.rand(2),
        ]
        if with_numpy:
            r.append(torch.from_numpy(np.random.rand(2)))
        res_values.append(r)
        user_handler()

    for a, b in zip(true_values, res_values):
        for i, j in zip(a, b):
            assert (i == j).all()
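Note: keep_random_state (from ignite.engine.deterministic) saves the random, torch and, if available, numpy RNG states before the wrapped function runs and restores them afterwards, which is exactly what the test above verifies. A minimal sketch of the same guarantee:

import torch
from ignite.engine.deterministic import keep_random_state
from ignite.utils import manual_seed

@keep_random_state
def noisy_handler():
    manual_seed(0)   # would normally clobber the global RNG state
    _ = torch.rand(10)

manual_seed(54)
reference = torch.rand(2)

manual_seed(54)
noisy_handler()      # RNG state is restored after the call
resumed = torch.rand(2)

assert torch.equal(reference, resumed)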
Example #9
def test_concepts_snippet_resume():

    # Commented imports required in the snippet
    # import torch
    # from torch.utils.data import DataLoader

    # from ignite.engine import DeterministicEngine
    # from ignite.utils import manual_seed

    seen_batches = []
    manual_seed(seed=15)

    def random_train_data_loader(size):
        data = torch.arange(0, size)
        return DataLoader(data, batch_size=4, shuffle=True)

    def print_train_data(engine, batch):
        i = engine.state.iteration
        e = engine.state.epoch
        print("train", e, i, batch.tolist())
        seen_batches.append(batch)

    trainer = DeterministicEngine(print_train_data)

    print("Original Run")
    manual_seed(56)
    trainer.run(random_train_data_loader(40), max_epochs=2, epoch_length=5)

    original_batches = list(seen_batches)
    seen_batches = []

    print("Resumed Run")
    trainer.load_state_dict({
        "epoch": 1,
        "epoch_length": 5,
        "max_epochs": 2,
        "rng_states": None
    })
    manual_seed(56)
    trainer.run(random_train_data_loader(40))

    resumed_batches = list(seen_batches)
    seen_batches = []
    for b1, b2 in zip(original_batches[5:], resumed_batches):
        assert (b1 == b2).all()
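Note on the resume above: the restored state deliberately leaves rng_states as None, so reproducibility comes from repeating manual_seed(56) right before run(); roughly speaking, DeterministicEngine derives its per-epoch seeds from that global RNG state and reseeds at each epoch boundary, which is why the batches of the resumed second epoch match the original run.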
Example #10
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="ImageNet-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = "stop-on-{}".format(config["stop_iteration"])

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_imagenet_dataloader(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_supervised_trainer(model, optimizer, criterion,
                                        lr_scheduler, train_loader.sampler,
                                        config, logger)

    # Let's now set up an evaluator engine to perform model validation and compute metrics
    metrics = {
        "accuracy": Accuracy(),
        "loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )

    # In order to check training resuming, we can stop training at a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info("Stop training on {} iteration".format(
                trainer.state.iteration))
            trainer.terminate()

    @trainer.on(Events.ITERATION_COMPLETED(every=20))
    def print_acc(engine):
        if rank == 0:
            print("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}"\
                    .format(engine.state.epoch, engine.state.iteration, len(train_loader),
                            engine.state.saved_batch_loss
                            ))

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
Example #11
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------

    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.model}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    # TODO : PLEASE provide your custom datasets and dataloaders configurations
    # we can use `idist.auto_dataloader` to handle distributed configurations
    # TODO : PLEASE replace `kwargs` with your desirable DataLoader arguments
    # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader

    train_dataset, eval_dataset = get_datasets(path=config.data_path)

    train_dataloader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.train_batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
    eval_dataloader = idist.auto_dataloader(
        eval_dataset,
        batch_size=config.eval_batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------

    device = idist.device()
    config.num_iters_per_epoch = len(train_dataloader)
    model, optimizer, loss_fn, lr_scheduler = initialize(config=config)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------

    trainer, evaluator = create_trainers(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

    # ---------------------------------
    # attach metrics to evaluator
    # ---------------------------------
    accuracy = Accuracy(device=device)
    metrics = {
        "eval_accuracy": accuracy,
        "eval_loss": Loss(loss_fn, device=device),
        "eval_error": (1.0 - accuracy) * 100,
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------

    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger
    evaluator.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------

    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model=model,
        trainer=trainer,
        evaluator=evaluator,
        metric_name="eval_accuracy",
        es_metric_name="eval_accuracy",
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=None,
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(
            config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer
        )

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------

    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------

    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ---------------------------------------------
    # run evaluation at every training epoch end
    # with shortcut `on` decorator API and
    # print metrics to the stderr
    # again with `add_event_handler` API
    # for evaluation stats
    # ---------------------------------------------

    @trainer.on(Events.EPOCH_COMPLETED(every=1))
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)
        log_metrics(evaluator, "eval")

    # --------------------------------------------------
    # let's try run evaluation first as a sanity check
    # --------------------------------------------------

    @trainer.on(Events.STARTED)
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------

    trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------

    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # Why is wandb handled differently?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------

    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
Example #12
def training(rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set,
                                         batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set,
                                       batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model,
                                                       metrics=metrics,
                                                       device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator),
                               ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            optimizer=optimizer)

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint,
                                           {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()
Example #13
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            task.connect(config)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    logger.info(
        f"# model parameters (M): {sum([m.numel() for m in model.parameters()]) * 1e-6}"
    )

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now set up an evaluator engine to perform model validation and
    # compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    if config["smoke_test"]:
        logger.info(
            "Reduce the size of training and test dataloader as smoke_test=True"
        )

        def get_batches(loader):
            loader_iter = iter(loader)
            return [next(loader_iter) for _ in range(5)]

        train_loader = get_batches(train_loader)
        test_loader = get_batches(test_loader)

    if config["with_pbar"] and rank == 0:
        ProgressBar(desc="Evaluation (train)",
                    persist=False).attach(train_evaluator)
        ProgressBar(desc="Evaluation (val)", persist=False).attach(evaluator)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED,
        run_validation,
    )

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store the single best model by validation accuracy, starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=1,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(
            lambda *_: trainer.state.epoch > config["num_epochs"] // 2),
        best_model_handler,
    )

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
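Note: the Checkpoint handler above keeps the best model by the Accuracy metric, and only once half of the epochs have passed. A condensed sketch of the best-model part with a toy evaluator (the fake metric value and the output directory are assumptions made for illustration):

import torch.nn as nn
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver

model = nn.Linear(4, 2)
evaluator = Engine(lambda engine, batch: None)

@evaluator.on(Events.COMPLETED)
def fake_metric(engine):
    # Stand-in for a real Accuracy metric attached to the evaluator.
    engine.state.metrics["Accuracy"] = 0.9

best_model_handler = Checkpoint(
    {"model": model},
    DiskSaver("/tmp/best_models", create_dir=True, require_empty=False),
    filename_prefix="best",
    n_saved=1,
    score_name="test_accuracy",
    score_function=Checkpoint.get_default_score_fn("Accuracy"),
)
evaluator.add_event_handler(Events.COMPLETED, best_model_handler)

# Runs the fake evaluation once and writes the "best" checkpoint under /tmp/best_models.
evaluator.run([0], max_epochs=1)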
Example #14
def run_training(local_rank: int, config: ConfigSchema) -> Dict[str, float]:
    rank = idist.get_rank()
    if config.seed is not None:
        manual_seed(config.seed + rank)

    logger = setup_logger(name=config.experiment_name, distributed_rank=local_rank)

    log_basic_info(logger, config)

    if rank == 0:
        prepare_output_directory(config)
        logger.info("Output path: {}".format(config.output_path))

    weak_label_mgr = get_weak_label_manager(config)

    # Setup dataflow, model, optimizer, criterion
    data_loaders = get_dataflow(config, weak_label_mgr)
    train_loader = data_loaders["train"]
    config.num_iters_per_epoch = len(train_loader)

    model, optimizer, criterion = initialize(config, weak_label_mgr)

    metrics = get_metrics(criterion)
    trainer, evaluators = create_trainer_and_evaluators(
        model, optimizer, criterion, data_loaders, metrics, config, logger
    )

    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path, trainer, optimizer, evaluators=evaluators
        )

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluators["val"],
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="test",
    )
    state_at_best_val = StateAtBestVal(
        score_function=lambda: evaluators["val"].state.metrics["accuracy"],
        state_function=lambda: dict(
            {"val_" + key: val for key, val in evaluators["val"].state.metrics.items()},
            **{
                "test_" + key: val
                for key, val in evaluators["test"].state.metrics.items()
            },
            epoch=trainer.state.epoch
        ),
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED, state_at_best_val)

    try:
        trainer.run(train_loader, max_epochs=config.num_epochs)
    except Exception:
        import traceback

        print(traceback.format_exc())
    else:
        assert state_at_best_val.best_state is not None
        tb_logger.writer.add_hparams(  # type: ignore
            get_hparams(config),
            {"hparam/" + key: val for key, val in state_at_best_val.best_state.items()},
        )
    finally:
        if rank == 0:
            tb_logger.close()  # type: ignore

    return evaluators["val"].state.metrics
Example #15
def run(
    data_path="/tmp/MNIST",
    seed=3321,
    mode="xentropy",
    noise_fraction=0.35,
    batch_size=64,
    val_batch_size=1000,
    num_epochs=50,
    lr=0.01,
    momentum=0.5,
    as_pseudo_label=None,
    log_dir="/tmp/output-bootstraping-loss/mnist/",
    with_trains=False,
):
    """Training on noisy labels with bootstrapping

    Args:
        data_path (str): Path to MNIST dataset. Default, "/tmp/MNIST"
        seed (int): Random seed to setup. Default, 3321
        mode (str): Loss function mode: cross-entropy or bootstrapping (soft, hard).
            Choices: 'xentropy', 'soft_bootstrap', 'hard_bootstrap'. Default, 'xentropy'.
        noise_fraction (float): Label noise fraction. Default, 0.35.
        batch_size (int): Input batch size for training. Default, 64.
        val_batch_size (int): input batch size for validation. Default, 1000.
        num_epochs (int): Number of epochs to train. Default, 50.
        lr (float): Learning rate. Default, 0.01.
        momentum (float): SGD momentum. Default, 0.5.
        log_dir (str): Log directory for Tensorboard log output. Default="/tmp/output-bootstraping-loss/mnist/".
        with_trains (bool): if True, experiment Trains logger is setup. Default, False.

    """
    assert torch.cuda.is_available(), "Training should run on a GPU"
    device = "cuda"

    manual_seed(seed)
    logger = setup_logger(name="MNIST-Training")

    now = datetime.now().strftime("%Y%m%d-%H%M%S")

    # Setup output path
    suffix = ""
    if mode == "soft_bootstrap" and (as_pseudo_label is not None
                                     and not as_pseudo_label):
        suffix = "as_xreg"
    output_path = Path(log_dir) / "train_{}_{}_{}_{}__{}".format(
        mode, noise_fraction, suffix, now, num_epochs)

    if not output_path.exists():
        output_path.mkdir(parents=True)

    parameters = {
        "seed": seed,
        "mode": mode,
        "noise_fraction": noise_fraction,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "lr": lr,
        "momentum": momentum,
        "as_pseudo_label": as_pseudo_label,
    }
    log_basic_info(logger, parameters)

    if with_trains:
        from trains import Task

        task = Task.init("BootstrappingLoss - Experiments on MNIST",
                         task_name=output_path.name)
        # Log hyper parameters
        task.connect(parameters)

    train_loader, test_loader = get_data_loaders(data_path, noise_fraction,
                                                 batch_size, val_batch_size)
    model = Net().to(device)
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    if mode == 'xentropy':
        criterion = nn.CrossEntropyLoss()
    elif mode == 'soft_bootstrap':
        if as_pseudo_label is None:
            as_pseudo_label = True
        criterion = SoftBootstrappingLoss(beta=0.95,
                                          as_pseudo_label=as_pseudo_label)
    elif mode == 'hard_bootstrap':
        criterion = HardBootstrappingLoss(beta=0.8)
    else:
        raise ValueError(
            "Wrong mode {}, expected: xentropy, soft_bootstrap or hard_bootstrap"
            .format(mode))

    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device,
                                        non_blocking=True)

    metrics = {
        "Accuracy": Accuracy(),
        "{} loss".format(mode): Loss(criterion),
    }
    if mode is not "xentropy":
        metrics["xentropy loss"] = Loss(nn.CrossEntropyLoss())

    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED | Events.COMPLETED,
                              run_validation)

    evaluators = {"training": train_evaluator, "test": evaluator}
    tb_logger = common.setup_tb_logging(output_path,
                                        trainer,
                                        optimizer,
                                        evaluators=evaluators)

    trainer.run(train_loader, max_epochs=num_epochs)

    test_acc = evaluator.state.metrics["Accuracy"]
    tb_logger.writer.add_hparams(parameters,
                                 {"hparam/test_accuracy": test_acc})

    tb_logger.close()

    return (mode, noise_fraction, as_pseudo_label, test_acc)
Example #16
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------

    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------

    train_dataset, num_channels = get_datasets(config.dataset, config.data_path)

    train_dataloader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------

    device = idist.device()
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler = initialize(config, num_channels)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------
    ws = idist.get_world_size()
    real_labels = torch.ones(config.batch_size // ws, device=device)
    fake_labels = torch.zeros(config.batch_size // ws, device=device)
    fixed_noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)

    trainer = create_trainers(
        config=config,
        netD=netD,
        netG=netG,
        optimizerD=optimizerD,
        optimizerG=optimizerG,
        loss_fn=loss_fn,
        device=device,
        real_labels=real_labels,
        fake_labels=fake_labels,
    )

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------

    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------

    to_save = {'netD': netD, 'netG': netG, 'optimizerD': optimizerD, 'optimizerG': optimizerG, 'trainer': trainer}
    optimizers = {'optimizerD': optimizerD, 'optimizerG': optimizerG}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model={'netD': netD, 'netG': netG},
        trainer=trainer,
        evaluator=trainer,
        metric_name='errD',
        es_metric_name='errD',
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=["errD", "errG", "D_x", "D_G_z1", "D_G_z2"],
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(config=config, trainer=trainer, optimizers=optimizers)

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------

    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------

    @trainer.on(Events.EPOCH_COMPLETED)
    def save_fake_example(engine):
        fake = netG(fixed_noise)
        path = config.output_dir / (FAKE_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(fake.detach(), path, normalize=True)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_real_example(engine):
        img, y = engine.state.batch
        path = config.output_dir / (REAL_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(img, path, normalize=True)

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        if timer_handler:
            logger.info(f"Epoch {engine.state.epoch} done. Time per batch: {timer_handler.value():.3f}[s]")
            timer_handler.reset()

    @trainer.on(Events.ITERATION_COMPLETED(every=config.log_every_iters))
    @idist.one_rank_only()
    def print_logs(engine):
        fname = config.output_dir / LOGS_FNAME
        columns = ["iteration", ] + list(engine.state.metrics.keys())
        values = [str(engine.state.iteration), ] + [str(round(value, 5)) for value in engine.state.metrics.values()]

        with open(fname, "a") as f:
            if f.tell() == 0:
                print("\t".join(columns), file=f)
            print("\t".join(values), file=f)
        message = f"[{engine.state.epoch}/{config.max_epochs}][{engine.state.iteration % len(train_dataloader)}/{len(train_dataloader)}]"
        for name, value in zip(columns, values):
            message += f" | {name}: {value}"

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def create_plots(engine):
        try:
            import matplotlib as mpl

            mpl.use("agg")

            import matplotlib.pyplot as plt
            import pandas as pd

        except ImportError:
            warnings.warn("Loss plots will not be generated -- pandas or matplotlib not found")

        else:
            df = pd.read_csv(config.output_dir / LOGS_FNAME, delimiter="\t", index_col="iteration")
            _ = df.plot(subplots=True, figsize=(20, 20))
            _ = plt.xlabel("Iteration number")
            fig = plt.gcf()
            path = config.output_dir / PLOT_FNAME

            fig.savefig(path)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------

    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------

    trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------

    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # Why is wandb handled differently?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------

    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
Example #17
        splitter.is_fitted = True
        splitter.save(base_transforms)
        logging.info(f"Splitter fitted and saved. [{splitter}]")
    if not synthesizer.is_fitted:
        synthesizer.is_fitted = True
        synthesizer.save()
        logging.info(f"Synthesizer fitted and saved. [{synthesizer}]")

all_dataset_indices = splitter.all_dataset_indices
all_groups = splitter.all_groups
############# extract from fitted transformer ###############
num_edge_types = next(x for x in base_transforms.transforms
                      if isinstance(x, T.NodeConnector)).num_edge_types
logging.info(f"#Edge-Types: {num_edge_types}")
###################################
manual_seed(seed)  # set seed, ignite function
group_iterator = splitter.split()

training_status_meta_dict = get_training_status_meta_dict(
    splitter.path_to_dir, args.model, args.num_splits, args.epochs)

for split_idx, (train_groups, val_groups,
                test_groups) in enumerate(group_iterator):
    logging.info(f"Split: {split_idx + 1}")

    training_status_meta = training_status_meta_dict.get(split_idx)
    logging.info(f"Training status meta: {training_status_meta}")

    if training_status_meta.get(
            "finished", False) or (args.target_splits and
                                   (split_idx + 1) not in args.target_splits):
Example #18
def train(cfg: DictConfig) -> None:

    # Determine device (GPU, CPU, etc.)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model
    model = get_network(cfg)

    # Data Loaders (select example)
    if cfg.example == "classification":
        train_loader, val_loader = get_dataloaders(
            cfg, num_workers=cfg.data_loader_workers, dataset_name="mnist")
        # Your training loop
        trainer = create_classification_training_loop(model,
                                                      cfg,
                                                      "trainer",
                                                      device=device)
        # Your evaluation loop
        evaluator = create_classification_evaluation_loop(model,
                                                          cfg,
                                                          "evaluator",
                                                          device=device)
    else:
        train_loader, val_loader = get_dataloaders(
            cfg, num_workers=cfg.data_loader_workers, dataset_name="reunion")
        # Your training loop
        trainer = create_regression_training_loop(model,
                                                  cfg,
                                                  "trainer",
                                                  device=device)
        # Your evaluation loop
        evaluator = create_regression_evaluation_loop(model,
                                                      cfg,
                                                      "evaluator",
                                                      device=device)

    ld = LogDirector(cfg, engines=[trainer, evaluator])

    # Set configuration defined random seed
    manual_seed(cfg.random_seed)

    ########################################################################
    # Logging Callbacks
    ########################################################################

    # Helper to run the evaluation loop
    def run_evaluator():
        evaluator.run(val_loader)
        # NOTE: Must return the engine we want to log from
        return evaluator

    if cfg.example == "regression":

        ld.set_event_handlers(
            trainer,
            Events.EPOCH_COMPLETED(every=1),
            EngineStateAttr.OUTPUT,
            log_operations=[
                (LOG_OP.LOG_MESSAGE, ["loss"]),
                (LOG_OP.SAVE_IN_DATA_FILE, ["loss"]),
                (
                    LOG_OP.NUMBER_TO_VISDOM,
                    [
                        # First plot, key is "p1"
                        VisPlot(
                            var_name="loss",
                            plot_key="p1",
                            split="mse",
                            env="main",
                            opts={
                                "title": "Train error (loss) ",
                                "x_label": "Iters",
                                "y_label": "nll",
                            },
                        )
                    ],
                ),
            ],
        )

        ld.set_event_handlers(
            trainer,
            Events.EPOCH_COMPLETED(every=20),
            EngineStateAttr.METRICS,
            engine_producer=run_evaluator,
            log_operations=[
                (
                    LOG_OP.NUMBER_TO_VISDOM,
                    [
                        # First plot, key is "p1"
                        VisPlot(
                            var_name="loss",
                            plot_key="p2",
                            split="mse",
                            opts={
                                "title": "Eval error (loss)",
                                "x_label": "Epoch",
                                "y_label": "Loss",
                            },
                            env="main",
                        )
                    ],
                ),
                (
                    LOG_OP.NUMBER_TO_VISDOM,
                    [
                        VisPlot(
                            var_name="mape",
                            plot_key="p3",
                            split="MAPE",
                            opts={
                                "title": "Eval error (MAPE)",
                                "x_label": "Epoch",
                                "y_label": "MAPE",
                            },
                            env="main",
                        )
                    ],
                ),
            ],
        )

        def score_function(engine):
            return evaluator.state.metrics["mape"]

        handler = ModelCheckpoint("models/",
                                  "checkpoint",
                                  score_function=score_function,
                                  n_saved=5)

        trainer.add_event_handler(Events.EPOCH_COMPLETED(every=50), handler,
                                  {"mlp": model})

        ld.set_event_handlers(
            trainer,
            Events.EPOCH_COMPLETED(every=100),
            EngineStateAttr.OUTPUT,
            engine_producer=run_evaluator,
            log_operations=[(
                LOG_OP.VECTOR_TO_VISDOM,
                [
                    VisPlot(
                        var_name="ypred_first",
                        plot_key="e",
                        split="nll",
                        opts={
                            "title": "Predictions vs. Ground Truth,  # ",
                            "x_label": "Time",
                            "y_label": "Energy Consumption",
                        },
                        env="main",
                    )
                ],
            )],
        )

    if cfg.example == "classification":
        ld.set_event_handlers(
            trainer,
            Events.ITERATION_COMPLETED(every=50),
            EngineStateAttr.OUTPUT,
            do_before_logging=postprocess_image_to_log,
            log_operations=[
                (LOG_OP.SAVE_IMAGE, ["im"]),  # Save images to a folder
                (LOG_OP.LOG_MESSAGE, ["nll"
                                      ]),  # Log fields as message in logfile
                (
                    LOG_OP.SAVE_IN_DATA_FILE,
                    ["nll"],
                ),  # Log fields as separate data files
                (
                    LOG_OP.NUMBER_TO_VISDOM,
                    [
                        # First plot, key is "p1"
                        VisPlot(
                            var_name="nll",
                            plot_key="p1",
                            split="nll_1",
                            # Any opts that Visdom supports
                            opts={
                                "title": "Plot 1",
                                "xlabel": "Iters",
                                "fillarea": True,
                            },
                        ),
                        VisPlot(var_name="nll_2", plot_key="p1",
                                split="nll_2"),
                    ],
                ),
                (
                    LOG_OP.IMAGE_TO_VISDOM,
                    [
                        VisImg(
                            var_name="im",
                            img_key="1",
                            env="images",
                            opts={
                                "caption": "a current image",
                                "title": "title"
                            },
                        ),
                        VisImg(
                            var_name="im",
                            img_key="2",
                            env="images",
                            opts={
                                "caption": "a current image",
                                "title": "title"
                            },
                        ),
                    ],
                ),
            ],
        )

        ld.set_event_handlers(
            trainer,
            Events.EPOCH_COMPLETED,
            EngineStateAttr.METRICS,
            # Run the evaluation loop, then do log operations from the return engine
            engine_producer=run_evaluator,
            log_operations=[
                (
                    LOG_OP.LOG_MESSAGE,
                    ["nll", "accuracy"],
                ),  # Log fields as message in logfile
                (
                    LOG_OP.SAVE_IN_DATA_FILE,
                    ["accuracy"],
                ),  # Log fields as separate data files
                (
                    LOG_OP.NUMBER_TO_VISDOM,
                    [
                        # First plot, key is "p1"
                        VisPlot(
                            var_name="accuracy",
                            plot_key="p3",
                            split="acc",
                            # Any opts that Visdom supports
                            opts={
                                "title": "Eval Acc",
                                "xlabel": "Iters"
                            },
                        ),
                        # First plot, key is "p1"
                        VisPlot(
                            var_name="nll",
                            plot_key="p4",
                            split="nll",
                            # Any opts that Visdom supports
                            opts={
                                "title": "Eval Nll",
                                "xlabel": "Iters",
                                "fillarea": True,
                            },
                        ),
                    ],
                ),
            ],
        )

    # Execute training
    if cfg.example == "regression":
        trainer.run(train_loader, max_epochs=1000)
    else:
        trainer.run(train_loader, max_epochs=cfg.mode.train.max_epochs)
Example #19
def training(local_rank, config, logger, with_clearml):

    rank = idist.get_rank()
    manual_seed(config.seed + local_rank)

    train_loader = config.train_loader
    val_loader = config.val_loader
    train_eval_loader = config.train_eval_loader

    model, optimizer, criterion = utils.initialize(config)

    # Setup trainer for this specific task
    trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")
    train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train")

    val_interval = config.get("val_interval", 1)

    # Run validation every val_interval epochs, at the end of the training,
    # and at the beginning if config.start_by_validation is True
    event = Events.EPOCH_COMPLETED(every=val_interval)
    if config.num_epochs % val_interval != 0:
        event |= Events.COMPLETED
    if config.get("start_by_validation", False):
        event |= Events.STARTED

    @trainer.on(event)
    def run_validation():
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_eval_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(val_loader)
        utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    score_metric_name = "mIoU_bg"
    if "es_patience" in config:
        common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name)

    # Store the 2 best models by validation mIoU_bg score:
    common.gen_save_best_models_by_val_score(
        save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml),
        evaluator=evaluator,
        models=model,
        metric_name=score_metric_name,
        n_saved=2,
        trainer=trainer,
        tag="val",
    )

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.setup_tb_logging(
            config.output_path.as_posix(),
            trainer,
            optimizer,
            evaluators={"training": train_evaluator, "validation": evaluator},
        )

        # Log validation predictions as images
        # We define a custom event filter to log the images less frequently (to reduce storage size)
        # - we plot images with masks of the middle validation batch
        # - once every 3 validations and
        # - at the end of the training
        def custom_event_filter(_, val_iteration):
            c1 = val_iteration == len(val_loader) // 2
            c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0
            c2 |= trainer.state.epoch == config.num_epochs
            return c1 and c2

        # Image denormalization function to plot predictions with images
        mean = config.get("mean", (0.485, 0.456, 0.406))
        std = config.get("std", (0.229, 0.224, 0.225))
        img_denormalize = partial(data.denormalize, mean=mean, std=std)

        tb_logger.attach(
            evaluator,
            log_handler=vis.predictions_gt_images_handler(
                img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation",
            ),
            event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter),
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        trainer.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, trainer.state.iteration)

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if idist.get_rank() == 0:
        tb_logger.close()
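The union of `Events.EPOCH_COMPLETED(every=...)`, `Events.COMPLETED` and `Events.STARTED` built above with `|` is a general pattern for scheduling validation. A minimal, self-contained sketch of the same idea (dummy engine and data, values chosen only for illustration, not taken from the example):

from ignite.engine import Engine, Events

trainer = Engine(lambda engine, batch: None)  # dummy process function

val_interval = 3
num_epochs = 10

# validate every 3 epochs, once more at the very end (10 is not a multiple of 3),
# and once before training starts
event = Events.EPOCH_COMPLETED(every=val_interval)
if num_epochs % val_interval != 0:
    event |= Events.COMPLETED
event |= Events.STARTED

@trainer.on(event)
def run_validation():
    print(f"validation at epoch={trainer.state.epoch}")

trainer.run(range(5), max_epochs=num_epochs)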
Example #20
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------

    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    # TODO : PLEASE provide your custom datasets and dataloaders configurations
    # we can use `idist.auto_dataloader` to handle distributed configurations
    # TODO : PLEASE replace `kwargs` with your desired DataLoader arguments
    # See : https://pytorch.org/ignite/distributed.html#ignite.distributed.auto.auto_dataloader

    train_dataset, eval_dataset = get_datasets()

    train_dataloader = idist.auto_dataloader(train_dataset, **kwargs)
    eval_dataloader = idist.auto_dataloader(eval_dataset, **kwargs)

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------

    device = idist.device()
    model, optimizer, loss_fn, lr_scheduler = initialize()

    # -----------------------------
    # trainer and evaluator
    # -----------------------------

    trainer, evaluator = create_trainers(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

    # -------------------------------------------
    # update config with optimizer parameters
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------

    config.__dict__.update(**optimizer.defaults)
    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger
    evaluator.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------

    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model=model,
        trainer=trainer,
        evaluator=evaluator,
        metric_name=None,
        # TODO : replace with the metric name to save the best model
        # if you check `Save the best model by evaluation score` otherwise leave it None
        # metric must be in evaluator.state.metrics.
        es_metric_name=None,
        # TODO : replace with the metric name to early stop
        # if you check `Early stop the training by evaluation score` otherwise leave it None
        # metric must be in evaluator.state.metrics.
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=None,
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(
            config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer
        )

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------

    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------------------
    # let's trigger the custom events we registered
    # we will use an `event_filter` to decide when to trigger them
    # `event_filter` has to return a boolean indicating
    # whether this event should be executed
    # here we will log the gradients on the 1st iteration
    # and every 100 iterations
    # (see the sketch after this example for how such custom events are registered)
    # --------------------------------------------

    @trainer.on(TrainEvents.BACKWARD_COMPLETED(lambda _, ev: (ev % 100 == 0) or (ev == 1)))
    def _():
        # do something interesting
        pass

    # ----------------------------------------
    # here we will use `every` to trigger
    # every 100 iterations
    # ----------------------------------------

    @trainer.on(TrainEvents.OPTIM_STEP_COMPLETED(every=100))
    def _():
        # do something interesting
        pass

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------

    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ---------------------------------------------
    # run evaluation at every training epoch end
    # with shortcut `on` decorator API and
    # print metrics to the stderr
    # again with `add_event_handler` API
    # for evaluation stats
    # ---------------------------------------------

    @trainer.on(Events.EPOCH_COMPLETED(every=1))
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)
        log_metrics(evaluator, "eval")

    # --------------------------------------------------
    # let's try run evaluation first as a sanity check
    # --------------------------------------------------

    @trainer.on(Events.STARTED)
    def _():
        evaluator.run(eval_dataloader, epoch_length=config.eval_epoch_length)

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------
    # TODO : PLEASE provide `max_epochs` parameters

    trainer.run(train_dataloader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------

    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle wandb differently?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------

    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
Example #21
def run(
    train_batch_size,
    val_batch_size,
    epochs,
    lr,
    momentum,
    log_interval,
    log_dir,
    checkpoint_every,
    resume_from,
    crash_iteration=-1,
    deterministic=False,
):
    # Setup seed to have the same model initialization:
    manual_seed(75)

    train_loader, val_loader = get_data_loaders(train_batch_size,
                                                val_batch_size)
    model = Net()
    writer = SummaryWriter(log_dir=log_dir)
    device = "cpu"

    if torch.cuda.is_available():
        device = "cuda"

    model.to(device)  # Move model before creating optimizer
    criterion = nn.NLLLoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)
    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

    # Setup trainer and evaluator
    if deterministic:
        tqdm.write("Setup deterministic trainer")
    trainer = create_supervised_trainer(model,
                                        optimizer,
                                        criterion,
                                        device=device,
                                        deterministic=deterministic)

    evaluator = create_supervised_evaluator(model,
                                            metrics={
                                                "accuracy": Accuracy(),
                                                "nll": Loss(criterion)
                                            },
                                            device=device)

    # Apply learning rate scheduling
    @trainer.on(Events.EPOCH_COMPLETED)
    def lr_step(engine):
        lr_scheduler.step()

    pbar = tqdm(initial=0,
                leave=False,
                total=len(train_loader),
                desc=f"Epoch {0} - loss: {0:.4f} - lr: {lr:.4f}")

    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training_loss(engine):
        lr = optimizer.param_groups[0]["lr"]
        pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr:.4f}"
        pbar.update(log_interval)
        writer.add_scalar("training/loss", engine.state.output,
                          engine.state.iteration)
        writer.add_scalar("lr", lr, engine.state.iteration)

    if crash_iteration > 0:

        @trainer.on(Events.ITERATION_COMPLETED(once=crash_iteration))
        def _(engine):
            raise Exception(f"STOP at {engine.state.iteration}")

    if resume_from is not None:

        @trainer.on(Events.STARTED)
        def _(engine):
            pbar.n = engine.state.iteration % engine.state.epoch_length

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        pbar.refresh()
        evaluator.run(train_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Training Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        writer.add_scalar("training/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("training/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Compute and log validation metrics
    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(engine):
        evaluator.run(val_loader)
        metrics = evaluator.state.metrics
        avg_accuracy = metrics["accuracy"]
        avg_nll = metrics["nll"]
        tqdm.write(
            f"Validation Results - Epoch: {engine.state.epoch} Avg accuracy: {avg_accuracy:.2f} Avg loss: {avg_nll:.2f}"
        )
        pbar.n = pbar.last_print_n = 0
        writer.add_scalar("valdation/avg_loss", avg_nll, engine.state.epoch)
        writer.add_scalar("valdation/avg_accuracy", avg_accuracy,
                          engine.state.epoch)

    # Setup objects to checkpoint
    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": lr_scheduler
    }
    training_checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=None,
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=checkpoint_every),
                              training_checkpoint)

    # Setup logging to print and dump into a file: model weights, model grads and data stats
    # - on the first 3 iterations
    # - on 4 iterations after each checkpoint
    # This helps to compare a resumed training run with the checkpointed one
    # (a worked example of `log_event_filter` follows this example)
    def log_event_filter(e, event):
        if event in [1, 2, 3]:
            return True
        elif 0 <= (event % (checkpoint_every * e.state.epoch_length)) < 5:
            return True
        return False

    fp = Path(log_dir) / ("run.log"
                          if resume_from is None else "resume_run.log")
    fp = fp.as_posix()
    for h in [log_data_stats, log_model_weights, log_model_grads]:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=log_event_filter),
            h,
            model=model,
            fp=fp)

    if resume_from is not None:
        tqdm.write(f"Resume from the checkpoint: {resume_from}")
        checkpoint = torch.load(resume_from)
        Checkpoint.load_objects(to_load=objects_to_checkpoint,
                                checkpoint=checkpoint)

    try:
        # Synchronize random states
        manual_seed(15)
        trainer.run(train_loader, max_epochs=epochs)
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    pbar.close()
    writer.close()
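To make the `log_event_filter` above concrete, here is a tiny standalone check with hypothetical values (`epoch_length=100`, `checkpoint_every=1`, i.e. a checkpoint at the end of every 100-iteration epoch), listing the iterations that would be logged:

epoch_length = 100    # assumed value, for illustration only
checkpoint_every = 1  # assumed value: checkpoint after every epoch

def log_event_filter(event):
    # mirrors the filter above, with the engine's epoch_length inlined
    if event in [1, 2, 3]:
        return True
    elif 0 <= (event % (checkpoint_every * epoch_length)) < 5:
        return True
    return False

print([i for i in range(1, 301) if log_event_filter(i)])
# -> [1, 2, 3, 4, 100, 101, 102, 103, 104, 200, 201, 202, 203, 204, 300]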
Example #22
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-Training", distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        if config["stop_iteration"] is None:
            now = datetime.now().strftime("%Y%m%d-%H%M%S")
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info(f"Output path: {config['output_path']}")

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

        if config["with_clearml"]:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("CIFAR10-Training", task_name=output_path.stem)
            task.connect_configuration(config)
            # Log hyper parameters
            hyper_params = [
                "model",
                "batch_size",
                "momentum",
                "weight_decay",
                "num_epochs",
                "learning_rate",
                "num_warmup_epochs",
            ]
            task.connect({k: config[k] for k in hyper_params})

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler, train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_evaluator(model, metrics=metrics, config=config)
    train_evaluator = create_evaluator(model, metrics=metrics, config=config)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics)

    trainer.add_event_handler(Events.EPOCH_COMPLETED(every=config["validate_every"]) | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path, trainer, optimizer, evaluators=evaluators)

    # Store 2 best models by validation accuracy starting from num_epochs / 2:
    best_model_handler = Checkpoint(
        {"model": model},
        get_save_handler(config),
        filename_prefix="best",
        n_saved=2,
        global_step_transform=global_step_from_engine(trainer),
        score_name="test_accuracy",
        score_function=Checkpoint.get_default_score_fn("Accuracy"),
    )
    evaluator.add_event_handler(
        Events.COMPLETED(lambda *_: trainer.state.epoch > config["num_epochs"] // 2), best_model_handler
    )

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        logger.exception("")
        raise e

    if rank == 0:
        tb_logger.close()
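The `Checkpoint` above keeps the two highest-accuracy models, using `Checkpoint.get_default_score_fn("Accuracy")` as the score. For a metric where lower is better, a custom `score_function` can simply negate the value; a self-contained, hedged sketch with dummy engines, a fake metric and a placeholder output directory:

import torch.nn as nn
from ignite.engine import Engine, Events
from ignite.handlers import Checkpoint, DiskSaver, global_step_from_engine

model = nn.Linear(4, 2)
trainer = Engine(lambda e, b: None)
evaluator = Engine(lambda e, b: None)

@evaluator.on(Events.COMPLETED)
def fake_metrics():
    # stand-in for real attached metrics; here the "loss" improves every epoch
    evaluator.state.metrics["Loss"] = 1.0 / (trainer.state.epoch + 1)

best_model_handler = Checkpoint(
    {"model": model},
    DiskSaver("/tmp/best_models", create_dir=True, require_empty=False),  # placeholder path
    filename_prefix="best",
    n_saved=2,
    global_step_transform=global_step_from_engine(trainer),
    score_name="neg_val_loss",
    score_function=lambda engine: -engine.state.metrics["Loss"],  # lower loss -> higher score
)
evaluator.add_event_handler(Events.COMPLETED, best_model_handler)

@trainer.on(Events.EPOCH_COMPLETED)
def validate():
    evaluator.run([0])  # dummy "validation" pass

trainer.run([0], max_epochs=3)
print(best_model_handler.last_checkpoint)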
Example #23
def training(local_rank, cfg):

    logger = setup_logger("FixMatch Training", distributed_rank=idist.get_rank())

    if local_rank == 0:
        logger.info(cfg.pretty())

    rank = idist.get_rank()
    manual_seed(cfg.seed + rank)
    device = idist.device()

    model, ema_model, optimizer, sup_criterion, lr_scheduler = utils.initialize(cfg)

    unsup_criterion = instantiate(cfg.solver.unsupervised_criterion)

    cta = get_default_cta()

    (
        supervised_train_loader,
        test_loader,
        unsup_train_loader,
        cta_probe_loader,
    ) = utils.get_dataflow(cfg, cta=cta, with_unsup=True)

    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()

        x, y = batch["sup_batch"]["image"], batch["sup_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        weak_x, strong_x = (
            batch["unsup_batch"]["image"],
            batch["unsup_batch"]["strong_aug"],
        )
        if weak_x.device != device:
            weak_x = weak_x.to(device, non_blocking=True)
            strong_x = strong_x.to(device, non_blocking=True)

        # according to TF code: single forward pass on concat data: [x, weak_x, strong_x]
        le = 2 * engine.state.mu_ratio + 1
        # Why interleave: https://github.com/google-research/fixmatch/issues/20#issuecomment-613010277
        # We need to interleave due to multi-GPU batch norm issues. Let's say we have two GPUs, and our batch is
        # comprised of labeled (L) and unlabeled (U) images. Let's use a labeled batch size of 2 to make the
        # following example easier to visualize.
        #
        # - Without interleaving, we have a batch LLUUUUUU...U (there are 14 U). When the batch is split to be passed
        # to both GPUs, we'll have two batches LLUUUUUU and UUUUUUUU. Note that all labeled examples ended up in batch1
        # sent to GPU1. The problem here is that batch norm will be computed per batch and the moments will lack
        # consistency between batches.
        #
        # - With interleaving, by contrast, the two batches will be LUUUUUUU and LUUUUUUU. As you can notice the
        # batches have the same distribution of labeled and unlabeled samples and will therefore have more consistent
        # moments.
        #
        x_cat = interleave(torch.cat([x, weak_x, strong_x], dim=0), le)
        y_pred_cat = model(x_cat)
        y_pred_cat = deinterleave(y_pred_cat, le)

        idx1 = len(x)
        idx2 = idx1 + len(weak_x)
        y_pred = y_pred_cat[:idx1, ...]
        y_weak_preds = y_pred_cat[idx1:idx2, ...]  # logits_weak
        y_strong_preds = y_pred_cat[idx2:, ...]  # logits_strong

        # supervised learning:
        sup_loss = sup_criterion(y_pred, y)

        # unsupervised learning:
        y_weak_probas = torch.softmax(y_weak_preds, dim=1).detach()
        y_pseudo = y_weak_probas.argmax(dim=1)
        max_y_weak_probas, _ = y_weak_probas.max(dim=1)
        unsup_loss_mask = (
            max_y_weak_probas >= engine.state.confidence_threshold
        ).float()
        unsup_loss = (
            unsup_criterion(y_strong_preds, y_pseudo) * unsup_loss_mask
        ).mean()

        total_loss = sup_loss + engine.state.lambda_u * unsup_loss

        total_loss.backward()

        optimizer.step()

        return {
            "total_loss": total_loss.item(),
            "sup_loss": sup_loss.item(),
            "unsup_loss": unsup_loss.item(),
            "mask": unsup_loss_mask.mean().item(),  # this should not be averaged for DDP
        }

    output_names = ["total_loss", "sup_loss", "unsup_loss", "mask"]

    trainer = trainers.create_trainer(
        train_step,
        output_names=output_names,
        model=model,
        ema_model=ema_model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        supervised_train_loader=supervised_train_loader,
        test_loader=test_loader,
        cfg=cfg,
        logger=logger,
        cta=cta,
        unsup_train_loader=unsup_train_loader,
        cta_probe_loader=cta_probe_loader,
    )

    trainer.state.confidence_threshold = cfg.ssl.confidence_threshold
    trainer.state.lambda_u = cfg.ssl.lambda_u
    trainer.state.mu_ratio = cfg.ssl.mu_ratio

    distributed = idist.get_world_size() > 1

    @trainer.on(Events.ITERATION_COMPLETED(every=cfg.ssl.cta_update_every))
    def update_cta_rates():
        batch = trainer.state.batch
        x, y = batch["cta_probe_batch"]["image"], batch["cta_probe_batch"]["target"]
        if x.device != device:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

        policies = batch["cta_probe_batch"]["policy"]

        ema_model.eval()
        with torch.no_grad():
            y_pred = ema_model(x)
            y_probas = torch.softmax(y_pred, dim=1)  # (N, C)

            if distributed:
                # pack per-op errors, gather them from all workers, then update CTA rates
                error_per_op = []
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    for k, bins in policy:
                        error_per_op.append(pack_as_tensor(k, bins, error))
                error_per_op = torch.stack(error_per_op)
                # all gather
                tensor_list = idist.all_gather(error_per_op)
                # update cta rates
                for t in tensor_list:
                    k, bins, error = unpack_from_tensor(t)
                    cta.update_rates([(k, bins)], 1.0 - 0.5 * error)
            else:
                # single-process case: update CTA rates directly
                for y_proba, t, policy in zip(y_probas, y, policies):
                    error = y_proba
                    error[t] -= 1
                    error = torch.abs(error).sum()
                    cta.update_rates(policy, 1.0 - 0.5 * error.item())

    epoch_length = cfg.solver.epoch_length
    num_epochs = cfg.solver.num_epochs if not cfg.debug else 2
    try:
        trainer.run(
            supervised_train_loader, epoch_length=epoch_length, max_epochs=num_epochs
        )
    except Exception as e:
        import traceback

        print(traceback.format_exc())
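`interleave` and `deinterleave` above are helpers defined elsewhere in this example's codebase. A common implementation, assumed here (the reshape/transpose trick from the FixMatch reference code, not necessarily this repo's exact version), together with a quick round-trip check:

import torch

def interleave(x, size):
    # [B * size, ...] -> (B, size, ...) -> transpose -> flatten, so labeled and
    # unlabeled samples are spread evenly across the concatenated batch
    s = list(x.shape)
    return x.reshape([-1, size] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])

def deinterleave(x, size):
    # exact inverse of `interleave`
    s = list(x.shape)
    return x.reshape([size, -1] + s[1:]).transpose(0, 1).reshape([-1] + s[1:])

# round-trip check with mu_ratio=7 and a labeled batch of 2, as in the comment above
mu, batch = 7, 2
x = torch.randn(batch, 3, 8, 8)              # labeled
weak_x = torch.randn(mu * batch, 3, 8, 8)    # weakly augmented unlabeled
strong_x = torch.randn(mu * batch, 3, 8, 8)  # strongly augmented unlabeled
le = 2 * mu + 1

x_cat = torch.cat([x, weak_x, strong_x], dim=0)
assert torch.equal(deinterleave(interleave(x_cat, le), le), x_cat)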
Example #24
def training(local_rank, config):

    config["device"] = "cuda" if config["active_gpu_ids"] else "cpu"

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)

    device = idist.device()

    logger = setup_logger(name="Carbon Black Semantic Segmentation Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]

    if rank == 0:
        if config["stop_iteration"] is None:
            now = utils.get_time_stamp()
        else:
            now = f"stop-on-{config['stop_iteration']}"

        folder_name = (
            f"{config['architecture']}-{config['encoder']}-{config['encoder_weights']}_"
            f"backend-{idist.backend()}-{idist.get_world_size()}_{now}")

        output_path = Path(output_path) / folder_name
        output_path.mkdir(parents=True, exist_ok=True)
        config["output_path"] = output_path.as_posix()
        config["task_name"] = output_path.stem

        logger.info(f"Output path: {output_path}")

        if "cuda" in idist.device().type:
            config["cuda_device_name"] = torch.cuda.get_device_name(local_rank)

        setup_trains_logging(config)

    dataloader_train, dataloader_val = get_dataloaders(config)

    config["num_iterations_per_epoch"] = len(dataloader_train)
    config["num_epochs"] = round(config["num_iterations"] /
                                 config["num_iterations_per_epoch"])
    model = modeling.get_model(config)

    optimizer = get_optimizer(model, config)
    loss = get_loss()

    lr_scheduler = get_lr_scheduler(optimizer, config)

    trainer = create_trainer(model, optimizer, loss, lr_scheduler,
                             dataloader_train.sampler, config, logger)

    metrics = get_metrics(loss)

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    evaluator_train = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": evaluator_train, "validation": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

        example_prediction_logger = ExamplePredictionLogger(
            tb_logger, model, device)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = evaluator_train.run(dataloader_train)
        data_subset = "Train"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)

        state = evaluator.run(dataloader_val)
        data_subset = "Val"
        log_metrics(logger, epoch, state.times["COMPLETED"], data_subset,
                    state.metrics)
        log_confusion_matrix(tb_logger, epoch, data_subset, state.metrics)
        example_prediction_logger.log_visualization(dataloader_val.dataset,
                                                    epoch)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    # Store 3 best models by validation accuracy:
    common.gen_save_best_models_by_val_score(
        save_handler=get_save_handler(config),
        evaluator=evaluator,
        models={"model": model},
        metric_name="accuracy",
        n_saved=3,
        trainer=trainer,
        tag="validation",
    )

    # TODO: Add early stopping

    # In order to check training resuming we can stop training on a given iteration
    if config["stop_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["stop_iteration"]))
        def _():
            logger.info(
                f"Stop training on {trainer.state.iteration} iteration")
            trainer.terminate()

    # noinspection PyBroadException
    try:
        trainer.run(dataloader_train, max_epochs=config["num_epochs"])
    except Exception:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        # noinspection PyUnboundLocalVariable
        tb_logger.close()
Example #25
def run(output_path, config):

    distributed = dist.is_available() and dist.is_initialized()
    rank = dist.get_rank() if distributed else 0

    manual_seed(config["seed"] + rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = utils.get_dataflow(config, distributed)
    model, optimizer = utils.get_model_optimizer(config, distributed)
    criterion = nn.CrossEntropyLoss().to(utils.device)

    le = len(train_loader)
    milestones_values = [
        (0, 0.0),
        (le * config["num_warmup_epochs"], config["learning_rate"]),
        (le * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    # Setup Ignite trainer:
    # - let's define training step
    # - add other common handlers:
    #    - TerminateOnNan,
    #    - handler to setup learning rate scheduling,
    #    - ModelCheckpoint
    #    - RunningAverage` on `train_step` output
    #    - Two progress bars on epochs and optionally on iterations

    def train_step(engine, batch):

        x = convert_tensor(batch[0], device=utils.device, non_blocking=True)
        y = convert_tensor(batch[1], device=utils.device, non_blocking=True)

        model.train()
        # Supervised part
        y_pred = model(x)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return {
            "batch loss": loss.item(),
        }

    if config["deterministic"] and rank == 0:
        print("Setup deterministic trainer")
    trainer = Engine(train_step) if not config["deterministic"] else DeterministicEngine(train_step)
    train_sampler = train_loader.sampler if distributed else None
    to_save = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
    metric_names = [
        "batch loss",
    ]
    common.setup_common_training_handlers(
        trainer,
        train_sampler=train_sampler,
        to_save=to_save,
        save_every_iters=config["checkpoint_every"],
        output_path=output_path,
        lr_scheduler=lr_scheduler,
        output_names=metric_names,
        with_pbar_on_iters=config["display_iters"],
        log_every_iters=10,
    )

    if rank == 0:
        # Setup Tensorboard logger - wrapper on SummaryWriter
        tb_logger = TensorboardLogger(log_dir=output_path)
        # Attach logger to the trainer and log trainer's metrics (stored in trainer.state.metrics) every iteration
        tb_logger.attach(
            trainer,
            log_handler=OutputHandler(tag="train", metric_names=metric_names),
            event_name=Events.ITERATION_COMPLETED,
        )
        # log optimizer's parameters: "lr" every iteration
        tb_logger.attach(
            trainer, log_handler=OptimizerParamsHandler(optimizer, param_name="lr"), event_name=Events.ITERATION_STARTED
        )

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "accuracy": Accuracy(device=utils.device if distributed else None),
        "loss": Loss(criterion, device=utils.device if distributed else None),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=utils.device, non_blocking=True)

    def run_validation(engine):
        train_evaluator.run(train_loader)
        evaluator.run(test_loader)

    trainer.add_event_handler(Events.EPOCH_STARTED(every=config["validate_every"]), run_validation)
    trainer.add_event_handler(Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup progress bar on evaluation engines
        if config["display_iters"]:
            ProgressBar(persist=False, desc="Train evaluation").attach(train_evaluator)
            ProgressBar(persist=False, desc="Test evaluation").attach(evaluator)

        # Let's log metrics of `train_evaluator` stored in `train_evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            train_evaluator,
            log_handler=OutputHandler(
                tag="train", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Let's log metrics of `evaluator` stored in `evaluator.state.metrics` when validation run is done
        tb_logger.attach(
            evaluator,
            log_handler=OutputHandler(
                tag="test", metric_names="all", global_step_transform=global_step_from_engine(trainer)
            ),
            event_name=Events.COMPLETED,
        )

        # Store 3 best models by validation accuracy:
        common.save_best_model_by_val_score(
            output_path, evaluator, model=model, metric_name="accuracy", n_saved=3, trainer=trainer, tag="test"
        )

        # Optionally log model gradients
        if config["log_model_grads_every"] is not None:
            tb_logger.attach(
                trainer,
                log_handler=GradsHistHandler(model, tag=model.__class__.__name__),
                event_name=Events.ITERATION_COMPLETED(every=config["log_model_grads_every"]),
            )

    # In order to check training resuming we can emulate a crash
    if config["crash_iteration"] is not None:

        @trainer.on(Events.ITERATION_STARTED(once=config["crash_iteration"]))
        def _(engine):
            raise Exception("STOP at iteration: {}".format(engine.state.iteration))

    resume_from = config["resume_from"]
    if resume_from is not None:
        checkpoint_fp = Path(resume_from)
        assert checkpoint_fp.exists(), "Checkpoint '{}' is not found".format(checkpoint_fp.as_posix())
        print("Resume from a checkpoint: {}".format(checkpoint_fp.as_posix()))
        checkpoint = torch.load(checkpoint_fp.as_posix())
        Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)

    try:
        trainer.run(train_loader, max_epochs=config["num_epochs"])
    except Exception as e:
        import traceback

        print(traceback.format_exc())

    if rank == 0:
        tb_logger.close()
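The `PiecewiseLinear` schedule above ramps the learning rate linearly from 0 up to `learning_rate` over the warm-up epochs and back down to 0 by the last epoch. A standalone sketch with made-up numbers (10 iterations per epoch, 2 warm-up epochs, 10 epochs, peak lr 0.1), attaching the scheduler by hand instead of via the common handlers:

import torch
from ignite.contrib.handlers import PiecewiseLinear
from ignite.engine import Engine, Events

model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.0)

le, num_warmup_epochs, num_epochs, lr = 10, 2, 10, 0.1  # made-up values
milestones_values = [(0, 0.0), (le * num_warmup_epochs, lr), (le * num_epochs, 0.0)]
lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

trainer = Engine(lambda e, b: None)
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

lrs = []
trainer.add_event_handler(Events.ITERATION_COMPLETED, lambda e: lrs.append(optimizer.param_groups[0]["lr"]))

trainer.run(range(le), max_epochs=num_epochs)
print(lrs[0], max(lrs), lrs[-1])  # 0.0 at the start, peak 0.1 right after warm-up, decays back toward 0.0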
Example #26
def training(local_rank, config):

    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    logger = setup_logger(name="CIFAR10-QAT-Training",
                          distributed_rank=local_rank)

    log_basic_info(logger, config)

    output_path = config["output_path"]
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")

        folder_name = "{}_backend-{}-{}_{}".format(config["model"],
                                                   idist.backend(),
                                                   idist.get_world_size(), now)
        output_path = Path(output_path) / folder_name
        if not output_path.exists():
            output_path.mkdir(parents=True)
        config["output_path"] = output_path.as_posix()
        logger.info("Output path: {}".format(config["output_path"]))

        if "cuda" in device.type:
            config["cuda device name"] = torch.cuda.get_device_name(local_rank)

    # Setup dataflow, model, optimizer, criterion
    train_loader, test_loader = get_dataflow(config)

    config["num_iters_per_epoch"] = len(train_loader)
    model, optimizer, criterion, lr_scheduler = initialize(config)

    # Create trainer for current task
    trainer = create_trainer(model, optimizer, criterion, lr_scheduler,
                             train_loader.sampler, config, logger)

    # Let's now setup evaluator engine to perform model's validation and compute metrics
    metrics = {
        "Accuracy": Accuracy(),
        "Loss": Loss(criterion),
    }

    # We define two evaluators as they won't have exactly the same role:
    # - `evaluator` will save the best model based on validation score
    evaluator = create_supervised_evaluator(model,
                                            metrics=metrics,
                                            device=device,
                                            non_blocking=True)
    train_evaluator = create_supervised_evaluator(model,
                                                  metrics=metrics,
                                                  device=device,
                                                  non_blocking=True)

    def run_validation(engine):
        epoch = trainer.state.epoch
        state = train_evaluator.run(train_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Train",
                    state.metrics)
        state = evaluator.run(test_loader)
        log_metrics(logger, epoch, state.times["COMPLETED"], "Test",
                    state.metrics)

    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config["validate_every"])
        | Events.COMPLETED, run_validation)

    if rank == 0:
        # Setup TensorBoard logging on trainer and evaluators. Logged values are:
        #  - Training metrics, e.g. running average loss values
        #  - Learning rate
        #  - Evaluation train/test metrics
        evaluators = {"training": train_evaluator, "test": evaluator}
        tb_logger = common.setup_tb_logging(output_path,
                                            trainer,
                                            optimizer,
                                            evaluators=evaluators)

    # Store the best model by validation accuracy:
    common.save_best_model_by_val_score(
        output_path=config["output_path"],
        evaluator=evaluator,
        model=model,
        metric_name="Accuracy",
        n_saved=1,
        trainer=trainer,
        tag="test",
    )

    trainer.run(train_loader, max_epochs=config["num_epochs"])

    if rank == 0:
        tb_logger.close()
Example #27
def _test_distrib_integration(device, atol=1e-8):

    rank = idist.get_rank()
    n_iters = 100
    s = 10
    offset = n_iters * s

    # test for float
    manual_seed(42)
    y_pred = torch.rand(offset * idist.get_world_size(),
                        3,
                        28,
                        28,
                        device=device)
    y = y_pred * 0.65
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    # test for YCbCr
    manual_seed(42)
    y_pred = torch.randint(16,
                           236, (offset * idist.get_world_size(), 1, 12, 12),
                           dtype=torch.uint8,
                           device=device)
    cbcr_pred = torch.randint(16,
                              241,
                              (offset * idist.get_world_size(), 2, 12, 12),
                              dtype=torch.uint8,
                              device=device)
    y = torch.randint(16,
                      236, (offset * idist.get_world_size(), 1, 12, 12),
                      dtype=torch.uint8,
                      device=device)
    cbcr = torch.randint(16,
                         241, (offset * idist.get_world_size(), 2, 12, 12),
                         dtype=torch.uint8,
                         device=device)

    y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr),
                                                                 dim=1)
    data_range = (y[:, 0, ...].max() - y[:, 0, ...].min()).cpu().item()
    _test(
        y_pred=y_pred,
        y=y,
        data_range=data_range,
        metric_device="cpu",
        n_iters=n_iters,
        s=s,
        offset=offset,
        rank=rank,
        atol=atol,
        output_transform=lambda x: (x[0][:, 0, ...], x[1][:, 0, ...]),
        compute_y_channel=True,
    )

    # test for uint8
    manual_seed(42)
    y_pred = torch.randint(0,
                           256, (offset * idist.get_world_size(), 3, 16, 16),
                           device=device,
                           dtype=torch.uint8)
    y = (y_pred * 0.65).to(torch.uint8)
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    # test with NHW shape
    manual_seed(42)
    y_pred = torch.rand(offset * idist.get_world_size(), 28, 28, device=device)
    y = y_pred * 0.8
    data_range = (y.max() - y.min()).cpu().item()
    _test(y_pred, y, data_range, "cpu", n_iters, s, offset, rank, atol=atol)

    if torch.device(device).type != "xla":
        manual_seed(42)
        y_pred = torch.rand(offset * idist.get_world_size(),
                            3,
                            28,
                            28,
                            device=device)
        y = y_pred * 0.65
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred,
              y,
              data_range,
              idist.device(),
              n_iters,
              s,
              offset,
              rank,
              atol=atol)

        # test for YCbCr
        manual_seed(42)
        y_pred = torch.randint(16,
                               236,
                               (offset * idist.get_world_size(), 1, 12, 12),
                               dtype=torch.uint8,
                               device=device)
        cbcr_pred = torch.randint(16,
                                  241,
                                  (offset * idist.get_world_size(), 2, 12, 12),
                                  dtype=torch.uint8,
                                  device=device)
        y = torch.randint(16,
                          236, (offset * idist.get_world_size(), 1, 12, 12),
                          dtype=torch.uint8,
                          device=device)
        cbcr = torch.randint(16,
                             241, (offset * idist.get_world_size(), 2, 12, 12),
                             dtype=torch.uint8,
                             device=device)
        y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr),
                                                                     dim=1)
        data_range = (y[:, 0, ...].max() - y[:, 0, ...].min()).cpu().item()
        _test(
            y_pred=y_pred,
            y=y,
            data_range=data_range,
            metric_device=idist.device(),
            n_iters=n_iters,
            s=s,
            offset=offset,
            rank=rank,
            atol=atol,
            output_transform=lambda x: (x[0][:, 0, ...], x[1][:, 0, ...]),
            compute_y_channel=True,
        )

        manual_seed(42)
        y_pred = torch.randint(0,
                               256,
                               (offset * idist.get_world_size(), 3, 16, 16),
                               device=device,
                               dtype=torch.uint8)
        y = (y_pred * 0.65).to(torch.uint8)
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred,
              y,
              data_range,
              idist.device(),
              n_iters,
              s,
              offset,
              rank,
              atol=atol)

        # test with NHW shape
        manual_seed(42)
        y_pred = torch.rand(offset * idist.get_world_size(),
                            28,
                            28,
                            device=device)
        y = y_pred * 0.8
        data_range = (y.max() - y.min()).cpu().item()
        _test(y_pred,
              y,
              data_range,
              idist.device(),
              n_iters,
              s,
              offset,
              rank,
              atol=atol)
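The distributed test above appears to exercise an image-quality metric such as `ignite.metrics.PSNR` (hence the `data_range` arguments and the Y-channel `output_transform`); that is an inference from the shape of the test, not stated in the snippet. For reference, minimal non-distributed use of `PSNR` looks roughly like this:

import torch
from ignite.engine import Engine
from ignite.metrics import PSNR

# the evaluator's process function just forwards (y_pred, y) pairs
evaluator = Engine(lambda engine, batch: batch)

# data_range is the value range of the ground-truth images (max - min)
psnr = PSNR(data_range=1.0)
psnr.attach(evaluator, "psnr")

y_pred = torch.rand(8, 3, 28, 28)
y = y_pred * 0.75
state = evaluator.run([(y_pred, y)])
print(state.metrics["psnr"])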
Example #28
    def _train(save_iter=None, save_epoch=None, sd=None):
        w_norms = []
        grad_norms = []
        data = []
        chkpt = []

        manual_seed(12)
        arch = [
            nn.Conv2d(3, 10, 3),
            nn.ReLU(),
            nn.Conv2d(10, 10, 3),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(10, 5),
            nn.ReLU(),
            nn.Linear(5, 2),
        ]
        if with_dropout:
            arch.insert(2, nn.Dropout2d())
            arch.insert(-2, nn.Dropout())

        model = nn.Sequential(*arch).to(device)
        opt = SGD(model.parameters(), lr=0.001)

        def proc_fn(e, b):
            from ignite.engine.deterministic import _get_rng_states, _repr_rng_state

            s = _repr_rng_state(_get_rng_states())
            model.train()
            opt.zero_grad()
            y = model(b.to(device))
            y.sum().backward()
            opt.step()
            if debug:
                print(trainer.state.iteration, trainer.state.epoch,
                      "proc_fn - b.shape", b.shape,
                      torch.norm(y).item(), s)

        trainer = DeterministicEngine(proc_fn)

        if save_iter is not None:
            ev = Events.ITERATION_COMPLETED(once=save_iter)
        elif save_epoch is not None:
            ev = Events.EPOCH_COMPLETED(once=save_epoch)
            save_iter = save_epoch * (data_size // batch_size)

        @trainer.on(ev)
        def save_chkpt(_):
            if debug:
                print(trainer.state.iteration, "save_chkpt")
            fp = dirname / "test.pt"
            from ignite.engine.deterministic import _repr_rng_state

            tsd = trainer.state_dict()
            if debug:
                print("->", _repr_rng_state(tsd["rng_states"]))
            torch.save([model.state_dict(), opt.state_dict(), tsd], fp)
            chkpt.append(fp)

        def log_event_filter(_, event):
            if (event // save_iter == 1) and 1 <= (event % save_iter) <= 5:
                return True
            return False

        @trainer.on(Events.ITERATION_COMPLETED(event_filter=log_event_filter))
        def write_data_grads_weights(e):
            x = e.state.batch
            i = e.state.iteration
            data.append([i, x.mean().item(), x.std().item()])

            total = [0.0, 0.0]
            out1 = []
            out2 = []
            for p in model.parameters():
                n1 = torch.norm(p).item()
                n2 = torch.norm(p.grad).item()
                out1.append(n1)
                out2.append(n2)
                total[0] += n1
                total[1] += n2
            w_norms.append([i, total[0]] + out1)
            grad_norms.append([i, total[1]] + out2)

        if sd is not None:
            sd = torch.load(sd)
            model.load_state_dict(sd[0])
            opt.load_state_dict(sd[1])
            from ignite.engine.deterministic import _repr_rng_state

            if debug:
                print("-->", _repr_rng_state(sd[2]["rng_states"]))
            trainer.load_state_dict(sd[2])

        manual_seed(32)
        trainer.run(random_train_data_loader(size=data_size), max_epochs=5)
        return {
            "sd": chkpt,
            "data": data,
            "grads": grad_norms,
            "weights": w_norms
        }
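The helper above checkpoints the `DeterministicEngine` state (including its `rng_states`) so that a run resumed from the checkpoint reproduces the original one. A minimal, hedged sketch of that save-and-resume round trip at an epoch boundary (toy in-memory data; the file path and seed value are placeholders):

import torch
from ignite.engine import Events
from ignite.engine.deterministic import DeterministicEngine

def make_trainer(collected):
    def proc_fn(engine, batch):
        collected.append(batch.sum().item())
    return DeterministicEngine(proc_fn)

def make_data():
    # recreate the same toy dataset for both runs
    g = torch.Generator().manual_seed(0)
    return [torch.rand(4, generator=g) for _ in range(8)]

full, resumed = [], []

trainer = make_trainer(full)

@trainer.on(Events.EPOCH_COMPLETED(once=1))
def save_state():
    torch.save(trainer.state_dict(), "/tmp/det_trainer.pt")  # placeholder path

torch.manual_seed(32)  # synchronize RNG before running, as the helper above does
trainer.run(make_data(), max_epochs=2)

# a fresh engine loads the checkpointed state (incl. rng_states) and replays epoch 2 only
trainer2 = make_trainer(resumed)
trainer2.load_state_dict(torch.load("/tmp/det_trainer.pt"))
torch.manual_seed(32)
trainer2.run(make_data())

assert resumed == full[len(full) // 2:]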
Example #29
def run(local_rank, config):

    # ----------------------
    # Make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)
    device = idist.device()

    # -----------------------
    # Create output folder
    # -----------------------
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.model}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    train_loader, test_loader = get_dataflow(config)

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------
    config.num_iters_per_epoch = len(train_loader)
    model, optimizer, loss_fn, lr_scheduler = initialize(config)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------
    trainer, evaluator = create_trainers(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        device=device,
    )

    # ---------------------------------
    # attach metrics to evaluator
    # ---------------------------------
    metrics = {
        "eval_accuracy": Accuracy(output_transform=thresholded_output_transform, device=device),
        "eval_loss": Loss(loss_fn, device=device),
    }

    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------
    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger
    evaluator.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------
    to_save = {"model": model, "optimizer": optimizer, "trainer": trainer, "lr_scheduler": lr_scheduler}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model=model,
        trainer=trainer,
        evaluator=evaluator,
        metric_name="eval_accuracy",
        es_metric_name="eval_accuracy",
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=None,
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(
            config=config, trainer=trainer, evaluator=evaluator, optimizers=optimizer
        )

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------
    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters), log_metrics, tag="train")

    # ---------------------------------------------
    # run evaluation at every training epoch end
    # with shortcut `on` decorator API and
    # print metrics to the stderr
    # again with `add_event_handler` API
    # for evaluation stats
    # ---------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED(every=config.validate_every))
    def _():
        evaluator.run(test_loader, epoch_length=config.eval_epoch_length)
        log_metrics(evaluator, tag="eval")

    # --------------------------------------------------
    # let's try run evaluation first as a sanity check
    # --------------------------------------------------
    @trainer.on(Events.STARTED)
    def _():
        evaluator.run(test_loader, epoch_length=config.eval_epoch_length)

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------
    trainer.run(train_loader, max_epochs=config.max_epochs, epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------
    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle wandb differently?
            # See : https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------
    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)