Example no. 1
def test_best_epoch():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.5}, {"aa": 0.2}]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
            sub.next()
    best_epoch = reporter.get_best_epoch(key1, "aa", mode)
    assert best_epoch == 3
Example no. 2
def test_logging(tmp_path):
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    key2 = uuid.uuid4().hex
    stats_list = [
        {
            "aa": 0.3,
            "bb": 3.0
        },
        {
            "aa": 0.5,
            "bb": 3.0
        },
        {
            "aa": 0.2,
            "bb": 3.0
        },
    ]
    writer = SummaryWriter(tmp_path)
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
            sub.next()
        with reporter.observe(key2) as sub:
            sub.register(stats_list[e])
            sub.next()
            logging.info(sub.log_message())
            logging.info(sub.log_message(-1))
            logging.info(sub.log_message(0, 1))
            sub.tensorboard_add_scalar(writer, -1)
        with pytest.raises(RuntimeError):
            logging.info(sub.log_message())

    logging.info(reporter.log_message())

    with reporter.observe(key1) as sub:
        sub.register({"aa": 0.1, "bb": 0.4})
        sub.next()
        sub.register({"aa": 0.1})
        sub.next()
Example no. 3
def test_check_early_stopping():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.2}, {"aa": 0.4}, {"aa": 0.3}]
    patience = 1

    results = []
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
        truefalse = reporter.check_early_stopping(patience, key1, "aa", mode)
        results.append(truefalse)
    assert results == [False, False, False, True]
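For context, a minimal sketch (an assumption about the expected behaviour, not the Reporter internals) of why patience=1 and mode="min" yield [False, False, False, True] for this value sequence:

# Sketch only: the best value (0.2) occurs at epoch 2; epochs 3 and 4 do not
# improve on it, so the gap since the best epoch exceeds patience=1 at epoch 4.
values = [0.3, 0.2, 0.4, 0.3]
best_epoch = 0
expected = []
for epoch, v in enumerate(values, start=1):
    if best_epoch == 0 or v < values[best_epoch - 1]:
        best_epoch = epoch
    expected.append(epoch - best_epoch > 1)  # patience = 1
assert expected == [False, False, False, True]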
Example no. 4
def test_sort_values():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.5}, {"aa": 0.2}]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
    sort_values = reporter.sort_values(key1, "aa", mode)

    desired = sorted([stats_list[e]["aa"] for e in range(len(stats_list))])

    for e in range(len(stats_list)):
        assert sort_values[e] == desired[e]
Example no. 5
def test_register(weight1, weight2):
    reporter = Reporter()
    reporter.set_epoch(1)
    with reporter.observe(uuid.uuid4().hex) as sub:
        stats1 = {
            "float": 0.6,
            "int": 6,
            "np": np.random.random(),
            "torch": torch.rand(1),
        }
        sub.register(stats1, weight1)
        sub.next()

        stats2 = {
            "float": 0.3,
            "int": 100,
            "np": np.random.random(),
            "torch": torch.rand(1),
        }
        sub.register(stats2, weight2)
        sub.next()
        assert sub.get_epoch() == 1
    with pytest.raises(RuntimeError):
        sub.register({})

    desired = {}
    for k in stats1:
        if stats1[k] is None:
            continue

        if weight1 is None:
            desired[k] = (stats1[k] + stats2[k]) / 2
        else:
            weight1 = float(weight1)
            weight2 = float(weight2)
            desired[k] = float(weight1 * stats1[k] + weight2 * stats2[k])
            desired[k] /= weight1 + weight2

    for k1, k2 in reporter.get_all_keys():
        if k2 in ("time", "total_count", "gpu_max_cached_mem_GB",
                  "gpu_cached_mem_GB"):
            continue
        np.testing.assert_allclose(reporter.get_value(k1, k2), desired[k2])
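For reference, a small worked sketch of the weighted average this test asserts, using hypothetical weights (the test itself receives weight1 and weight2 from a parametrization that is not shown here):

# Hypothetical values for illustration only: with weight1=1.0 and weight2=3.0,
# the "float" key of stats1/stats2 above averages to (1*0.6 + 3*0.3) / 4.
weight1, weight2 = 1.0, 3.0
stats1_float, stats2_float = 0.6, 0.3
desired_float = (weight1 * stats1_float + weight2 * stats2_float) / (weight1 + weight2)
assert abs(desired_float - 0.375) < 1e-12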
Example no. 6
def test_logging():
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    key2 = uuid.uuid4().hex
    stats_list = [
        {"aa": 0.3, "bb": 3.0},
        {"aa": 0.5, "bb": 3.0},
        {"aa": 0.2, "bb": 3.0},
    ]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
        with reporter.observe(key2) as sub:
            sub.register(stats_list[e])
            logging.info(sub.log_message())
        with pytest.raises(RuntimeError):
            logging.info(sub.log_message())

    logging.info(reporter.log_message())
Example no. 7
def test_no_register():
    reporter = Reporter()
    with reporter.observe("train", 1):
        pass
Example no. 8
def test_zero_weight():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": 1}, weight=0)
Example no. 9
def test_register_nan():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": np.nan}, weight=1.0)
Example no. 10
def test_change_epoch():
    reporter = Reporter()
    with pytest.raises(RuntimeError):
        with reporter.observe("train", 1):
            reporter.set_epoch(2)
Example no. 11
def test_minus_epoch():
    with pytest.raises(ValueError):
        Reporter(-1)
Example no. 12
    def run(
        cls,
        model: AbsESPnetModel,
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        train_iter_factory: AbsIterFactory,
        valid_iter_factory: AbsIterFactory,
        plot_attention_iter_factory: Optional[AbsIterFactory],
        trainer_options,
        distributed_option: DistributedOption,
    ) -> None:
        """Perform training. This method performs the main process of training."""
        assert check_argument_types()
        # NOTE(kamo): Don't check the type more strictly as far as trainer_options is concerned
        assert is_dataclass(trainer_options), type(trainer_options)
        assert len(optimizers) == len(schedulers), (len(optimizers),
                                                    len(schedulers))

        if isinstance(trainer_options.keep_nbest_models, int):
            keep_nbest_models = [trainer_options.keep_nbest_models]
        else:
            if len(trainer_options.keep_nbest_models) == 0:
                logging.warning("No keep_nbest_models is given. Change to [1]")
                trainer_options.keep_nbest_models = [1]
            keep_nbest_models = trainer_options.keep_nbest_models

        output_dir = Path(trainer_options.output_dir)
        reporter = Reporter()
        if trainer_options.use_amp:
            if LooseVersion(torch.__version__) < LooseVersion("1.6.0"):
                raise RuntimeError(
                    "Require torch>=1.6.0 for  Automatic Mixed Precision")
            if trainer_options.sharded_ddp:
                if fairscale is None:
                    raise RuntimeError(
                        "Requiring fairscale. Do 'pip install fairscale'")
                scaler = fairscale.optim.grad_scaler.ShardedGradScaler()
            else:
                scaler = GradScaler()
        else:
            scaler = None

        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
            cls.resume(
                checkpoint=output_dir / "checkpoint.pth",
                model=model,
                optimizers=optimizers,
                schedulers=schedulers,
                reporter=reporter,
                scaler=scaler,
                ngpu=trainer_options.ngpu,
            )

        start_epoch = reporter.get_epoch() + 1
        if start_epoch == trainer_options.max_epoch + 1:
            logging.warning(
                f"The training has already reached at max_epoch: {start_epoch}"
            )

        if distributed_option.distributed:
            if trainer_options.sharded_ddp:
                dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
                    module=model,
                    sharded_optimizer=optimizers,
                )
            else:
                dp_model = torch.nn.parallel.DistributedDataParallel(
                    model,
                    device_ids=(
                        # Perform multi-Process with multi-GPUs
                        [torch.cuda.current_device()]
                        if distributed_option.ngpu == 1
                        # Perform single-Process with multi-GPUs
                        else None),
                    output_device=(torch.cuda.current_device()
                                   if distributed_option.ngpu == 1 else None),
                    find_unused_parameters=trainer_options.unused_parameters,
                )
        elif distributed_option.ngpu > 1:
            dp_model = torch.nn.parallel.DataParallel(
                model,
                device_ids=list(range(distributed_option.ngpu)),
            )
        else:
            # NOTE(kamo): DataParallel also should work with ngpu=1,
            # but for debuggability it's better to keep this block.
            dp_model = model

        if trainer_options.use_tensorboard and (
                not distributed_option.distributed
                or distributed_option.dist_rank == 0):
            from torch.utils.tensorboard import SummaryWriter

            train_summary_writer = SummaryWriter(
                str(output_dir / "tensorboard" / "train"))
            valid_summary_writer = SummaryWriter(
                str(output_dir / "tensorboard" / "valid"))
        else:
            train_summary_writer = None
            valid_summary_writer = None

        start_time = time.perf_counter()
        for iepoch in range(start_epoch, trainer_options.max_epoch + 1):
            if iepoch != start_epoch:
                logging.info(
                    "{}/{}epoch started. Estimated time to finish: {}".format(
                        iepoch,
                        trainer_options.max_epoch,
                        humanfriendly.format_timespan(
                            (time.perf_counter() - start_time) /
                            (iepoch - start_epoch) *
                            (trainer_options.max_epoch - iepoch + 1)),
                    ))
            else:
                logging.info(
                    f"{iepoch}/{trainer_options.max_epoch}epoch started")
            set_all_random_seed(trainer_options.seed + iepoch)

            reporter.set_epoch(iepoch)
            # 1. Train and validation for one-epoch
            with reporter.observe("train") as sub_reporter:
                all_steps_are_invalid = cls.train_one_epoch(
                    model=dp_model,
                    optimizers=optimizers,
                    schedulers=schedulers,
                    iterator=train_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    scaler=scaler,
                    summary_writer=train_summary_writer,
                    options=trainer_options,
                    distributed_option=distributed_option,
                )

            with reporter.observe("valid") as sub_reporter:
                cls.validate_one_epoch(
                    model=dp_model,
                    iterator=valid_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    options=trainer_options,
                    distributed_option=distributed_option,
                )
            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # att_plot doesn't support distributed
                if plot_attention_iter_factory is not None:
                    with reporter.observe("att_plot") as sub_reporter:
                        cls.plot_attention(
                            model=model,
                            output_dir=output_dir / "att_ws",
                            summary_writer=train_summary_writer,
                            iterator=plot_attention_iter_factory.build_iter(
                                iepoch),
                            reporter=sub_reporter,
                            options=trainer_options,
                        )

            # 2. LR Scheduler step
            for scheduler in schedulers:
                if isinstance(scheduler, AbsValEpochStepScheduler):
                    scheduler.step(
                        reporter.get_value(
                            *trainer_options.val_scheduler_criterion))
                elif isinstance(scheduler, AbsEpochStepScheduler):
                    scheduler.step()
            if trainer_options.sharded_ddp:
                for optimizer in optimizers:
                    if isinstance(optimizer, fairscale.optim.oss.OSS):
                        optimizer.consolidate_state_dict()

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # 3. Report the results
                logging.info(reporter.log_message())
                if trainer_options.use_matplotlib:
                    reporter.matplotlib_plot(output_dir / "images")
                if train_summary_writer is not None:
                    reporter.tensorboard_add_scalar(train_summary_writer,
                                                    key1="train")
                    reporter.tensorboard_add_scalar(valid_summary_writer,
                                                    key1="valid")
                if trainer_options.use_wandb:
                    reporter.wandb_log()

                # 4. Save/Update the checkpoint
                torch.save(
                    {
                        "model":
                        model.state_dict(),
                        "reporter":
                        reporter.state_dict(),
                        "optimizers": [o.state_dict() for o in optimizers],
                        "schedulers": [
                            s.state_dict() if s is not None else None
                            for s in schedulers
                        ],
                        "scaler":
                        scaler.state_dict() if scaler is not None else None,
                    },
                    output_dir / "checkpoint.pth",
                )

                # 5. Save and log the model and update the link to the best model
                torch.save(model.state_dict(),
                           output_dir / f"{iepoch}epoch.pth")

                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                p = output_dir / "latest.pth"
                if p.is_symlink() or p.exists():
                    p.unlink()
                p.symlink_to(f"{iepoch}epoch.pth")

                _improved = []
                for _phase, k, _mode in trainer_options.best_model_criterion:
                    # e.g. _phase, k, _mode = "train", "loss", "min"
                    if reporter.has(_phase, k):
                        best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            p = output_dir / f"{_phase}.{k}.best.pth"
                            if p.is_symlink() or p.exists():
                                p.unlink()
                            p.symlink_to(f"{iepoch}epoch.pth")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
                else:
                    logging.info("The best model has been updated: " +
                                 ", ".join(_improved))

                log_model = (trainer_options.wandb_model_log_interval > 0
                             and iepoch %
                             trainer_options.wandb_model_log_interval == 0)
                if log_model and trainer_options.use_wandb:
                    import wandb

                    logging.info("Logging Model on this epoch :::::")
                    artifact = wandb.Artifact(
                        name=f"model_{wandb.run.id}",
                        type="model",
                        metadata={"improved": _improved},
                    )
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                    aliases = [
                        f"epoch-{iepoch}",
                        "best" if best_epoch == iepoch else "",
                    ]
                    wandb.log_artifact(artifact, aliases=aliases)

                # 6. Remove the model files excluding n-best epoch and latest epoch
                _removed = []
                # Get the union set of the n-best among multiple criterion
                nbests = set().union(*[
                    set(
                        reporter.sort_epochs(ph, k, m)
                        [:max(keep_nbest_models)])
                    for ph, k, m in trainer_options.best_model_criterion
                    if reporter.has(ph, k)
                ])

                # Generate the n-best averaged model
                if (trainer_options.nbest_averaging_interval > 0
                        and iepoch % trainer_options.nbest_averaging_interval
                        == 0):
                    average_nbest_models(
                        reporter=reporter,
                        output_dir=output_dir,
                        best_model_criterion=trainer_options.
                        best_model_criterion,
                        nbest=keep_nbest_models,
                        suffix=f"till{iepoch}epoch",
                    )

                for e in range(1, iepoch):
                    p = output_dir / f"{e}epoch.pth"
                    if p.exists() and e not in nbests:
                        p.unlink()
                        _removed.append(str(p))
                if len(_removed) != 0:
                    logging.info("The model files were removed: " +
                                 ", ".join(_removed))

            # 7. If no updates have happened, stop the training
            if all_steps_are_invalid:
                logging.warning(
                    f"The gradients at all steps are invalid in this epoch. "
                    f"Something seems wrong. This training was stopped at {iepoch}epoch"
                )
                break

            # 8. Check early stopping
            if trainer_options.patience is not None:
                if reporter.check_early_stopping(
                        trainer_options.patience,
                        *trainer_options.early_stopping_criterion):
                    break

        else:
            logging.info(
                f"The training was finished at {trainer_options.max_epoch} epochs "
            )

        # Generate the n-best averaged model
        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            average_nbest_models(
                reporter=reporter,
                output_dir=output_dir,
                best_model_criterion=trainer_options.best_model_criterion,
                nbest=keep_nbest_models,
            )
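For illustration, a minimal sketch of how the checkpoint dictionary saved in step 4 above could be restored. It is written as a hypothetical helper and is not the actual cls.resume implementation, which is not reproduced in this example:

# Hypothetical helper (not cls.resume): restores the checkpoint layout saved
# above, i.e. the "model", "reporter", "optimizers", "schedulers", "scaler" keys.
def restore_checkpoint(path, model, optimizers, schedulers, reporter, scaler=None):
    states = torch.load(path, map_location="cpu")
    model.load_state_dict(states["model"])
    reporter.load_state_dict(states["reporter"])
    for optimizer, state in zip(optimizers, states["optimizers"]):
        optimizer.load_state_dict(state)
    for scheduler, state in zip(schedulers, states["schedulers"]):
        if scheduler is not None and state is not None:
            scheduler.load_state_dict(state)
    if scaler is not None and states["scaler"] is not None:
        scaler.load_state_dict(states["scaler"])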
Example no. 13
def average_nbest_models(
    output_dir: Path,
    reporter: Reporter,
    best_model_criterion: Sequence[Sequence[str]],
    nbest: int,
) -> None:
    """Generate averaged model from n-best models

    Args:
        output_dir: The directory containing the model file for each epoch
        reporter: Reporter instance
        best_model_criterion: Criteria used to decide the best model,
            e.g. [("valid", "loss", "min"), ("train", "acc", "max")]
        nbest: Number of best models to be averaged
    """
    assert check_argument_types()
    # 1. Get nbests: List[Tuple[str, str, List[Tuple[epoch, value]]]]
    nbest_epochs = [
        (ph, k, reporter.sort_epochs_and_values(ph, k, m)[:nbest])
        for ph, k, m in best_model_criterion
        if reporter.has(ph, k)
    ]

    _loaded = {}
    for ph, cr, epoch_and_values in nbest_epochs:
        # Note that len(epoch_and_values) doesn't always equal to nbest.
        op = output_dir / f"{ph}.{cr}.ave_{len(epoch_and_values)}best.pth"
        logging.info(
            f"Averaging {len(epoch_and_values)}best models: "
            f'criterion="{ph}.{cr}": {op}'
        )

        if len(epoch_and_values) == 0:
            continue
        elif len(epoch_and_values) == 1:
            # The averaged model is the same as the best model
            e, _ = epoch_and_values[0]
            op = output_dir / f"{e}epoch.pth"
            for sym_op in [
                output_dir / f"{ph}.{cr}.ave.pth",
                output_dir / f"{ph}.{cr}.ave_{len(epoch_and_values)}best.pth",
            ]:
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
        else:
            avg = None
            # 2.a Averaging model
            for e, _ in epoch_and_values:
                if e not in _loaded:
                    _loaded[e] = torch.load(
                        output_dir / f"{e}epoch.pth", map_location="cpu",
                    )
                states = _loaded[e]

                if avg is None:
                    avg = states
                else:
                    # Accumulated
                    for k in avg:
                        avg[k] += states[k]
            for k in avg:
                if str(avg[k].dtype).startswith("torch.int"):
                    # For int type, values are not averaged, only accumulated.
                    # e.g. BatchNorm.num_batches_tracked
                    # (If there are any cases that require averaging
                    #  or another reduction method, e.g. max/min, for integer type,
                    #  please report.)
                    pass
                else:
                    avg[k] /= len(epoch_and_values)

            # 2.b Save the ave model and create a symlink
            torch.save(avg, op)
            sym_op = output_dir / f"{ph}.{cr}.ave.pth"
            if sym_op.is_symlink() or sym_op.exists():
                sym_op.unlink()
            sym_op.symlink_to(op.name)
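A minimal usage sketch for this function; the directory, criterion, and nbest value are assumptions, and the directory must already contain the per-epoch "{e}epoch.pth" files referenced above:

# Hypothetical call; "exp/train_asr" is an assumed path.
average_nbest_models(
    output_dir=Path("exp/train_asr"),
    reporter=reporter,
    best_model_criterion=[("valid", "loss", "min")],
    nbest=5,
)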
Example no. 14
def test_start_middle_epoch():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        sub.register({"a": 3})
Example no. 15
def test_measure_time():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        with sub.measure_time("foo"):
            pass
Example no. 16
def test_mismatch_key2():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": 2})
    with reporter.observe("train", 2) as sub:
        sub.register({"b": 3})
Example no. 17
def reporter():
    _reporter = Reporter()
    _reporter.set_epoch(1)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.4})
        sub.next()

    _reporter.set_epoch(2)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.5})
        sub.next()

    _reporter.set_epoch(3)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.6})
        sub.next()

    return _reporter
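A hypothetical test consuming this fixture, assuming it is registered with @pytest.fixture; since "acc" increases every epoch, the best epoch under mode "max" is 3:

# Hypothetical usage of the fixture above (assumes @pytest.fixture decoration).
def test_best_epoch_from_fixture(reporter):
    assert reporter.get_best_epoch("valid", "acc", "max") == 3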
Example no. 18
def test_tensorboard_add_scalar(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = "train"
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
        sub.next()

    reporter.set_epoch(2)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})
        sub.next()

    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
        sub.next()

    writer = SummaryWriter(tmp_path)
    reporter.tensorboard_add_scalar(writer)
Example no. 19
def test_get_value_not_found():
    reporter = Reporter()
    with pytest.raises(KeyError):
        reporter.get_value("a", "b")
Example no. 20
    def run(
        cls,
        model: AbsESPnetModel,
        optimizers: Sequence[torch.optim.Optimizer],
        schedulers: Sequence[Optional[AbsScheduler]],
        train_iter_factory: AbsIterFactory,
        valid_iter_factory: AbsIterFactory,
        plot_attention_iter_factory: Optional[AbsIterFactory],
        reporter: Reporter,
        scaler: Optional[GradScaler],
        output_dir: Path,
        max_epoch: int,
        seed: int,
        patience: Optional[int],
        keep_nbest_models: int,
        early_stopping_criterion: Sequence[str],
        best_model_criterion: Sequence[Sequence[str]],
        val_scheduler_criterion: Sequence[str],
        trainer_options,
        distributed_option: DistributedOption,
    ) -> None:
        """Perform training. This method performs the main process of training."""
        assert check_argument_types()
        # NOTE(kamo): Don't check the type more strictly as far as trainer_options is concerned
        assert is_dataclass(trainer_options), type(trainer_options)

        start_epoch = reporter.get_epoch() + 1
        if start_epoch == max_epoch + 1:
            logging.warning(
                f"The training has already reached at max_epoch: {start_epoch}"
            )

        if distributed_option.distributed:
            dp_model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=(
                    # Perform multi-Process with multi-GPUs
                    [torch.cuda.current_device()]
                    if distributed_option.ngpu == 1
                    # Perform single-Process with multi-GPUs
                    else None),
                output_device=(torch.cuda.current_device()
                               if distributed_option.ngpu == 1 else None),
            )
        elif distributed_option.ngpu > 1:
            dp_model = torch.nn.parallel.DataParallel(
                model,
                device_ids=list(range(distributed_option.ngpu)),
            )
        else:
            # NOTE(kamo): DataParallel also should work with ngpu=1,
            # but for debuggability it's better to keep this block.
            dp_model = model

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            summary_writer = SummaryWriter(str(output_dir / "tensorboard"))
        else:
            summary_writer = None

        start_time = time.perf_counter()
        for iepoch in range(start_epoch, max_epoch + 1):
            if iepoch != start_epoch:
                logging.info(
                    "{}/{}epoch started. Estimated time to finish: {}".format(
                        iepoch,
                        max_epoch,
                        humanfriendly.format_timespan(
                            (time.perf_counter() - start_time) /
                            (iepoch - start_epoch) * (max_epoch - iepoch + 1)),
                    ))
            else:
                logging.info(f"{iepoch}/{max_epoch}epoch started")
            set_all_random_seed(seed + iepoch)

            reporter.set_epoch(iepoch)
            # 1. Train and validation for one-epoch
            with reporter.observe("train") as sub_reporter:
                all_steps_are_invalid = cls.train_one_epoch(
                    model=dp_model,
                    optimizers=optimizers,
                    schedulers=schedulers,
                    iterator=train_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    scaler=scaler,
                    summary_writer=summary_writer,
                    options=trainer_options,
                )

            with reporter.observe("valid") as sub_reporter:
                cls.validate_one_epoch(
                    model=dp_model,
                    iterator=valid_iter_factory.build_iter(iepoch),
                    reporter=sub_reporter,
                    options=trainer_options,
                )

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # att_plot doesn't support distributed
                if plot_attention_iter_factory is not None:
                    with reporter.observe("att_plot") as sub_reporter:
                        cls.plot_attention(
                            model=model,
                            output_dir=output_dir / "att_ws",
                            summary_writer=summary_writer,
                            iterator=plot_attention_iter_factory.build_iter(
                                iepoch),
                            reporter=sub_reporter,
                            options=trainer_options,
                        )

            # 2. LR Scheduler step
            for scheduler in schedulers:
                if isinstance(scheduler, AbsValEpochStepScheduler):
                    scheduler.step(
                        reporter.get_value(*val_scheduler_criterion))
                elif isinstance(scheduler, AbsEpochStepScheduler):
                    scheduler.step()

            if not distributed_option.distributed or distributed_option.dist_rank == 0:
                # 3. Report the results
                logging.info(reporter.log_message())
                reporter.matplotlib_plot(output_dir / "images")
                reporter.tensorboard_add_scalar(summary_writer)

                # 4. Save/Update the checkpoint
                torch.save(
                    {
                        "model":
                        model.state_dict(),
                        "reporter":
                        reporter.state_dict(),
                        "optimizers": [o.state_dict() for o in optimizers],
                        "schedulers": [
                            s.state_dict() if s is not None else None
                            for s in schedulers
                        ],
                        "scaler":
                        scaler.state_dict() if scaler is not None else None,
                    },
                    output_dir / "checkpoint.pth",
                )

                # 5. Save the model and update the link to the best model
                torch.save(model.state_dict(),
                           output_dir / f"{iepoch}epoch.pth")

                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                p = output_dir / "latest.pth"
                if p.is_symlink() or p.exists():
                    p.unlink()
                p.symlink_to(f"{iepoch}epoch.pth")

                _improved = []
                for _phase, k, _mode in best_model_criterion:
                    # e.g. _phase, k, _mode = "train", "loss", "min"
                    if reporter.has(_phase, k):
                        best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            p = output_dir / f"{_phase}.{k}.best.pth"
                            if p.is_symlink() or p.exists():
                                p.unlink()
                            p.symlink_to(f"{iepoch}epoch.pth")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
                else:
                    logging.info("The best model has been updated: " +
                                 ", ".join(_improved))

                # 6. Remove the model files excluding n-best epoch and latest epoch
                _removed = []
                # Get the union set of the n-best among multiple criterion
                nbests = set().union(*[
                    set(reporter.sort_epochs(ph, k, m)[:keep_nbest_models])
                    for ph, k, m in best_model_criterion
                    if reporter.has(ph, k)
                ])
                for e in range(1, iepoch):
                    p = output_dir / f"{e}epoch.pth"
                    if p.exists() and e not in nbests:
                        p.unlink()
                        _removed.append(str(p))
                if len(_removed) != 0:
                    logging.info("The model files were removed: " +
                                 ", ".join(_removed))

            # 7. If no updates have happened, stop the training
            if all_steps_are_invalid:
                logging.warning(
                    f"The gradients at all steps are invalid in this epoch. "
                    f"Something seems wrong. This training was stopped at {iepoch}epoch"
                )
                break

            # 8. Check early stopping
            if patience is not None:
                if reporter.check_early_stopping(patience,
                                                 *early_stopping_criterion):
                    break

        else:
            logging.info(f"The training was finished at {max_epoch} epochs ")
Example no. 21
def test_different_type():
    reporter = Reporter()
    with pytest.raises(ValueError):
        with reporter.observe("train", 1) as sub:
            sub.register({"a": 2}, weight=1)
            sub.register({"a": 3})
Example no. 22
def test_matplotlib_plot(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = uuid.uuid4().hex
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    reporter.set_epoch(2)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})

    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    reporter.matplotlib_plot(tmp_path)
    assert (tmp_path / "aa.png").exists()
Example no. 23
def test__plot_stats_input_str():
    reporter = Reporter()
    with pytest.raises(TypeError):
        reporter._plot_stats("aaa", "a")
Example no. 24
def test_tensorboard_add_scalar(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = uuid.uuid4().hex
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    reporter.set_epoch(2)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})

    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    if LooseVersion(torch.__version__) >= LooseVersion("1.1.0"):
        from torch.utils.tensorboard import SummaryWriter
    else:
        from tensorboardX import SummaryWriter
    writer = SummaryWriter(tmp_path)
    reporter.tensorboard_add_scalar(writer)
Example no. 25
def test_measure_iter_time():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        for _ in sub.measure_iter_time(range(3), "foo"):
            pass
Example no. 26
def test_state_dict():
    reporter = Reporter()
    reporter.set_epoch(1)
    with reporter.observe("train") as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
    with reporter.observe("eval") as sub:
        stats1 = {"bb": 0.6}
        sub.register(stats1)
    state = reporter.state_dict()

    reporter2 = Reporter()
    reporter2.load_state_dict(state)
    state2 = reporter2.state_dict()

    assert state == state2
Example no. 27
def average_nbest_models(
    output_dir: Path,
    reporter: Reporter,
    best_model_criterion: Sequence[Sequence[str]],
    nbest: Union[Collection[int], int],
    suffix: Optional[str] = None,
) -> None:
    """Generate averaged model from n-best models

    Args:
        output_dir: The directory containing the model file for each epoch
        reporter: Reporter instance
        best_model_criterion: Criteria used to decide the best model,
            e.g. [("valid", "loss", "min"), ("train", "acc", "max")]
        nbest: Number of best model files to be averaged
        suffix: A suffix added to the averaged model file name
    """
    assert check_argument_types()
    if isinstance(nbest, int):
        nbests = [nbest]
    else:
        nbests = list(nbest)
    if len(nbests) == 0:
        warnings.warn("At least 1 nbest values are required")
        nbests = [1]
    if suffix is not None:
        suffix = suffix + "."
    else:
        suffix = ""

    # 1. Get nbests: List[Tuple[str, str, List[Tuple[epoch, value]]]]
    nbest_epochs = [(ph, k, reporter.sort_epochs_and_values(ph, k,
                                                            m)[:max(nbests)])
                    for ph, k, m in best_model_criterion
                    if reporter.has(ph, k)]

    _loaded = {}
    for ph, cr, epoch_and_values in nbest_epochs:
        _nbests = [i for i in nbests if i <= len(epoch_and_values)]
        if len(_nbests) == 0:
            _nbests = [1]

        for n in _nbests:
            if n == 0:
                continue
            elif n == 1:
                # The averaged model is the same as the best model
                e, _ = epoch_and_values[0]
                op = output_dir / f"{e}epoch.pth"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
            else:
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
                logging.info(f"Averaging {n}best models: "
                             f'criterion="{ph}.{cr}": {op}')

                avg = None
                # 2.a. Averaging model
                for e, _ in epoch_and_values[:n]:
                    if e not in _loaded:
                        _loaded[e] = torch.load(
                            output_dir / f"{e}epoch.pth",
                            map_location="cpu",
                        )
                    states = _loaded[e]

                    if avg is None:
                        avg = states
                    else:
                        # Accumulated
                        for k in avg:
                            avg[k] = avg[k] + states[k]
                for k in avg:
                    if str(avg[k].dtype).startswith("torch.int"):
                        # For int type, values are not averaged, only accumulated.
                        # e.g. BatchNorm.num_batches_tracked
                        # (If there are any cases that require averaging
                        #  or another reduction method, e.g. max/min, for integer type,
                        #  please report.)
                        pass
                    else:
                        avg[k] = avg[k] / n

                # 2.b. Save the ave model and create a symlink
                torch.save(avg, op)

        # 3. *.*.ave.pth is a symlink to the max ave model
        op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
        sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
        if sym_op.is_symlink() or sym_op.exists():
            sym_op.unlink()
        sym_op.symlink_to(op.name)
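As with the earlier variant, a minimal call sketch, here with nbest given as a collection and a suffix; paths and criteria are assumptions, and with suffix="till20epoch" the output names would follow the "{ph}.{cr}.ave_{n}best.till20epoch.pth" pattern built above:

# Hypothetical call showing the Collection[int] form of nbest plus a suffix.
average_nbest_models(
    output_dir=Path("exp/train_asr"),
    reporter=reporter,
    best_model_criterion=[("valid", "acc", "max"), ("valid", "loss", "min")],
    nbest=[1, 5, 10],
    suffix="till20epoch",
)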
Example no. 28
def test_get_epoch():
    reporter = Reporter(2)
    assert reporter.get_epoch() == 2