def test_best_epoch():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.5}, {"aa": 0.2}]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
            sub.next()
    best_epoch = reporter.get_best_epoch(key1, "aa", mode)
    assert best_epoch == 3
def test_logging(tmp_path):
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    key2 = uuid.uuid4().hex
    stats_list = [
        {"aa": 0.3, "bb": 3.0},
        {"aa": 0.5, "bb": 3.0},
        {"aa": 0.2, "bb": 3.0},
    ]
    writer = SummaryWriter(tmp_path)
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
            sub.next()
        with reporter.observe(key2) as sub:
            sub.register(stats_list[e])
            sub.next()
            logging.info(sub.log_message())
            logging.info(sub.log_message(-1))
            logging.info(sub.log_message(0, 1))
            sub.tensorboard_add_scalar(writer, -1)

    with pytest.raises(RuntimeError):
        logging.info(sub.log_message())
    logging.info(reporter.log_message())

    with reporter.observe(key1) as sub:
        sub.register({"aa": 0.1, "bb": 0.4})
        sub.next()
        sub.register({"aa": 0.1})
        sub.next()
def test_check_early_stopping():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.2}, {"aa": 0.4}, {"aa": 0.3}]
    patience = 1
    results = []
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
        truefalse = reporter.check_early_stopping(patience, key1, "aa", mode)
        results.append(truefalse)
    assert results == [False, False, False, True]
def test_sort_values():
    mode = "min"
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    stats_list = [{"aa": 0.3}, {"aa": 0.5}, {"aa": 0.2}]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
    sort_values = reporter.sort_values(key1, "aa", mode)
    desired = sorted([stats_list[e]["aa"] for e in range(len(stats_list))])
    for e in range(len(stats_list)):
        assert sort_values[e] == desired[e]
def test_register(weight1, weight2):
    reporter = Reporter()
    reporter.set_epoch(1)
    with reporter.observe(uuid.uuid4().hex) as sub:
        stats1 = {
            "float": 0.6,
            "int": 6,
            "np": np.random.random(),
            "torch": torch.rand(1),
        }
        sub.register(stats1, weight1)
        sub.next()

        stats2 = {
            "float": 0.3,
            "int": 100,
            "np": np.random.random(),
            "torch": torch.rand(1),
        }
        sub.register(stats2, weight2)
        sub.next()

    assert sub.get_epoch() == 1
    with pytest.raises(RuntimeError):
        sub.register({})

    desired = {}
    for k in stats1:
        if stats1[k] is None:
            continue
        if weight1 is None:
            desired[k] = (stats1[k] + stats2[k]) / 2
        else:
            weight1 = float(weight1)
            weight2 = float(weight2)
            desired[k] = float(weight1 * stats1[k] + weight2 * stats2[k])
            desired[k] /= weight1 + weight2

    for k1, k2 in reporter.get_all_keys():
        if k2 in (
            "time",
            "total_count",
            "gpu_max_cached_mem_GB",
            "gpu_cached_mem_GB",
        ):
            continue
        np.testing.assert_allclose(reporter.get_value(k1, k2), desired[k2])
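# A minimal sketch (not part of the test suite) of the weighted averaging that
# test_register() above checks: register(stats, weight) accumulates a weighted
# mean per key, and weight=None is treated as uniform weighting. The helper name
# `weighted_mean` is hypothetical and used only for illustration.
def weighted_mean(values, weights=None):
    if weights is None:
        return sum(values) / len(values)
    weights = [float(w) for w in weights]
    return sum(w * v for w, v in zip(weights, values)) / sum(weights)


# e.g. two registered values 0.6 and 0.3 with weights 1 and 3:
# weighted_mean([0.6, 0.3], [1, 3]) == (1 * 0.6 + 3 * 0.3) / 4 == 0.375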
def test_logging():
    reporter = Reporter()
    key1 = uuid.uuid4().hex
    key2 = uuid.uuid4().hex
    stats_list = [
        {"aa": 0.3, "bb": 3.0},
        {"aa": 0.5, "bb": 3.0},
        {"aa": 0.2, "bb": 3.0},
    ]
    for e in range(len(stats_list)):
        reporter.set_epoch(e + 1)
        with reporter.observe(key1) as sub:
            sub.register(stats_list[e])
        with reporter.observe(key2) as sub:
            sub.register(stats_list[e])
            logging.info(sub.log_message())

    with pytest.raises(RuntimeError):
        logging.info(sub.log_message())
    logging.info(reporter.log_message())
def test_no_register():
    reporter = Reporter()
    with reporter.observe("train", 1):
        pass
def test_zero_weight():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": 1}, weight=0)
def test_register_nan():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": np.nan}, weight=1.0)
def test_change_epoch():
    reporter = Reporter()
    with pytest.raises(RuntimeError):
        with reporter.observe("train", 1):
            reporter.set_epoch(2)
def test_minus_epoch():
    with pytest.raises(ValueError):
        Reporter(-1)
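# A minimal usage sketch of the Reporter API that the tests above exercise,
# assuming the same Reporter class: one observe() block per phase and epoch,
# register()/next() per step, then log_message() and get_best_epoch() once the
# epoch is closed. The phase and metric names are arbitrary examples, and the
# function name `_reporter_usage_sketch` is hypothetical.
def _reporter_usage_sketch():
    reporter = Reporter()
    for epoch in range(1, 4):
        reporter.set_epoch(epoch)
        with reporter.observe("valid") as sub:
            # One register() call per step; next() advances the step counter.
            sub.register({"loss": 1.0 / epoch})
            sub.next()
        logging.info(reporter.log_message())
    # Lower loss is better, so the last epoch should be reported as the best.
    assert reporter.get_best_epoch("valid", "loss", "min") == 3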
def run(
    cls,
    model: AbsESPnetModel,
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    train_iter_factory: AbsIterFactory,
    valid_iter_factory: AbsIterFactory,
    plot_attention_iter_factory: Optional[AbsIterFactory],
    trainer_options,
    distributed_option: DistributedOption,
) -> None:
    """Perform training. This method performs the main process of training."""
    assert check_argument_types()
    # NOTE(kamo): Don't check the type of trainer_options more strictly
    assert is_dataclass(trainer_options), type(trainer_options)
    assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers))

    if isinstance(trainer_options.keep_nbest_models, int):
        keep_nbest_models = [trainer_options.keep_nbest_models]
    else:
        if len(trainer_options.keep_nbest_models) == 0:
            logging.warning("No keep_nbest_models is given. Change to [1]")
            trainer_options.keep_nbest_models = [1]
        keep_nbest_models = trainer_options.keep_nbest_models

    output_dir = Path(trainer_options.output_dir)
    reporter = Reporter()
    if trainer_options.use_amp:
        if LooseVersion(torch.__version__) < LooseVersion("1.6.0"):
            raise RuntimeError("Require torch>=1.6.0 for Automatic Mixed Precision")
        if trainer_options.sharded_ddp:
            if fairscale is None:
                raise RuntimeError("Requiring fairscale. Do 'pip install fairscale'")
            scaler = fairscale.optim.grad_scaler.ShardedGradScaler()
        else:
            scaler = GradScaler()
    else:
        scaler = None

    if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
        cls.resume(
            checkpoint=output_dir / "checkpoint.pth",
            model=model,
            optimizers=optimizers,
            schedulers=schedulers,
            reporter=reporter,
            scaler=scaler,
            ngpu=trainer_options.ngpu,
        )

    start_epoch = reporter.get_epoch() + 1
    if start_epoch == trainer_options.max_epoch + 1:
        logging.warning(
            f"The training has already reached max_epoch: {start_epoch}"
        )

    if distributed_option.distributed:
        if trainer_options.sharded_ddp:
            dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
                module=model,
                sharded_optimizer=optimizers,
            )
        else:
            dp_model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=(
                    # Perform multi-Process with multi-GPUs
                    [torch.cuda.current_device()]
                    if distributed_option.ngpu == 1
                    # Perform single-Process with multi-GPUs
                    else None
                ),
                output_device=(
                    torch.cuda.current_device()
                    if distributed_option.ngpu == 1
                    else None
                ),
                find_unused_parameters=trainer_options.unused_parameters,
            )
    elif distributed_option.ngpu > 1:
        dp_model = torch.nn.parallel.DataParallel(
            model,
            device_ids=list(range(distributed_option.ngpu)),
        )
    else:
        # NOTE(kamo): DataParallel also should work with ngpu=1,
        # but for debuggability it's better to keep this block.
        dp_model = model

    if trainer_options.use_tensorboard and (
        not distributed_option.distributed or distributed_option.dist_rank == 0
    ):
        from torch.utils.tensorboard import SummaryWriter

        train_summary_writer = SummaryWriter(
            str(output_dir / "tensorboard" / "train")
        )
        valid_summary_writer = SummaryWriter(
            str(output_dir / "tensorboard" / "valid")
        )
    else:
        train_summary_writer = None

    start_time = time.perf_counter()
    for iepoch in range(start_epoch, trainer_options.max_epoch + 1):
        if iepoch != start_epoch:
            logging.info(
                "{}/{}epoch started. Estimated time to finish: {}".format(
                    iepoch,
                    trainer_options.max_epoch,
                    humanfriendly.format_timespan(
                        (time.perf_counter() - start_time)
                        / (iepoch - start_epoch)
                        * (trainer_options.max_epoch - iepoch + 1)
                    ),
                )
            )
        else:
            logging.info(f"{iepoch}/{trainer_options.max_epoch}epoch started")
        set_all_random_seed(trainer_options.seed + iepoch)

        reporter.set_epoch(iepoch)
        # 1. Train and validation for one-epoch
        with reporter.observe("train") as sub_reporter:
            all_steps_are_invalid = cls.train_one_epoch(
                model=dp_model,
                optimizers=optimizers,
                schedulers=schedulers,
                iterator=train_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                scaler=scaler,
                summary_writer=train_summary_writer,
                options=trainer_options,
                distributed_option=distributed_option,
            )

        with reporter.observe("valid") as sub_reporter:
            cls.validate_one_epoch(
                model=dp_model,
                iterator=valid_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                options=trainer_options,
                distributed_option=distributed_option,
            )

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # att_plot doesn't support distributed
            if plot_attention_iter_factory is not None:
                with reporter.observe("att_plot") as sub_reporter:
                    cls.plot_attention(
                        model=model,
                        output_dir=output_dir / "att_ws",
                        summary_writer=train_summary_writer,
                        iterator=plot_attention_iter_factory.build_iter(iepoch),
                        reporter=sub_reporter,
                        options=trainer_options,
                    )

        # 2. LR Scheduler step
        for scheduler in schedulers:
            if isinstance(scheduler, AbsValEpochStepScheduler):
                scheduler.step(
                    reporter.get_value(*trainer_options.val_scheduler_criterion)
                )
            elif isinstance(scheduler, AbsEpochStepScheduler):
                scheduler.step()

        if trainer_options.sharded_ddp:
            for optimizer in optimizers:
                if isinstance(optimizer, fairscale.optim.oss.OSS):
                    optimizer.consolidate_state_dict()

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # 3. Report the results
            logging.info(reporter.log_message())
            if trainer_options.use_matplotlib:
                reporter.matplotlib_plot(output_dir / "images")
            if train_summary_writer is not None:
                reporter.tensorboard_add_scalar(train_summary_writer, key1="train")
                reporter.tensorboard_add_scalar(valid_summary_writer, key1="valid")
            if trainer_options.use_wandb:
                reporter.wandb_log()

            # 4. Save/Update the checkpoint
            torch.save(
                {
                    "model": model.state_dict(),
                    "reporter": reporter.state_dict(),
                    "optimizers": [o.state_dict() for o in optimizers],
                    "schedulers": [
                        s.state_dict() if s is not None else None
                        for s in schedulers
                    ],
                    "scaler": scaler.state_dict() if scaler is not None else None,
                },
                output_dir / "checkpoint.pth",
            )

            # 5. Save and log the model and update the link to the best model
            torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

            # Creates a sym link latest.pth -> {iepoch}epoch.pth
            p = output_dir / "latest.pth"
            if p.is_symlink() or p.exists():
                p.unlink()
            p.symlink_to(f"{iepoch}epoch.pth")

            _improved = []
            for _phase, k, _mode in trainer_options.best_model_criterion:
                # e.g. _phase, k, _mode = "train", "loss", "min"
                if reporter.has(_phase, k):
                    best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                    # Creates sym links if it's the best result
                    if best_epoch == iepoch:
                        p = output_dir / f"{_phase}.{k}.best.pth"
                        if p.is_symlink() or p.exists():
                            p.unlink()
                        p.symlink_to(f"{iepoch}epoch.pth")
                        _improved.append(f"{_phase}.{k}")
            if len(_improved) == 0:
                logging.info("There are no improvements in this epoch")
            else:
                logging.info(
                    "The best model has been updated: " + ", ".join(_improved)
                )

            log_model = (
                trainer_options.wandb_model_log_interval > 0
                and iepoch % trainer_options.wandb_model_log_interval == 0
            )
            if log_model and trainer_options.use_wandb:
                import wandb

                logging.info("Logging Model on this epoch :::::")
                artifact = wandb.Artifact(
                    name=f"model_{wandb.run.id}",
                    type="model",
                    metadata={"improved": _improved},
                )
                artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                aliases = [
                    f"epoch-{iepoch}",
                    "best" if best_epoch == iepoch else "",
                ]
                wandb.log_artifact(artifact, aliases=aliases)

            # 6. Remove the model files excluding n-best epoch and latest epoch
            _removed = []
            # Get the union set of the n-best among multiple criterion
            nbests = set().union(
                *[
                    set(reporter.sort_epochs(ph, k, m)[: max(keep_nbest_models)])
                    for ph, k, m in trainer_options.best_model_criterion
                    if reporter.has(ph, k)
                ]
            )

            # Generate the n-best averaged model
            if (
                trainer_options.nbest_averaging_interval > 0
                and iepoch % trainer_options.nbest_averaging_interval == 0
            ):
                average_nbest_models(
                    reporter=reporter,
                    output_dir=output_dir,
                    best_model_criterion=trainer_options.best_model_criterion,
                    nbest=keep_nbest_models,
                    suffix=f"till{iepoch}epoch",
                )

            for e in range(1, iepoch):
                p = output_dir / f"{e}epoch.pth"
                if p.exists() and e not in nbests:
                    p.unlink()
                    _removed.append(str(p))
            if len(_removed) != 0:
                logging.info("The model files were removed: " + ", ".join(_removed))

        # 7. If no update has happened, stop the training
        if all_steps_are_invalid:
            logging.warning(
                "The gradients at all steps are invalid in this epoch. "
                f"Something seems wrong. This training was stopped at {iepoch}epoch"
            )
            break

        # 8. Check early stopping
        if trainer_options.patience is not None:
            if reporter.check_early_stopping(
                trainer_options.patience, *trainer_options.early_stopping_criterion
            ):
                break

    else:
        logging.info(
            f"The training was finished at {trainer_options.max_epoch} epochs"
        )

    # Generate the n-best averaged model
    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        average_nbest_models(
            reporter=reporter,
            output_dir=output_dir,
            best_model_criterion=trainer_options.best_model_criterion,
            nbest=keep_nbest_models,
        )
def average_nbest_models(
    output_dir: Path,
    reporter: Reporter,
    best_model_criterion: Sequence[Sequence[str]],
    nbest: int,
) -> None:
    """Generate averaged model from n-best models

    Args:
        output_dir: The directory containing the model file for each epoch
        reporter: Reporter instance
        best_model_criterion: Give criterions to decide the best model.
            e.g. [("valid", "loss", "min"), ("train", "acc", "max")]
        nbest: Number of best model files to be averaged
    """
    assert check_argument_types()
    # 1. Get nbests: List[Tuple[str, str, List[Tuple[epoch, value]]]]
    nbest_epochs = [
        (ph, k, reporter.sort_epochs_and_values(ph, k, m)[:nbest])
        for ph, k, m in best_model_criterion
        if reporter.has(ph, k)
    ]

    _loaded = {}
    for ph, cr, epoch_and_values in nbest_epochs:
        # Note that len(epoch_and_values) doesn't always equal nbest.
        op = output_dir / f"{ph}.{cr}.ave_{len(epoch_and_values)}best.pth"
        logging.info(
            f"Averaging {len(epoch_and_values)}best models: "
            f'criterion="{ph}.{cr}": {op}'
        )

        if len(epoch_and_values) == 0:
            continue
        elif len(epoch_and_values) == 1:
            # The averaged model is same as the best model
            e, _ = epoch_and_values[0]
            op = output_dir / f"{e}epoch.pth"
            for sym_op in [
                output_dir / f"{ph}.{cr}.ave.pth",
                output_dir / f"{ph}.{cr}.ave_{len(epoch_and_values)}best.pth",
            ]:
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
        else:
            avg = None
            # 2.a Averaging model
            for e, _ in epoch_and_values:
                if e not in _loaded:
                    _loaded[e] = torch.load(
                        output_dir / f"{e}epoch.pth",
                        map_location="cpu",
                    )
                states = _loaded[e]
                if avg is None:
                    avg = states
                else:
                    # Accumulated
                    for k in avg:
                        avg[k] += states[k]
            for k in avg:
                if str(avg[k].dtype).startswith("torch.int"):
                    # For int type, not averaged, but only accumulated.
                    # e.g. BatchNorm.num_batches_tracked
                    # (If there are any cases that require averaging
                    # or another reducing method, e.g. max/min, for integer type,
                    # please report.)
                    pass
                else:
                    avg[k] /= len(epoch_and_values)

            # 2.b Save the ave model and create a symlink
            torch.save(avg, op)
            sym_op = output_dir / f"{ph}.{cr}.ave.pth"
            if sym_op.is_symlink() or sym_op.exists():
                sym_op.unlink()
            sym_op.symlink_to(op.name)
def test_start_middle_epoch():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        sub.register({"a": 3})
def test_measure_time():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        with sub.measure_time("foo"):
            pass
def test_mismatch_key2():
    reporter = Reporter()
    with reporter.observe("train", 1) as sub:
        sub.register({"a": 2})
    with reporter.observe("train", 2) as sub:
        sub.register({"b": 3})
def reporter():
    _reporter = Reporter()
    _reporter.set_epoch(1)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.4})
        sub.next()
    _reporter.set_epoch(2)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.5})
        sub.next()
    _reporter.set_epoch(3)
    with _reporter.observe("valid") as sub:
        sub.register({"acc": 0.6})
        sub.next()
    return _reporter
def test_tensorboard_add_scalar(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = "train"
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
        sub.next()
    reporter.set_epoch(1)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})
        sub.next()
    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
        sub.next()

    writer = SummaryWriter(tmp_path)
    reporter.tensorboard_add_scalar(writer)
def test_get_value_not_found():
    reporter = Reporter()
    with pytest.raises(KeyError):
        reporter.get_value("a", "b")
def run(
    cls,
    model: AbsESPnetModel,
    optimizers: Sequence[torch.optim.Optimizer],
    schedulers: Sequence[Optional[AbsScheduler]],
    train_iter_factory: AbsIterFactory,
    valid_iter_factory: AbsIterFactory,
    plot_attention_iter_factory: Optional[AbsIterFactory],
    reporter: Reporter,
    scaler: Optional[GradScaler],
    output_dir: Path,
    max_epoch: int,
    seed: int,
    patience: Optional[int],
    keep_nbest_models: int,
    early_stopping_criterion: Sequence[str],
    best_model_criterion: Sequence[Sequence[str]],
    val_scheduler_criterion: Sequence[str],
    trainer_options,
    distributed_option: DistributedOption,
) -> None:
    """Perform training. This method performs the main process of training."""
    assert check_argument_types()
    # NOTE(kamo): Don't check the type of trainer_options more strictly
    assert is_dataclass(trainer_options), type(trainer_options)

    start_epoch = reporter.get_epoch() + 1
    if start_epoch == max_epoch + 1:
        logging.warning(
            f"The training has already reached max_epoch: {start_epoch}"
        )

    if distributed_option.distributed:
        dp_model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=(
                # Perform multi-Process with multi-GPUs
                [torch.cuda.current_device()]
                if distributed_option.ngpu == 1
                # Perform single-Process with multi-GPUs
                else None
            ),
            output_device=(
                torch.cuda.current_device()
                if distributed_option.ngpu == 1
                else None
            ),
        )
    elif distributed_option.ngpu > 1:
        dp_model = torch.nn.parallel.DataParallel(
            model,
            device_ids=list(range(distributed_option.ngpu)),
        )
    else:
        # NOTE(kamo): DataParallel also should work with ngpu=1,
        # but for debuggability it's better to keep this block.
        dp_model = model

    if not distributed_option.distributed or distributed_option.dist_rank == 0:
        summary_writer = SummaryWriter(str(output_dir / "tensorboard"))
    else:
        summary_writer = None

    start_time = time.perf_counter()
    for iepoch in range(start_epoch, max_epoch + 1):
        if iepoch != start_epoch:
            logging.info(
                "{}/{}epoch started. Estimated time to finish: {}".format(
                    iepoch,
                    max_epoch,
                    humanfriendly.format_timespan(
                        (time.perf_counter() - start_time)
                        / (iepoch - start_epoch)
                        * (max_epoch - iepoch + 1)
                    ),
                )
            )
        else:
            logging.info(f"{iepoch}/{max_epoch}epoch started")
        set_all_random_seed(seed + iepoch)

        reporter.set_epoch(iepoch)
        # 1. Train and validation for one-epoch
        with reporter.observe("train") as sub_reporter:
            all_steps_are_invalid = cls.train_one_epoch(
                model=dp_model,
                optimizers=optimizers,
                schedulers=schedulers,
                iterator=train_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                scaler=scaler,
                summary_writer=summary_writer,
                options=trainer_options,
            )

        with reporter.observe("valid") as sub_reporter:
            cls.validate_one_epoch(
                model=dp_model,
                iterator=valid_iter_factory.build_iter(iepoch),
                reporter=sub_reporter,
                options=trainer_options,
            )

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # att_plot doesn't support distributed
            if plot_attention_iter_factory is not None:
                with reporter.observe("att_plot") as sub_reporter:
                    cls.plot_attention(
                        model=model,
                        output_dir=output_dir / "att_ws",
                        summary_writer=summary_writer,
                        iterator=plot_attention_iter_factory.build_iter(iepoch),
                        reporter=sub_reporter,
                        options=trainer_options,
                    )

        # 2. LR Scheduler step
        for scheduler in schedulers:
            if isinstance(scheduler, AbsValEpochStepScheduler):
                scheduler.step(reporter.get_value(*val_scheduler_criterion))
            elif isinstance(scheduler, AbsEpochStepScheduler):
                scheduler.step()

        if not distributed_option.distributed or distributed_option.dist_rank == 0:
            # 3. Report the results
            logging.info(reporter.log_message())
            reporter.matplotlib_plot(output_dir / "images")
            reporter.tensorboard_add_scalar(summary_writer)

            # 4. Save/Update the checkpoint
            torch.save(
                {
                    "model": model.state_dict(),
                    "reporter": reporter.state_dict(),
                    "optimizers": [o.state_dict() for o in optimizers],
                    "schedulers": [
                        s.state_dict() if s is not None else None
                        for s in schedulers
                    ],
                    "scaler": scaler.state_dict() if scaler is not None else None,
                },
                output_dir / "checkpoint.pth",
            )

            # 5. Save the model and update the link to the best model
            torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")

            # Creates a sym link latest.pth -> {iepoch}epoch.pth
            p = output_dir / "latest.pth"
            if p.is_symlink() or p.exists():
                p.unlink()
            p.symlink_to(f"{iepoch}epoch.pth")

            _improved = []
            for _phase, k, _mode in best_model_criterion:
                # e.g. _phase, k, _mode = "train", "loss", "min"
                if reporter.has(_phase, k):
                    best_epoch = reporter.get_best_epoch(_phase, k, _mode)
                    # Creates sym links if it's the best result
                    if best_epoch == iepoch:
                        p = output_dir / f"{_phase}.{k}.best.pth"
                        if p.is_symlink() or p.exists():
                            p.unlink()
                        p.symlink_to(f"{iepoch}epoch.pth")
                        _improved.append(f"{_phase}.{k}")
            if len(_improved) == 0:
                logging.info("There are no improvements in this epoch")
            else:
                logging.info(
                    "The best model has been updated: " + ", ".join(_improved)
                )

            # 6. Remove the model files excluding n-best epoch and latest epoch
            _removed = []
            # Get the union set of the n-best among multiple criterion
            nbests = set().union(
                *[
                    set(reporter.sort_epochs(ph, k, m)[:keep_nbest_models])
                    for ph, k, m in best_model_criterion
                    if reporter.has(ph, k)
                ]
            )
            for e in range(1, iepoch):
                p = output_dir / f"{e}epoch.pth"
                if p.exists() and e not in nbests:
                    p.unlink()
                    _removed.append(str(p))
            if len(_removed) != 0:
                logging.info("The model files were removed: " + ", ".join(_removed))

        # 7. If no update has happened, stop the training
        if all_steps_are_invalid:
            logging.warning(
                "The gradients at all steps are invalid in this epoch. "
                f"Something seems wrong. This training was stopped at {iepoch}epoch"
            )
            break

        # 8. Check early stopping
        if patience is not None:
            if reporter.check_early_stopping(patience, *early_stopping_criterion):
                break

    else:
        logging.info(f"The training was finished at {max_epoch} epochs")
def test_different_type():
    reporter = Reporter()
    with pytest.raises(ValueError):
        with reporter.observe("train", 1) as sub:
            sub.register({"a": 2}, weight=1)
            sub.register({"a": 3})
def test_matplotlib_plot(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = uuid.uuid4().hex
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
    reporter.set_epoch(1)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})
    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    reporter.matplotlib_plot(tmp_path)
    assert (tmp_path / "aa.png").exists()
def test__plot_stats_input_str():
    reporter = Reporter()
    with pytest.raises(TypeError):
        reporter._plot_stats("aaa", "a")
def test_tensorboard_add_scalar(tmp_path: Path):
    reporter = Reporter()
    reporter.set_epoch(1)
    key1 = uuid.uuid4().hex
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
    reporter.set_epoch(1)
    with reporter.observe(key1) as sub:
        # Skip epoch=2
        sub.register({})
    reporter.set_epoch(3)
    with reporter.observe(key1) as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)

    if LooseVersion(torch.__version__) >= LooseVersion("1.1.0"):
        from torch.utils.tensorboard import SummaryWriter
    else:
        from tensorboardX import SummaryWriter
    writer = SummaryWriter(tmp_path)
    reporter.tensorboard_add_scalar(writer)
def test_measure_iter_time():
    reporter = Reporter()
    with reporter.observe("train", 2) as sub:
        for _ in sub.measure_iter_time(range(3), "foo"):
            pass
def test_state_dict():
    reporter = Reporter()
    reporter.set_epoch(1)
    with reporter.observe("train") as sub:
        stats1 = {"aa": 0.6}
        sub.register(stats1)
    with reporter.observe("eval") as sub:
        stats1 = {"bb": 0.6}
        sub.register(stats1)
    state = reporter.state_dict()

    reporter2 = Reporter()
    reporter2.load_state_dict(state)
    state2 = reporter2.state_dict()

    assert state == state2
def average_nbest_models(
    output_dir: Path,
    reporter: Reporter,
    best_model_criterion: Sequence[Sequence[str]],
    nbest: Union[Collection[int], int],
    suffix: Optional[str] = None,
) -> None:
    """Generate averaged model from n-best models

    Args:
        output_dir: The directory containing the model file for each epoch
        reporter: Reporter instance
        best_model_criterion: Give criterions to decide the best model.
            e.g. [("valid", "loss", "min"), ("train", "acc", "max")]
        nbest: Number of best model files to be averaged
        suffix: A suffix added to the averaged model file name
    """
    assert check_argument_types()
    if isinstance(nbest, int):
        nbests = [nbest]
    else:
        nbests = list(nbest)
    if len(nbests) == 0:
        warnings.warn("At least 1 nbest value is required")
        nbests = [1]
    if suffix is not None:
        suffix = suffix + "."
    else:
        suffix = ""

    # 1. Get nbests: List[Tuple[str, str, List[Tuple[epoch, value]]]]
    nbest_epochs = [
        (ph, k, reporter.sort_epochs_and_values(ph, k, m)[: max(nbests)])
        for ph, k, m in best_model_criterion
        if reporter.has(ph, k)
    ]

    _loaded = {}
    for ph, cr, epoch_and_values in nbest_epochs:
        _nbests = [i for i in nbests if i <= len(epoch_and_values)]
        if len(_nbests) == 0:
            _nbests = [1]

        for n in _nbests:
            if n == 0:
                continue
            elif n == 1:
                # The averaged model is same as the best model
                e, _ = epoch_and_values[0]
                op = output_dir / f"{e}epoch.pth"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
            else:
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
                logging.info(
                    f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                )

                avg = None
                # 2.a. Averaging model
                for e, _ in epoch_and_values[:n]:
                    if e not in _loaded:
                        _loaded[e] = torch.load(
                            output_dir / f"{e}epoch.pth",
                            map_location="cpu",
                        )
                    states = _loaded[e]
                    if avg is None:
                        avg = states
                    else:
                        # Accumulated
                        for k in avg:
                            avg[k] = avg[k] + states[k]
                for k in avg:
                    if str(avg[k].dtype).startswith("torch.int"):
                        # For int type, not averaged, but only accumulated.
                        # e.g. BatchNorm.num_batches_tracked
                        # (If there are any cases that require averaging
                        # or another reducing method, e.g. max/min, for integer
                        # type, please report.)
                        pass
                    else:
                        avg[k] = avg[k] / n

                # 2.b. Save the ave model and create a symlink
                torch.save(avg, op)

        # 3. *.*.ave.pth is a symlink to the max ave model
        op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
        sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
        if sym_op.is_symlink() or sym_op.exists():
            sym_op.unlink()
        sym_op.symlink_to(op.name)
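# A minimal usage sketch of average_nbest_models() above. It assumes a Reporter
# whose state has been restored (e.g. from checkpoint.pth) and that the
# per-epoch "{e}epoch.pth" files still exist in `exp_dir`; the wrapper name
# `_average_nbest_usage_sketch` and the criterion list are illustrative only.
# The call below would produce files such as "valid.acc.ave_5best.pth" and the
# "valid.acc.ave.pth" symlink pointing at the largest requested average.
def _average_nbest_usage_sketch(exp_dir: Path, reporter: Reporter) -> None:
    average_nbest_models(
        output_dir=exp_dir,
        reporter=reporter,
        best_model_criterion=[("valid", "acc", "max"), ("valid", "loss", "min")],
        nbest=[1, 5, 10],
    )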
def test_get_epoch():
    reporter = Reporter(2)
    assert reporter.get_epoch() == 2