Example #1
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        for meter in task.ema_meters:
            meter.sync_state()

        if is_primary():
            LogLossMetricsCheckpointHook.print_and_save_meters(
                task,
                task.train_phase_idx,
                task.ema_meters,
                metric_key_name_suffix="ema",
            )
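The hook above follows a common pattern: synchronize meter state across workers, then log and save from the primary rank only. A minimal sketch of what `sync_state` might do for a hypothetical EMA meter whose state is a pair of counter tensors (the meter class and its fields are ours, not VISSL's):

import torch
import torch.distributed as dist

class EmaAccuracyMeter:
    """Hypothetical meter holding its state in two counter tensors."""

    def __init__(self):
        self.correct = torch.zeros(1)
        self.total = torch.zeros(1)

    def sync_state(self):
        # Sum the per-worker counters so every rank ends up with the
        # global state; the primary rank can then log a value that is
        # consistent across the whole job.
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(self.correct, op=dist.ReduceOp.SUM)
            dist.all_reduce(self.total, op=dist.ReduceOp.SUM)

    @property
    def value(self):
        return (self.correct / self.total.clamp(min=1)).item()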
Example #2
    def on_forward(self, task: "tasks.ClassyTask") -> None:
        """
        Called each time a model forward is done and make sure that
        the model forward output is not NaN. If we encounter NaN as the model
        output, we checkpoint the model to enable debugging and also checkpoint
        the model input sample, model output.
        """
        # check that the model output is finite (no NaN / Inf values)
        model_output = task.last_batch.model_output
        if isinstance(model_output, list):
            has_nan = not torch.tensor(
                [torch.isfinite(x).all() for x in model_output]
            ).all()
        else:
            has_nan = not torch.isfinite(model_output).all()

        if has_nan:
            _, dist_rank = get_machine_local_and_dist_rank()
            logging.info(f"Infinite or NaN model output at iteration={task.iteration}.")

            # TODO - this code was broken during a refactoring: improve it
            from vissl.hooks.log_hooks import LogLossMetricsCheckpointHook

            LogLossMetricsCheckpointHook.checkpoint_model(
                task,
                world_size=self.world_size,
                mode_frequency=1,
                mode_num=task.iteration,
                mode="iteration",
            )
            model_output_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_model_output.pth"
            )
            input_sample_file = (
                f"{task.checkpoint_folder}/rank{dist_rank}_input_sample.pth"
            )
            with PathManager.open(model_output_file, "wb") as fwrite:
                torch.save(model_output, fwrite)
            with PathManager.open(input_sample_file, "wb") as fwrite:
                torch.save(task.last_batch.sample, fwrite)
            logging.info(f"Saved model output: {model_output_file}")
            logging.info(f"Saved model input: {input_sample_file}")
Example #3
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    The utility function that prepares all the hoooks that will be used in training
    based on user selection. Some basic hooks are used by default.

    Optional hooks:
        - Tensorboard hook,
        - loss specific hooks (swav loss, deepcluster loss, moco loss) used only when the
          loss is being used
        - model complexity hook (if user wants to compute model flops, activations, params)
          enable the hook via HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True

    Returns:
        hooks (List(functions)): list containing the hook functions that will be used
    """
    hooks = []

    # conditionally add hooks based on use-case
    if cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS:
        perf_stat_freq = (
            cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY
            if cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY > 0
            else None
        )
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))
    if cfg.LOSS.name == "swav_loss":
        hooks.extend([SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()])
    if cfg.LOSS.name == "swav_momentum_loss":
        hooks.extend(
            [
                SwAVMomentumHook(
                    cfg.LOSS["swav_momentum_loss"]["momentum"],
                    cfg.LOSS["swav_momentum_loss"]["momentum_eval_mode_iter_start"],
                    cfg.LOSS["swav_momentum_loss"]["crops_for_assign"],
                ),
                SwAVMomentumNormalizePrototypesHook(),
            ]
        )
    if cfg.LOSS.name == "deepclusterv2_loss":
        hooks.extend([InitMemoryHook(), ClusterMemoryHook()])
    if cfg.LOSS.name == "moco_loss":
        hooks.extend(
            [
                MoCoHook(
                    cfg.LOSS["moco_loss"]["momentum"],
                    shuffle_batch=(not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
                )
            ]
        )
    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.extend([SSLModelComplexityHook()])
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.extend([LogGpuStatsHook()])
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.extend([LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM)])
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install it "
            "with pip (`pip install tensorboard`) or with conda "
            "(`conda install -c conda-forge tensorboard`)."
        )
        tb_hook = get_tensorboard_hook(cfg)
        hooks.extend([tb_hook])
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        hooks.extend(
            [
                GradClipHook(
                    norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
                    max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
                )
            ]
        )

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = (
        cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ
        if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0
        else None
    )

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)
    hooks.extend(
        [
            CheckNanLossHook(),
            SetDataSamplerEpochHook(),
            FreezeParametersHook(),
            UpdateBatchesSeenHook(),
            UpdateTrainBatchTimeHook(),
            UpdateTestBatchTimeHook(),
            UpdateTrainIterationNumHook(),
            LogLossMetricsCheckpointHook(world_size),
            LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
        ]
    )
    return hooks
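A hedged usage sketch: in VISSL the `cfg` AttrDict comes from the YAML configuration, and the generated hooks are attached to the task before training starts. `build_config` and `task` are hypothetical stand-ins here; ClassyVision tasks expose `set_hooks` for attaching a list of hooks:

cfg = build_config("configs/pretrain/...")  # hypothetical config loader
hooks = default_hook_generator(cfg)
task.set_hooks(hooks)  # `task` is a ClassyTask built elsewhere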
Example #4
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    The utility function that prepares all the hoooks that will be used in training
    based on user selection. Some basic hooks are used by default.

    Optional hooks:
        - Tensorboard hook,
        - loss specific hooks (swav loss, deepcluster loss, moco loss) used only when the
          loss is being used
        - model complexity hook (if user wants to compute model flops, activations, params)
          enable the hook via MODEL.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True

    Returns:
        hooks (List(functions)): list containing the hook functions that will be used
    """
    hooks = []

    # conditionally add hooks based on use-case
    if cfg.MONITOR_PERF_STATS:
        perf_stat_freq = (
            cfg.PERF_STAT_FREQUENCY if cfg.PERF_STAT_FREQUENCY > 0 else None
        )
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))
    if cfg.LOSS.name == "swav_loss":
        hooks.extend([SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()])
    if cfg.LOSS.name == "swav_momentum_loss":
        hooks.extend(
            [
                SwAVMomentumHook(
                    cfg.LOSS["swav_momentum_loss"]["momentum"],
                    cfg.LOSS["swav_momentum_loss"]["momentum_eval_mode_iter_start"],
                    cfg.LOSS["swav_momentum_loss"]["crops_for_assign"],
                ),
                SwAVMomentumNormalizePrototypesHook(),
            ]
        )
    if cfg.LOSS.name == "deepclusterv2_loss":
        hooks.extend([InitMemoryHook(), ClusterMemoryHook()])
    if cfg.LOSS.name == "moco_loss":
        hooks.extend(
            [
                MoCoHook(
                    cfg.LOSS["moco_loss"]["momentum"],
                    shuffle_batch=(not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
                )
            ]
        )
    if cfg.MODEL.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.extend([SSLModelComplexityHook()])
    if cfg.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), "Tensorboard must be installed to use it."
        tb_hook = get_tensorboard_hook(cfg)
        hooks.extend([tb_hook])

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = cfg.ROLLING_BTIME_FREQ if cfg.ROLLING_BTIME_FREQ > 0 else None
    hooks.extend(
        [
            CheckNanLossHook(),
            SetDataSamplerEpochHook(),
            FreezeParametersHook(),
            UpdateBatchesSeenHook(),
            UpdateTrainBatchTimeHook(),
            UpdateTestBatchTimeHook(),
            UpdateTrainIterationNumHook(),
            LogLossMetricsCheckpointHook(),
            LogLossLrEtaHook(rolling_btime_freq),
            LogGpuStatsHook(),
        ]
    )
    return hooks
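Note that this older variant reads flat config keys (e.g. `cfg.MONITOR_PERF_STATS`, `cfg.TENSORBOARD_SETUP.USE_TENSORBOARD`), whereas the surrounding variants read keys nested under `cfg.HOOKS`. A small sketch of the difference, with illustrative contents and assuming AttrDict converts nested dicts recursively (which may not match the real constructor):

flat_cfg = AttrDict({"MONITOR_PERF_STATS": True, "PERF_STAT_FREQUENCY": 10})
nested_cfg = AttrDict(
    {"HOOKS": {"PERF_STATS": {"MONITOR_PERF_STATS": True, "PERF_STAT_FREQUENCY": 10}}}
)
assert flat_cfg.PERF_STAT_FREQUENCY == nested_cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY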
Example #5
def default_hook_generator(cfg: AttrDict) -> List[ClassyHook]:
    """
    The utility function that prepares all the hoooks that will be used in training
    based on user selection. Some basic hooks are used by default.

    Optional hooks:
        - Tensorboard hook,
        - loss specific hooks (swav loss, deepcluster loss, moco loss) used only when the
          loss is being used
        - model complexity hook (if user wants to compute model flops, activations, params)
          enable the hook via HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY = True

    Returns:
        hooks (List(functions)): list containing the hook functions that will be used
    """
    hooks = []

    # conditionally add hooks based on use-case
    if cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS:
        perf_stat_freq = (
            cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY
            if cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY > 0
            else None
        )
        hooks.append(LogPerfTimeMetricsHook(perf_stat_freq))

    # add the loss hooks based on the loss being used
    hooks = add_loss_hooks(hooks, cfg.LOSS, cfg)

    if cfg.HOOKS.MODEL_COMPLEXITY.COMPUTE_COMPLEXITY:
        hooks.extend([SSLModelComplexityHook()])
    if cfg.HOOKS.LOG_GPU_STATS:
        hooks.extend([LogGpuStatsHook()])
    if cfg.HOOKS.MEMORY_SUMMARY.PRINT_MEMORY_SUMMARY:
        hooks.extend([LogGpuMemoryHook(cfg.HOOKS.MEMORY_SUMMARY.LOG_ITERATION_NUM)])
    if cfg.HOOKS.MEMORY_SUMMARY.DUMP_MEMORY_ON_EXCEPTION:
        hooks.append(DumpMemoryOnException())
    if cfg.HOOKS.TENSORBOARD_SETUP.USE_TENSORBOARD:
        assert is_tensorboard_available(), (
            "Tensorboard must be installed to use it. Please install it "
            "with pip (`pip install tensorboard`) or with conda "
            "(`conda install -c conda-forge tensorboard`)."
        )
        tb_hook = get_tensorboard_hook(cfg)
        hooks.extend([tb_hook])
    if cfg.MODEL.GRAD_CLIP.USE_GRAD_CLIP:
        hooks.extend(
            [
                GradClipHook(
                    norm_type=cfg.MODEL.GRAD_CLIP.NORM_TYPE,
                    max_norm=cfg.MODEL.GRAD_CLIP.MAX_NORM,
                )
            ]
        )

    # hooks that are used irrespective of workflow type
    rolling_btime_freq = (
        cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ
        if cfg.HOOKS.PERF_STATS.ROLLING_BTIME_FREQ > 0
        else None
    )

    if CudaSynchronizeHook.is_enabled(cfg.MODEL):
        hooks.append(CudaSynchronizeHook())

    if ProfilingHook.is_enabled(cfg.PROFILING):
        hooks.append(ProfilingHook(profiling_config=cfg.PROFILING))

    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    checkpoint_folder = get_checkpoint_folder(cfg)

    hooks.extend(
        [
            SetDataSamplerEpochHook(),
            FreezeParametersHook(),
            LogLossMetricsCheckpointHook(world_size),
            LogLossLrEtaHook(checkpoint_folder, rolling_btime_freq),
        ]
    )

    if cfg.METERS.model_output_mask:
        hooks.extend([ModelOutputMaskHook()])

    if cfg.HOOKS.CHECK_NAN:
        hooks.extend([CheckNanLossHook(), CheckNanModelOutputHook(world_size)])

    return hooks
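This variant factors the loss-specific hooks out into `add_loss_hooks`, whose body is not shown above. A hedged reconstruction from the inline logic of the earlier variants (the real VISSL helper may differ):

def add_loss_hooks(hooks: List[ClassyHook], loss_cfg: AttrDict, cfg: AttrDict):
    # Dispatch on the configured loss name, mirroring the inline
    # branches of the earlier `default_hook_generator` variants.
    if loss_cfg.name == "swav_loss":
        hooks.extend([SwAVUpdateQueueScoresHook(), NormalizePrototypesHook()])
    if loss_cfg.name == "swav_momentum_loss":
        hooks.extend(
            [
                SwAVMomentumHook(
                    loss_cfg["swav_momentum_loss"]["momentum"],
                    loss_cfg["swav_momentum_loss"]["momentum_eval_mode_iter_start"],
                    loss_cfg["swav_momentum_loss"]["crops_for_assign"],
                ),
                SwAVMomentumNormalizePrototypesHook(),
            ]
        )
    if loss_cfg.name == "deepclusterv2_loss":
        hooks.extend([InitMemoryHook(), ClusterMemoryHook()])
    if loss_cfg.name == "moco_loss":
        hooks.append(
            MoCoHook(
                loss_cfg["moco_loss"]["momentum"],
                shuffle_batch=(not cfg.MODEL.SYNC_BN_CONFIG.CONVERT_BN_TO_SYNC_BN),
            )
        )
    return hooks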