Example #1
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        """
        Called at the end of every epoch if the tensorboard hook is
        enabled.
        Log model parameters and/or parameter gradients as set by the user
        in the tensorboard configuration. Also resets the CUDA memory counter.
        """
        # Log train/test accuracy
        if is_primary():
            phase_type = "Training" if task.train else "Testing"
            for meter in task.meters:
                if "accuracy" in meter.name:
                    for top_n, accuracies in meter.value.items():
                        for i, acc in accuracies.items():
                            tag_name = f"{phase_type}/Accuracy_{top_n}_Output_{i}"
                            self.tb_writer.add_scalar(
                                tag=tag_name,
                                scalar_value=round(acc, 5),
                                global_step=task.train_phase_idx,
                            )
        if not (self.log_params or self.log_params_gradients):
            return

        if is_primary() and task.train:
            # Log the weights and bias at the end of the epoch
            if self.log_params:
                for name, parameter in task.base_model.named_parameters():
                    self.tb_writer.add_histogram(
                        f"Parameters/{name}",
                        parameter,
                        global_step=task.train_phase_idx,
                    )
            # Log the parameter gradients at the end of the epoch
            if self.log_params_gradients:
                for name, parameter in task.base_model.named_parameters():
                    if parameter.grad is not None:
                        try:
                            self.tb_writer.add_histogram(
                                f"Gradients/{name}",
                                parameter.grad,
                                global_step=task.train_phase_idx,
                            )
                        except ValueError:
                            logging.info(
                                f"Gradient histogram empty for {name}, "
                                f"iteration {task.iteration}. Unable to "
                                f"log gradient."
                            )

            # Reset the GPU Memory counter
            if torch.cuda.is_available():
                torch.cuda.reset_max_memory_allocated()
                torch.cuda.reset_max_memory_cached()
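
Every snippet in this collection gates its logging or checkpointing on is_primary(). As a point of reference, a minimal sketch of what such a helper typically resolves to (an assumption here, not the exact VISSL/ClassyVision implementation) is a rank-0 check on the default process group:

import torch.distributed as dist


def is_primary() -> bool:
    # With no distributed setup (single process), treat the process as primary.
    if not dist.is_available() or not dist.is_initialized():
        return True
    # Otherwise only global rank 0 is the primary replica.
    return dist.get_rank() == 0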
Example #2
 def on_start(self, task) -> None:
     if not is_primary() or getattr(task, "test_only", False):
         return
     if not PathManager.exists(self.torchscript_folder):
         err_msg = "Torchscript folder '{}' does not exist.".format(
             self.torchscript_folder)
         raise FileNotFoundError(err_msg)
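
Since on_start raises if the folder is missing, the caller is expected to create it beforehand. A hypothetical setup step, assuming the same fvcore-style PathManager used in these hooks and a made-up path, could look like:

from fvcore.common.file_io import PathManager

torchscript_folder = "/checkpoints/torchscript"  # hypothetical path

# Create the output folder up front so the hook's existence check passes.
if not PathManager.exists(torchscript_folder):
    PathManager.mkdirs(torchscript_folder)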
Example #3
 def on_start(self, task: "tasks.ClassyTask") -> None:
     """
     Called at the start of training.
     """
     if self.log_activation_statistics and is_primary():
         self.activation_watcher.monitor(task.base_model)
         self.activation_watcher.set_iteration(task.iteration)
Example #4
 def on_start(self, task: "tasks.ClassyTask") -> None:
     """
     Logs GPU nvidia-smi stats to the logger streams.
     """
     if is_primary() and (task.device.type == "cuda"):
         # print the nvidia-smi stats
         log_gpu_stats()
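
log_gpu_stats itself is not shown in these snippets. A minimal stand-in (an assumption: it simply shells out to nvidia-smi and forwards the output to the logger) might be:

import logging
import subprocess


def log_gpu_stats() -> None:
    # Run nvidia-smi and log its stdout; swallow failures so a missing
    # binary does not crash training.
    try:
        output = subprocess.run(
            ["nvidia-smi"], capture_output=True, text=True, check=True
        ).stdout
        logging.info(output)
    except (FileNotFoundError, subprocess.CalledProcessError):
        logging.info("nvidia-smi not available")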
Example #5
    def extract(self, output_folder: str) -> None:
        """
        Extract workflow supports multi-gpu feature extraction. Since we are only extracting
        features, only the model is built (and initialized from a model weights file
        if specified by the user). The model is fully set to eval mode.

        The features are extracted for whatever data splits (train, val, test, etc.) the
        user wants.
        """
        # support feature extraction on gpu only.
        assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
        self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

        # Create distributed model
        self._add_dummy_layer()
        self.task.init_distributed_data_parallel_model()
        if is_primary():
            logging.info("Model is:\n {}".format(self.task.model))

        # Get the names of the features that we are extracting. If the user doesn't
        # specify the features to evaluate, we get the full model output and freeze
        # both head and trunk as a precaution.
        feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
        if len(feat_names) == 0:
            feat_names = ["heads"]

        for split in self.task.available_splits:
            logging.info(f"============== Split: {split} =======================")
            logging.info(f"Extracting features for partition: {split.lower()}")
            self.task.data_iterator = iter(self.task.dataloaders[split.lower()])
            self._extract_split_features(feat_names, self.task, split, output_folder)
            logging.info(f"Done getting features for partition: {split.lower()}")

        self._cleanup_task()
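
_cleanup_task is referenced but not defined in this snippet. Judging by the inline teardown at the end of Example #12 below, a sketch of such a helper (the free-function form and name are assumptions) would drop the data pipeline and force garbage collection:

import gc


def cleanup_extraction_task(task) -> None:
    # Release the data iterator and dataloaders so worker processes and
    # pinned buffers are freed once extraction is done.
    if hasattr(task, "data_iterator"):
        del task.data_iterator
        gc.collect()
    if hasattr(task, "dataloaders"):
        del task.dataloaders
        gc.collect()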
Example #6
 def on_step(self, task: "tasks.ClassyTask") -> None:
     """
     Print the nvidia-smi stats again to get a more accurate snapshot,
     useful for monitoring memory usage.
     """
     if (is_primary() and (task.device.type == "cuda")
             and task.local_iteration_num == 50):
         log_gpu_stats()
Example #7
    def on_phase_end(self, task) -> None:
        """
        Plot the metrics on visdom.
        """
        phase_type = task.phase_type
        metrics = self.metrics
        batches = len(task.losses)

        if batches == 0:
            return

        # Loss for the phase
        loss = sum(task.losses) / (batches * task.get_batchsize_per_replica())
        loss_key = phase_type + "_loss"
        if loss_key not in metrics:
            metrics[loss_key] = []
        metrics[loss_key].append(loss)

        # Optimizer LR for the phase
        optimizer_lr = task.optimizer.options_view.lr
        lr_key = phase_type + "_learning_rate"
        if lr_key not in metrics:
            metrics[lr_key] = []
        metrics[lr_key].append(optimizer_lr)

        # Calculate meters
        for meter in task.meters:
            if isinstance(meter.value, collections.abc.MutableMapping):
                flattened_meters_dict = flatten_dict(meter.value,
                                                     prefix=meter.name)
                for k, v in flattened_meters_dict.items():
                    metric_key = phase_type + "_" + k
                    if metric_key not in metrics:
                        metrics[metric_key] = []
                    metrics[metric_key].append(v)
            else:
                metric_key = phase_type + "_" + meter.name
                if metric_key not in metrics:
                    metrics[metric_key] = []
                metrics[metric_key].append(meter.value)

        # update learning curve visualizations:
        phase_type = "train" if task.train else "test"
        title = "%s-%s-%d" % (
            phase_type,
            task.base_model.__class__.__name__,
            task.base_model.model_depth,
        )
        title += self.title_suffix

        if not task.train and is_primary():
            logging.info("Plotting learning curves to visdom")
            plot_learning_curves(metrics,
                                 visdom_server=self.visdom,
                                 env=self.env,
                                 win=title,
                                 title=title)
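
flatten_dict is assumed to turn a nested meter value into flat prefix_key entries before they are appended to the metrics dict. A minimal hypothetical implementation consistent with how it is called above:

import collections.abc


def flatten_dict(value_dict, prefix=""):
    # Recursively flatten nested mappings, joining nesting levels with
    # underscores, e.g. {"top_1": {"0": 0.7}} -> {"accuracy_top_1_0": 0.7}
    # when prefix="accuracy".
    items = {}
    for key, value in value_dict.items():
        flat_key = f"{prefix}_{key}" if prefix else str(key)
        if isinstance(value, collections.abc.MutableMapping):
            items.update(flatten_dict(value, prefix=flat_key))
        else:
            items[flat_key] = value
    return items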
Example #8
    def extract(
        self,
        output_folder: str,
        extract_features: bool = True,
        extract_predictions: bool = False,
    ) -> None:
        """
        Extract workflow supports multi-gpu feature extraction and also extracting
        predicted labels. Since we are only extracting features or label predictions,
        only the model is built (and initialized from a model weights file
        if specified by the user). Optionally the meters are built if the labels
        are being extracted. The model is fully set to eval mode.

        The features / labels are extracted for whatever data splits (train, val, test)
        the user wants.
        """
        # support feature/label predictions extraction on gpu only.
        assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
        self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

        # Create distributed model
        self.task.add_dummy_layer()
        self.task.init_distributed_data_parallel_model()
        if is_primary():
            logging.info(f"Model is:\n {self.task.model}")

        # Get the names of the features that we are extracting. If the user doesn't
        # specify the features to evaluate, we get the full model output and freeze
        # both head and trunk as a precaution.
        feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
        if len(feat_names) == 0:
            feat_names = ["heads"]

        self.task.train = False
        self.task.run_hooks(SSLClassyHookFunctions.on_start.name)
        for split in self.task.available_splits:
            logging.info(
                f"============== Split: {split} =======================")
            self.task.data_iterator = iter(
                self.task.dataloaders[split.lower()])
            if extract_features:
                logging.info(
                    f"Extracting features for partition: {split.lower()}")
                self._extract_split_features(feat_names, self.task, split,
                                             output_folder)
                logging.info(
                    f"Done getting features for partition: {split.lower()}")
            if extract_predictions:
                logging.info(
                    f"Extracting predictions for partition: {split.lower()}")
                self._extract_split_label_predictions(feat_names, self.task,
                                                      split, output_folder)
                logging.info(
                    f"Done getting predictions for partition: {split.lower()}")
        self.task.run_hooks(SSLClassyHookFunctions.on_end.name)

        self._cleanup_task()
Example #9
 def _print_memory_summary(self, task: "tasks.ClassyTask", stage_name: str) -> None:
     if (
         is_primary()
         and (task.device.type == "cuda")
         and task.local_iteration_num == self.log_iteration_num
     ):
         logging.info(
             f"========= Memory Summary at {stage_name} ======="
             f"\n{torch.cuda.memory_summary()}\n"
         )
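
Note that reset_max_memory_allocated / reset_max_memory_cached, used in Examples #1 and #25, are deprecated aliases in recent PyTorch releases. A standalone sketch of the same peak-memory bookkeeping with the current API:

import logging

import torch


def log_and_reset_peak_memory(phase_idx: int) -> None:
    if not torch.cuda.is_available():
        return
    # max_memory_allocated reports the high-water mark since the last reset.
    peak_mb = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0)
    logging.info(f"Phase {phase_idx}: peak GPU memory {peak_mb:.0f} MB")
    # reset_peak_memory_stats supersedes the deprecated reset_max_memory_* calls.
    torch.cuda.reset_peak_memory_stats()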
Example #10
    def on_phase_start(self, task) -> None:
        """Create and display a progress bar with 0 progress."""
        if not progressbar_available:
            raise RuntimeError(
                "progressbar module not installed, cannot use ProgressBarHook")

        if is_primary():
            self.bar_size = task.num_batches_per_phase
            self.batches = 0
            self.progress_bar = progressbar.ProgressBar(self.bar_size)
            self.progress_bar.start()
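
The per-step update that drives the bar is not shown here. A standalone sketch of the start/update/finish lifecycle the hook presumably spreads across on_phase_start, on_step and on_phase_end (the update call is an assumption about the progressbar package's API):

import time

import progressbar

num_batches = 10
bar = progressbar.ProgressBar(num_batches)
bar.start()
for batch_idx in range(num_batches):
    time.sleep(0.01)  # stand-in for one training step
    bar.update(batch_idx + 1)  # advance to the number of finished batches
bar.finish()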
Example #11
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        for meter in task.ema_meters:
            meter.sync_state()

        if is_primary():
            LogLossMetricsCheckpointHook.print_and_save_meters(
                task,
                task.train_phase_idx,
                task.ema_meters,
                metric_key_name_suffix="ema",
            )
Example #12
    def extract(self):
        """
        Extract workflow supports multi-gpu feature extraction. Since we are only extracting
        features, only the model is built (and initialized from a model weights file
        if specified by the user). The model is fully set to eval mode.

        The features are extracted for whatever data splits (train, val, test, etc.) the
        user wants.
        """
        # support feature extraction on gpu only.
        assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
        self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

        # in case of feature evaluation mode, if we are freezing both trunk and
        # head, DDP won't work as there are no parameters in the model. Adding
        # a dummy head would make the features incorrect, so we instead add a
        # dummy layer to the model and use DDP. We copy the model to gpu
        # (if using gpus) after adding the new dummy layer.
        fully_frozen_model = self.task.base_model.is_fully_frozen_model()
        if fully_frozen_model:
            self.task.base_model.dummy_layer = torch.nn.Linear(4, 4)
            if self.task.device.type == "cuda":
                self.task.base_model = copy_model_to_gpu(self.task.base_model)
        self.task.init_distributed_data_parallel_model()

        if is_primary():
            logging.info("Model is:\n {}".format(self.task.model))

        # Get the names of the features that we are extracting. If the user doesn't
        # specify the features to evaluate, we get the full model output and freeze
        # both head and trunk as a precaution.
        feat_names = get_trunk_output_feature_names(self.cfg.MODEL)
        if len(feat_names) == 0:
            feat_names = ["heads"]

        features = {}
        for split in self.task.available_splits:
            logging.info(f"Extracting features for partition: {split.lower()}")
            self.task.data_iterator = iter(
                self.task.dataloaders[split.lower()])
            features[split.lower()] = self._get_split_features(
                feat_names, self.cfg, self.task)
            logging.info(
                f"Done getting features for partition: {split.lower()}")

        if hasattr(self.task, "data_iterator"):
            del self.task.data_iterator
            gc.collect()
        if hasattr(self.task, "dataloaders"):
            del self.task.dataloaders
            gc.collect()
        return features
Example #13
 def on_phase_end(self, task: "tasks.ClassyTask") -> None:
     """
     Called at the end of each phase. We log the metrics and also save the
     checkpoint, passing the mode (phase or iteration).
     """
     if is_primary():
         self._print_and_save_meters(task, task.train_phase_idx)
     checkpoint_frequency = task.config["CHECKPOINT"]["CHECKPOINT_FREQUENCY"]
     self._checkpoint_model(
         task,
         mode_frequency=checkpoint_frequency,
         mode_num=task.train_phase_idx,
         mode="phase",
     )
Example #14
    def on_phase_end(self, task) -> None:
        """Checkpoint the task every checkpoint_period phases.

        We do not necessarily checkpoint the task at the end of every phase.
        """
        if not is_primary() or task.phase_type not in self.phase_types:
            return

        self.phase_counter += 1
        if self.phase_counter % self.checkpoint_period != 0:
            return

        checkpoint_name = CheckpointHook.get_checkpoint_name(task.phase_idx)
        self._save_checkpoint(task, checkpoint_name)
Example #15
def load_and_broadcast_checkpoint(
        checkpoint_path: str,
        device: torch.device = CPU_DEVICE) -> Optional[Dict]:
    """Loads a checkpoint on master and broadcasts it to all replicas.

    This is a collective operation which needs to be run in sync on all replicas.

    See :func:`load_checkpoint` for the arguments.
    """
    if is_primary():
        checkpoint = load_checkpoint(checkpoint_path, device)
    else:
        checkpoint = None
    logging.info(f"Broadcasting checkpoint loaded from {checkpoint_path}")
    return broadcast_object(checkpoint)
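
broadcast_object is a project helper; a minimal sketch of the same collective using the stock torch.distributed API (an assumption, not the project's implementation):

from typing import Any

import torch.distributed as dist


def broadcast_object(obj: Any, src: int = 0) -> Any:
    # broadcast_object_list pickles the object on the source rank and fills in
    # the placeholder on every other rank; all replicas must call it together.
    object_list = [obj]
    dist.broadcast_object_list(object_list, src=src)
    return object_list[0]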
Example #16
    def on_phase_start(self, task: "tasks.ClassyTask") -> None:
        """
        Called at the start of every epoch if the tensorboard hook is
        enabled.
        Logs the model parameters once at the beginning of training only.
        """
        if not self.log_params:
            return

        # log the parameters just once, before training starts
        if is_primary() and task.train and task.train_phase_idx == 0:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}", parameter, global_step=-1
                )
Example #17
    def on_forward(self, task: "tasks.ClassyTask") -> None:
        """
        Called after every forward if tensorboard hook is enabled.
        Logs the model parameters if the training iteration matches the
        logging frequency.
        """
        if not self.log_params:
            return

        if (self.log_params_every_n_iterations > 0 and is_primary()
                and task.train
                and task.iteration % self.log_params_every_n_iterations == 0):
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(f"Parameters/{name}",
                                             parameter,
                                             global_step=task.iteration)
Example #18
    def init_distributed_data_parallel_model(self):
        """
        Initialize FSDP if needed.

        This method overrides the ClassificationTask class's method from ClassyVision.
        """
        if not is_distributed_training_run():
            return

        # Make sure the default cuda device is set. TODO (Min): we should enable
        # FSDP for 1-GPU training as well, but the use case there is likely different.
        # I.e. perhaps we use it for cpu_offloading.
        assert get_cuda_device_index(
        ) > -1, "Distributed training not setup correctly"

        # The model might be already wrapped by FSDP internally. Check regnet_fsdp.py.
        # Here, we wrap it at the outer most level.
        fsdp_config = self.config["MODEL"]["FSDP_CONFIG"]
        if is_primary():
            logging.info(f"Using FSDP, config: {fsdp_config}")

        # First, wrap the head's prototype_i layers if it is SWAV.
        # TODO (Min): make this more general for different models, which may have multiple
        #             heads.
        head0 = self.base_model.heads[0]
        if isinstance(head0, SwAVPrototypesHead):
            for j in range(head0.nmb_heads):
                module = getattr(head0, "prototypes" + str(j))
                module = FSDP(module=module, **fsdp_config)
                setattr(head0, "prototypes" + str(j), module)

        # TODO (Min): We can load checkpoint, but it ends up setting the trunk's _is_root
        # flag to true. We need to set it back to None here.
        # Also, right now, the head's weights are only partially loaded from the checkpoint
        # because we dump the checkpoint after the head is wrapped, but load it before
        # it is wrapped.
        # For very big models, we need to rework the checkpoint logic because we don't have
        # enough memory to load the entire model on one node. We need to use local_state_dict()
        # API to load checkpoint shards.
        for module in self.base_model.trunk.modules():
            if isinstance(module, FSDP):
                module._is_root = None

        # Then, wrap the whole model. We replace the base_model since it is used
        # when checkpoint is taken.
        self.base_model = FSDP(module=self.base_model, **fsdp_config)
        self.distributed_model = self.base_model
Example #19
    def on_update(self, task: "tasks.ClassyTask") -> None:
        """
        Executed after the parameter update. If the current phase is training,
        and it's a logging iteration, we compute and log several helpful training
        stats to keep track of ongoing training.

        For monitoring the batch time (average training iteration time), we allow
        monitoring the stats (optionally) every N iterations to get a better
        idea about the batch time and training ETA.

        Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
        ensuring that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
        """
        if is_primary() and task.train:
            # Only log during training and on primary
            self._log_training_epoch(task)
        task.additional_log_data.clear()
Example #20
    def on_phase_start(self, task) -> None:
        """Initialize losses and learning_rates."""
        self.learning_rates = []
        self.wall_times = []
        self.sample_fetch_times = []

        if not is_primary():
            return

        if torch.cuda.is_available():
            torch.cuda.reset_max_memory_allocated()

        # log the parameters before training starts
        if task.train and task.train_phase_idx == 0:
            for name, parameter in task.base_model.named_parameters():
                self.tb_writer.add_histogram(
                    f"Parameters/{name}", parameter, global_step=-1
                )
Example #21
 def __init__(self,
              checkpoint_folder: str,
              btime_freq: Optional[int] = None) -> None:
     """
     Args:
         checkpoint_folder: checkpoint directory where we will write the stdout.json
          btime_freq: if specified, also logs the rolling average batch time
                      over the last btime_freq batches.
     """
     super().__init__()
     self.btime_freq: Optional[int] = btime_freq
     self.json_stdout_logger = None
     if is_primary():
         self.json_stdout_logger = PathManager.open(
             f"{checkpoint_folder}/stdout.json",
             mode="a",
             buffering=10 * 1024,  # 10KB
         )
         atexit.register(self.json_stdout_logger.close)
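
Example #29 below shows how this buffered handle is consumed. A standalone sketch of the same append-one-JSON-object-per-line pattern (a plain open() stands in for the PathManager-backed file, and the path is hypothetical):

import atexit
import json

json_stdout_logger = open("/tmp/stdout.json", mode="a", buffering=10 * 1024)
atexit.register(json_stdout_logger.close)

# One JSON object per line; the 10KB buffer keeps frequent small writes cheap.
log_data = {"Rank": 0, "ep": 0, "iter": 1, "loss": 2.30259}
json_stdout_logger.write(json.dumps(log_data) + "\n")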
Example #22
    def _update(self, model, update_fn):
        base_model_state_dict = model.state_dict()
        for key, ema_params in self.module.state_dict().items():
            model_params = base_model_state_dict[key]

            if self.device is not None:
                model_params = model_params.to(device=self.device)
            if ema_params.dtype != torch.float32:
                # This is modification from original code.
                if self.first_run and is_primary():
                    logging.warning(
                        f"EMA: will be skipping key: {key} since it is of type: {ema_params.dtype}"  # NOQA
                    )
                value = model_params
            else:
                value = update_fn(ema_params, model_params)
            ema_params.copy_(value)

        self.first_run = False
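
The update_fn passed into _update is not shown. A typical exponential-moving-average rule (an assumed example, not necessarily the one used by this hook) would be:

import torch


def make_ema_update_fn(decay: float = 0.999):
    # Classic EMA: ema <- decay * ema + (1 - decay) * model, computed without
    # gradient tracking since these are bookkeeping tensors.
    @torch.no_grad()
    def update_fn(ema_params: torch.Tensor, model_params: torch.Tensor) -> torch.Tensor:
        return ema_params * decay + model_params * (1.0 - decay)

    return update_fn

With such a factory, self._update(model, make_ema_update_fn(0.999)) would blend the live weights into the EMA copy after each optimizer step.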
Example #23
    def on_start(self, task) -> None:
        """
        Plot the model on Tensorboard.
        """
        if is_primary():
            try:
                # Show model in tensorboard:
                logging.info("Showing model graph in TensorBoard...")

                plot_model(
                    task.base_model,
                    size=task.base_model.input_shape,
                    input_key=task.base_model.input_key if hasattr(
                        task.base_model, "input_key") else None,
                    writer=self.tb_writer,
                )
            except Exception:
                logging.warning("Unable to plot model to tensorboard")
                logging.debug("Exception encountered:", exc_info=True)
Example #24
 def _sync_and_print_meters(self, task):
     for meter in task.meters:
         meter.sync_state()
         logging.info("Meters synced")
     if is_primary():
         rank, _ = get_machine_local_and_dist_rank()
         for meter in task.meters:
             if len(task.meters) > 0 and (
                 (task.train and task.config["METERS"]["enable_training_meter"])
                 or (not task.train)
             ):
                 meter_value = meter.value
                 metric_key = f"{meter.name}"
                 if metric_key not in task.metrics:
                     task.metrics[metric_key] = []
                 task.metrics[metric_key].append(meter_value)
                 logging.info(
                     f"Rank: {rank}, name: {metric_key}, value: {meter_value}"
                 )
Example #25
    def on_phase_end(self, task: "tasks.ClassyTask") -> None:
        """
        Called at the end of every epoch if the tensorboard hook is
        enabled.
        Log model parameters and/or parameter gradients as set by the user
        in the tensorboard configuration. Also resets the CUDA memory counter.
        """
        if not (self.log_params or self.log_params_gradients):
            return

        if is_primary() and task.train:
            # Log the weights and bias at the end of the epoch
            if self.log_params:
                for name, parameter in task.base_model.named_parameters():
                    self.tb_writer.add_histogram(
                        f"Parameters/{name}",
                        parameter,
                        global_step=task.train_phase_idx,
                    )
            # Log the parameter gradients at the end of the epoch
            if self.log_params_gradients:
                for name, parameter in task.base_model.named_parameters():
                    if parameter.grad is not None:
                        try:
                            self.tb_writer.add_histogram(
                                f"Gradients/{name}",
                                parameter.grad,
                                global_step=task.train_phase_idx,
                            )
                        except ValueError:
                            logging.info(
                                f"Gradient histogram empty for {name}, "
                                f"iteration {task.iteration}. Unable to "
                                f"log gradient.")

            # Reset the GPU Memory counter
            if torch.cuda.is_available():
                torch.cuda.reset_max_memory_allocated()
                torch.cuda.reset_max_memory_cached()
Example #26
    def extract_clusters(self, output_folder: str) -> Dict[str, Dict[int, int]]:
        """
        Workflow to run multi-gpu cluster assignment extraction for pre-trained
        models based on clustering methods (SwAV, DeepCluster, etc.).

        The function returns a map from image index to cluster index for the
        whole dataset for each of the different splits.
        """

        # Support feature extraction on gpu only.
        assert self.task.device.type == "cuda", "Set MACHINE.DEVICE = gpu"
        self.task.prepare_extraction(pin_memory=self.cfg.DATA.PIN_MEMORY)

        # Assert that the model supports cluster extraction
        assert (
            self.task.base_model.is_clustering_model()
        ), "Extracting clusters is only available for cluster based pre-training methods"

        # Create distributed model
        self.task.add_dummy_layer()
        self.task.init_distributed_data_parallel_model()
        if is_primary():
            logging.info("Model is:\n {}".format(self.task.model))

        # Compute the cluster assignment on each worker in parallel
        cluster_assignment = {}
        for split in self.task.available_splits:
            msg = f"Extracting cluster assignment for partition: {split}"
            logging.info(msg)
            cluster_assignment[split] = self._get_cluster_assignment_for_split(
                self.task, split, output_folder=output_folder
            )
            logging.info("Done: " + msg)
        self._cleanup_task()

        # Merge the cluster assignments and group by cluster
        return self._merge_cluster_assignments(cluster_assignment)
Example #27
    def on_loss_and_meter(self, task: "tasks.ClassyTask") -> None:
        """
        Called after the loss and meters have been computed. On the primary
        replica, log the phase type, epoch, iteration, learning rate and loss.
        """
        if not is_primary():
            return
        phase_type = "train" if task.train else "test"

        train_phase_idx = task.train_phase_idx
        iteration = task.iteration

        loss_val = round(task.last_batch.loss.data.cpu().item(), 5)

        if isinstance(task.optimizer.options_view.lr, set):
            lr_val = list(task.optimizer.options_view.lr)
        else:
            lr_val = round(task.optimizer.options_view.lr, 5)

        log_str = (f"Phase Type: {phase_type}; "
                   f"[ep: {train_phase_idx}] "
                   f"iter: {iteration}; "
                   f"lr: {lr_val}; "
                   f"loss: {loss_val}; ")
        logging.info(log_str)
Example #28
    def _checkpoint_model(self,
                          task,
                          train_phase_idx,
                          mode_frequency,
                          mode_num,
                          mode="phase"):
        """
        Checkpoint the model. Can be called in 3 possible scenarios:
        1. If training becomes NaN, then we checkpoint the model to facilitate debugging
        2. After every N epochs (CHECKPOINT_FREQ), the model state is checkpointed.
        3. If the user wants to checkpoint during the epoch (i.e. after every few training
           iterations), the model state is checkpointed.

        Args:
            task: Self-supervision task that holds information about the training
                  iteration, epoch number etc.
            train_phase_idx (int): current training phase number. Starts from 0
            mode_frequency (int): mode can be "phase" or "iteration". Frequency
                                  of checkpointing for the given mode
            mode_num (int): for the checkpointing mode (phase or iteration), the number
                            of phase or iteration at which checkpointing is being done
        """
        phase_idx = task.phase_idx
        # num_train_phases = num_epochs * num_phases_per_epoch
        # For OSS use, num_train_phases will be equal to num_epochs
        num_train_phases = task.num_train_phases

        # check if we need to checkpoint this phase
        is_checkpointing_phase = is_checkpoint_phase(mode_num, mode_frequency,
                                                     train_phase_idx,
                                                     num_train_phases, mode)
        is_final_train_phase = ((train_phase_idx == (num_train_phases - 1))
                                and task.train and mode == "phase")

        # handle checkpoint:
        if task.train and (is_final_train_phase or is_checkpointing_phase):
            #  - if sharded state consolidate the state
            # /!\ All the ranks have to participate
            if hasattr(task.optimizer,
                       "consolidate_state_dict") and mode != "phase":
                logging.info(
                    f"[{mode}: {mode_num}] Consolidating sharded state on all replicas"
                )
                task.optimizer.consolidate_state_dict()

            # Depending on whether we are in FSDP mode or not
            # - save the checkpoint on the primary rank
            # - save the sharded checkpoint on all ranks
            if is_primary() or isinstance(task.base_model, FSDP):
                checkpoint_folder = task.checkpoint_folder
                logging.info(
                    f"[{mode}: {mode_num}] Saving checkpoint to {checkpoint_folder}"
                )
                model_state_dict = task.get_classy_state()

                # phase_idx is already incremented at the beginning of phase but if we
                # are checkpointing at an iteration in the middle of phase, we should not
                # save the incremented phase_idx as it will incorrectly assume that model
                # trained for that phase already.
                if mode == "iteration":
                    model_state_dict[
                        "phase_idx"] = model_state_dict["phase_idx"] - 1
                    if task.train:
                        train_phase_idx = train_phase_idx - 1
                        model_state_dict["train_phase_idx"] = train_phase_idx
                    restart_phase = phase_idx - 1
                    restart_iteration = task.iteration

                # When loading from a phase checkpoint:
                else:
                    restart_phase = phase_idx
                    restart_iteration = task.iteration

                checkpoint_content = {
                    "phase_idx": restart_phase,
                    "iteration": restart_iteration,
                    "loss": task.loss.state_dict(),
                    "iteration_num": task.local_iteration_num,
                    "train_phase_idx": train_phase_idx,
                    "classy_state_dict": model_state_dict,
                }

                checkpoint_writer = CheckpointWriter(
                    checkpoint_folder=checkpoint_folder,
                    is_final_train_phase=is_final_train_phase,
                    mode=mode,
                    mode_num=mode_num,
                    backend=task.config["CHECKPOINT"]["BACKEND"],
                )

                if isinstance(task.base_model, FSDP):
                    _, rank = get_machine_local_and_dist_rank()
                    checkpoint_writer.save_sharded_checkpoint(
                        content=checkpoint_content,
                        shard_rank=rank,
                        world_size=self.world_size,
                    )
                else:
                    checkpoint_writer.save_consolidated_checkpoint(
                        checkpoint_content)
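
is_checkpoint_phase is referenced but not defined in these snippets. A hypothetical predicate consistent with how it is called above (checkpoint every mode_frequency phases or iterations; the caller already handles the final-phase case separately) might look like:

def is_checkpoint_phase(mode_num, mode_frequency, train_phase_idx,
                        num_train_phases, mode):
    # The extra arguments are kept only to match the call signature above.
    # Checkpoint whenever the phase/iteration counter hits a multiple of the
    # configured frequency; a non-positive frequency disables checkpointing.
    if mode_frequency <= 0:
        return False
    return mode_num % mode_frequency == 0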
Example #29
    def on_update(self, task: "tasks.ClassyTask") -> None:
        """
        Executed after the parameter update. If the current phase is training,
        and it's a logging iteration, we compute and log several helpful training
        stats to keep track of ongoing training.

        For monitoring the batch time (average training iteration time), we allow
        monitoring the stats (optionally) every N iterations to get a better
        idea about the batch time and training ETA.

        Set the btime_freq input using cfg.HOOKS.PERF_STATS.PERF_STAT_FREQUENCY=N
        ensuring that cfg.HOOKS.PERF_STATS.MONITOR_PERF_STATS = True.
        """
        phase_type = "train" if task.train else "test"
        if is_primary() and phase_type == "train":
            train_phase_idx = task.train_phase_idx
            log_freq = task.config["LOG_FREQUENCY"]
            iteration = task.iteration

            if torch.cuda.is_available():
                peak_mem_used = int(torch.cuda.max_memory_allocated() /
                                    1024.0 / 1024.0)
            else:
                peak_mem_used = -1

            if ((iteration == 1) or (iteration % log_freq == 0)
                    or (iteration <= 100 and iteration % 5 == 0)):
                loss_val = round(task.last_batch.loss.data.cpu().item(), 5)
                if len(task.batch_time) > 0:
                    batch_times = task.batch_time
                else:
                    batch_times = [0]
                avg_time = sum(batch_times) / len(batch_times)

                eta_secs = avg_time * (task.max_iteration - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
                if isinstance(task.optimizer.options_view.lr, set):
                    lr_val = list(task.optimizer.options_view.lr)
                else:
                    lr_val = round(task.optimizer.options_view.lr, 5)
                batch_time = int(1000.0 * avg_time)
                rank = get_rank()
                log_data = {
                    "Rank": rank,
                    "ep": train_phase_idx,
                    "iter": iteration,
                    "lr": lr_val,
                    "loss": loss_val,
                    "btime(ms)": batch_time,
                    "eta": eta_string,
                    "peak_mem(M)": peak_mem_used,
                }

                if iteration == 1:
                    # Set max iterations. Currently used in benchmark_suite_scheduler.py
                    log_data["max_iterations"] = task.max_iteration

                if self.btime_freq and len(batch_times) >= self.btime_freq:
                    rolling_avg_time = (sum(batch_times[-self.btime_freq:]) /
                                        self.btime_freq)
                    rolling_eta_secs = int(rolling_avg_time *
                                           (task.max_iteration - iteration))
                    rolling_eta_str = str(
                        datetime.timedelta(seconds=int(rolling_eta_secs)))
                    rolling_btime = int(1000.0 * rolling_avg_time)
                    log_data[
                        f"btime({self.btime_freq}iters)(ms)"] = rolling_btime
                    log_data["rolling_eta"] = rolling_eta_str

                # to maintain backwards compatibility with the log.txt logs,
                # we convert the json to the previous format. stdout.json can
                # be used to consume the logs in json format instead.
                stdout_data = ""
                for key, value in log_data.items():
                    stdout_data = (f"{stdout_data}[{key}: {value}] "
                                   if key == "ep" else
                                   f"{stdout_data}{key}: {value}; ")
                logging.info(stdout_data.strip())
                self.json_stdout_logger.write(json.dumps(log_data) + "\n")
Example #30
 def on_phase_end(self, task) -> None:
     """Clear the progress bar at the end of the phase."""
     if is_primary() and self.progress_bar is not None:
         self.progress_bar.finish()