Example #1
0
class BaseHook:
    __metaclass__ = ABCMeta

    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        profiler_config_parser: ProfilerConfigParser,
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys,
                                                     SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the
        training process. This takes the form appropriate for the framework
        such as tf.train.SessionRunHook for TF, Callback for keras...

        Parameters
        ----------
        collection_manager : CollectionManager object
            manager holding the collections. The `default` and `all`
            collections are looked up on it when `include_regex` or
            `save_all` are used.
        default_include_collections : list of str
            collection names that are always merged into `include_collections`.
        profiler_config_parser : ProfilerConfigParser object
            profiler configuration. `load_config()` is called on it here and
            it is handed to the timeline writer.
        init_step : int
            step number the hook starts from. It is replaced by the last
            seen step when a previously saved state is found.
        out_dir : str
            represents a path into which outputs will be written to. The hook raises error if the 'out_dir' already
            exists. The implementation does not support merging the tensors generated in current job with tensors
            from previous job. Hence, ensure that the 'out_dir' does not exist.
        export_tensorboard : bool
        tensorboard_dir : str
            both are passed, together with the resolved out_dir, to
            get_tensorboard_dir. When the result is None an info message is
            logged and no tensorboard summaries are exported.
        dry_run : bool
            when dry run is set, behavior is only described in the log file.
            tensors are not actually saved.
        reduction_config : ReductionConfig object
            if passed, this reduction config object is used
            as default for all tensors included.
            A collection can have its own reduction config object
            which overrides this for its tensors. if this is not passed,
            a ReductionConfig with save_raw_tensor=True is used.
        save_config : SaveConfig object
            Takes save config object which is applied as default for all included tensors.
            A collection can optionally have its own saveconfig object
            which overrides this for its tensors.
        include_regex : list of str
            takes as input the list of string representing regular expressions. Tensors whose names match
            these regular expressions will be saved. These tensors will be available as part of the `default`
            collection.
        include_collections : list of str representing collection names
            takes as input the collections which should be saved.
            The names in `default_include_collections` are always added.
        save_all : bool
            a shortcut for saving all tensors in the model.
            they are all saved in the collection `all`
        include_workers : str
            "all" makes the hook save data from all workers. Any other
            value restricts writing to the chief worker.
        """
        error_handling_agent.set_hook(
            self)  # This should be the first line in the constructor.
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )

        self.dry_run = dry_run
        self.worker = None
        # when smdebug is used during an unsupported dist training process
        # we write data only from the process that has self.first_process set to True.
        self.first_process = None
        self.save_all_workers = include_workers == "all"
        self.chief_worker = DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        # The defaults are always part of the result, and duplicates are dropped.
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections)))

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.init_step = init_step

        # The written_tensor_name_for_step dictionary stores
        # the names of each tensor saved for every step.
        # This is to detect name clashes.
        # If a name clash is detected, it is avoided by appending
        # an index to the tensor name.
        self.written_tensor_name_for_step = defaultdict(int)

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(
                CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (CollectionKeys.DEFAULT not in self.include_collections and
                collection_manager.get(CollectionKeys.DEFAULT).include_regex):
            # `warning` is used because `warn` is a deprecated alias.
            self.logger.warning("The `default` collection was not passed to "
                                "include_collections. So it is not being saved")

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        self.profiler_config_parser = profiler_config_parser
        self.profiler_config_parser.load_config()

        self.timeline_writer = TimelineFileWriter(
            profiler_config_parser=profiler_config_parser)
        self.hvd_reader = None
        self.is_smdataparallel_profiling = False

        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info("Saving to {}".format(self.out_dir))
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()
        self.custom_tensors_to_save = dict()

    # This will avoid pickling of BaseHook object
    def __getstate__(self):
        """Return an empty state so that no hook attribute is pickled."""
        return dict()

    def _initialize_to_last_saved_state(self):
        """Create the state store and resume counters from its last saved state.

        When the store reports no previous state the hook keeps the values
        set by the constructor. Otherwise the last saved step, the initial
        step, the training run index and the per-mode step counters are
        restored from the saved state.
        """
        self.state_store = StateStore()
        previous = self.state_store.get_last_saved_state()
        if previous is None:
            return

        self.last_saved_step = previous[LATEST_GLOBAL_STEP_SAVED]
        self.init_step = previous[LATEST_GLOBAL_STEP_SEEN]
        # A resumed job counts as the next training run.
        self.training_run = 1 + previous[TRAINING_RUN]
        # Modes are stored by name, so convert them back to ModeKeys members.
        for mode_name, mode_step in previous[LATEST_MODE_STEP].items():
            self.mode_steps[ModeKeys[mode_name]] = mode_step
        self.mode_steps[ModeKeys.GLOBAL] = self.init_step
        self.step = self.init_step
        self.logger.info(
            f"Initialized the hook with the last saved state: last_saved_step={self.last_saved_step} init_step = {self.init_step}, step = {self.step} mode_steps = {str(self.mode_steps)}"
        )

    def __repr__(self):
        """Return a multi-line summary of the hook's main settings."""
        cls = self.__class__
        header = f"<{cls.__module__}.{cls.__name__} object at {hex(id(self))}>:(\n"
        # One line per reported attribute, in a fixed order.
        fields = [
            f"    out_dir={self.out_dir},\n",
            f"    tensorboard_dir={self.tensorboard_dir},\n",
            f"    step={self.step},\n",
            f"    mode={self.mode},\n",
            f"    mode_steps={self.mode_steps},\n",
            f"    include_collections={self.include_collections},\n",
            f"    writer={self.writer},\n",
            # Only the first 200 characters of the save config are shown.
            f"    save_config={str(self.save_config)[:200]} ...>,\n",
            f"    reduction_config={str(self.reduction_config)},\n",
            f"    save_all={self.save_all},\n",
            f"    dry_run={self.dry_run},\n",
        ]
        return header + "".join(fields) + ")"

    @classmethod
    def create_from_json_file(cls, json_file_path=None):
        """Build a hook of this class from a JSON configuration file.

        The work is delegated to create_hook_from_json_config, which
        receives `json_file_path` as its `json_config_path`. As documented
        for this method:

        First, check json_file_path. If it's not None,
            If the file exists, use that.
            If the file does not exist, throw an error.
        Otherwise, check the filepath set by a SageMaker environment variable.
            If the file exists, use that.
        Otherwise,
            return None.
        """
        hook = create_hook_from_json_config(cls,
                                            json_config_path=json_file_path)
        return hook

    @abstractmethod
    def _get_worker_name(self):
        """Framework-specific hook point; the base implementation returns None.

        NOTE(review): presumably subclasses return the name identifying the
        current worker -- confirm against the framework hooks.
        """
        return None

    @abstractmethod
    def _get_num_workers(self):
        """Framework-specific hook point; the base implementation returns None.

        Callers in this class compare the result with 1 and pass it to
        `collection_manager.set_num_workers`, so subclasses are expected to
        return the number of workers.
        """
        return None

    @abstractmethod
    def _is_not_supported(self):
        """Framework-specific hook point; the base implementation returns None.

        A truthy result makes `_write_scalars` drop the cached scalars
        without writing them.
        """
        return None

    #### Save Manager methods ####

    def _should_collection_be_saved(self, coll_name: str) -> bool:
        """Return True when `coll_name` is one of the included collections."""
        included = self.include_collections
        return coll_name in included

    def _assert_prep(self):
        """Raise AssertionError unless the collections have been prepared.

        The error is raised explicitly rather than through an `assert`
        statement, so the guard also holds when Python runs with
        optimizations enabled (`-O`), which strips `assert` statements.
        The exception type and message are unchanged.
        """
        if not self.prepared_collections:
            raise AssertionError("Collections have not been prepared yet")

    def _get_all_collections_to_save(self) -> Set["Collection"]:
        """Return the set of collections selected for saving.

        Fails via `_assert_prep` when the collections are not prepared yet.
        """
        self._assert_prep()
        selected = self._collections_to_save
        return selected

    @error_handling_agent.catch_smdebug_errors(default_return_val=False)
    def _is_collection_being_saved_for_step(self, name):
        """Tell whether the collection called `name` is saved at the current step."""
        # if saving all, all collections will be part of the step's collections
        active = self._get_collections_to_save_for_step()
        target = self.collection_manager.get(name)
        return target in active

    def _get_collections_to_save_for_step(self) -> Set["Collection"]:
        """Return the collections whose save config selects the current step.

        The result is cached on the instance until it is reset to None
        (done by `_increment_step` and `set_mode`). In EVAL and PREDICT
        modes the gradients and optimizer-variables collections are skipped.
        """
        if self._collections_to_save_for_step is not None:
            return self._collections_to_save_for_step

        self._assert_prep()
        self._collections_to_save_for_step = set()
        skipped_names = (CollectionKeys.GRADIENTS,
                         CollectionKeys.OPTIMIZER_VARIABLES)
        for candidate in self._get_all_collections_to_save():
            if (self.mode in (ModeKeys.EVAL, ModeKeys.PREDICT)
                    and candidate.name in skipped_names):
                continue
            if candidate.save_config.should_save_step(
                    self.mode, self.mode_steps[self.mode]):
                self._collections_to_save_for_step.add(candidate)

        if self._collections_to_save_for_step:
            step_str = (
                f"for step {self.step}"
                if self.mode == ModeKeys.GLOBAL else
                f"for step {self.mode_steps[self.mode]} of mode {self.mode.name}"
            )
            self.logger.debug(
                f"Saving the collections "
                f"{', '.join([x.name for x in self._collections_to_save_for_step])} {step_str}"
            )
        return self._collections_to_save_for_step

    def _is_tensor_saved_for_step(self, tensor_name):
        """Return True when any collection saved this step matches `tensor_name`.

        Matching is done with `match_inc` against each collection's
        `include_regex`.
        """
        return any(
            match_inc(tensor_name, coll.include_regex)
            for coll in self._get_collections_to_save_for_step())

    def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]:
        """Return the saved collections that contain `tensor_name`.

        The answer is cached per tensor name in `tensor_to_collections`.
        A collection that matches only through its include regex gets the
        tensor name added to it as a side effect.
        """
        self._assert_prep()
        # for tf this will be prepopulated in check_and_add_tensor
        if tensor_name in self.tensor_to_collections:
            return self.tensor_to_collections[tensor_name]

        # for mxnet it is computed and then cached
        owners = set()
        for candidate in self._collections_to_save:
            # if being matched as reduction,
            # it must be in reduction_tensor_name, not with regex
            if tensor_name not in candidate.tensor_names:
                if not match_inc(tensor_name, candidate.include_regex):
                    continue
                candidate.add_tensor_name(tensor_name)
            owners.add(candidate)
        self.tensor_to_collections[tensor_name] = owners
        return self.tensor_to_collections[tensor_name]

    @abstractmethod
    def _get_default_collections(self):
        """Framework-specific hook point; the base implementation returns None.

        `_prepare_collections` tests collection names for membership in the
        result, so subclasses are expected to return a container of
        collection names.
        """
        return None

    def has_default_hook_configuration(
            self, default_saved_collections=DEFAULT_SAVED_COLLECTIONS):
        """Tell whether exactly the default collections are being saved.

        Used in the internal framework forks to determine if the hook
        is using the default hook configuration. Collections are prepared
        first when that has not happened yet.
        """
        if not self.prepared_collections:
            self._prepare_collections()

        saved_names = {coll.name for coll in self._collections_to_save}
        return saved_names == set(default_saved_collections)

    def _has_default_profiler_configuration(self):
        """Return True when the profiler config parser holds no config."""
        parsed_config = self.profiler_config_parser.config
        return parsed_config is None

    def has_default_configuration(self):
        """Tell whether both the hook and the profiler use default settings."""
        hook_is_default = self.has_default_hook_configuration()
        # The profiler check only runs when the hook check passed.
        return hook_is_default and self._has_default_profiler_configuration()

    def _prepare_collections(self):
        """Populate collections_to_save and ensure every collection has
        a save_config and reduction_config.

        Raises InvalidCollectionConfiguration for a non-default collection
        that has neither an include regex nor tensor names, and TypeError
        for a collection whose save_config is neither None nor a SaveConfig.
        """
        for name, coll in self.collection_manager.get_collections().items():
            is_default = name in self._get_default_collections()
            if not is_default and not coll.include_regex and not coll.tensor_names:
                raise InvalidCollectionConfiguration(name)
            if coll in self._collections_to_save:
                continue
            # Including the `all` collection selects every collection.
            if (self._should_collection_be_saved(CollectionKeys.ALL)
                    or self._should_collection_be_saved(name)):
                self._collections_to_save.add(coll)

        self.logger.info(
            f'Monitoring the collections: {", ".join([x.name for x in self._collections_to_save])}'
        )
        # Populate configs_for_collections and reduction_config
        for name, coll in self.collection_manager.get_collections().items():
            if name in NON_HISTOGRAM_COLLECTIONS:
                coll.save_histogram = False

            if coll.save_config is None:
                # Set to the default if None
                coll.save_config = self.save_config
            elif not isinstance(coll.save_config, SaveConfig):
                raise TypeError(
                    f"save_config={coll.save_config} must be None or SaveConfig")
            else:
                # Otherwise, set missing modes to the defaults
                coll.save_config.merge_default_save_config(self.save_config)

            if name in NON_REDUCTION_COLLECTIONS:
                coll.reduction_config = ReductionConfig(save_raw_tensor=True)
            elif coll.reduction_config is None:
                coll.reduction_config = self.reduction_config

        self.prepared_collections = True

    #### End of Save Manager methods ####
    @staticmethod
    def _close_given_writer_map(writer_dict):
        """Close every writer in `writer_dict`, then empty the mapping in place.

        Entries are removed only after all writers were closed.
        """
        # close calls flush
        for open_writer in writer_dict.values():
            open_writer.close()

        # Delete all the dist training writers
        for key in list(writer_dict):
            del writer_dict[key]

    def _close_writers(self) -> None:
        """Flush cached scalars and close the main and tensorboard writers.

        Does nothing in dry-run mode.
        """
        if self.dry_run:
            return

        # flush out sm_metric scalars to metrics file
        self._write_scalars()

        # Read the writer only now: writing scalars may have created it.
        main_writer = self.writer
        if main_writer is not None:
            main_writer.flush()
            main_writer.close()
            self.writer = None

        self._close_given_writer_map(self.tb_writers)

    def _initialize_writers(self, only_initialize_if_missing=False) -> None:
        """Create the main FileWriter for the current step when this process may write.

        No writer is created when:
          * `only_initialize_if_missing` is set and a writer already exists,
          * the hook is in dry-run mode,
          * this process was already marked as not being the first process,
          * `_get_num_workers()` is 1 and `is_first_process` reports that
            this process is not the first one for `out_dir`,
          * only the chief worker saves and this worker is not the chief.

        :param only_initialize_if_missing: keep an existing writer and do
            not replace it
        """
        # Function is overridden in smdebug/tensorflow/base_hook.py
        if only_initialize_if_missing and self.writer:
            return
        if self.dry_run:
            return
        if self.first_process is False:
            return
        elif self.first_process is None:
            if self._get_num_workers() == 1:
                if is_first_process(self.out_dir):
                    self.first_process = True
                    self.logger.info(
                        f"Hook is writing from the hook with pid: {os.getpid()}\n"
                    )
                else:
                    # NOTE(review): this re-check looks redundant inside the
                    # enclosing `elif`; kept to preserve behavior.
                    if self.first_process is None:
                        # `warning` is used because `warn` is a deprecated alias.
                        self.logger.warning(
                            f"Unsupported Distributed Training Strategy Detected. \
                            Sagemaker-Debugger will only write from one process. \
                            The process with pid: {os.getpid()} will not be writing any data. \n"
                        )
                    self.first_process = False
                    return

        if self.save_all_workers is False:
            if self.worker != self.chief_worker:
                return

        self.writer = FileWriter(trial_dir=self.out_dir,
                                 step=self.step,
                                 worker=self.worker)

    def _get_main_writer(self) -> List[FileWriter]:
        """Return the main writer wrapped in a list, or an empty list if unset."""
        if self.writer:
            return [self.writer]
        return []

    def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
        """
        Return the writers to use for a tensor. The list is empty when only
        the chief worker saves and this worker is not the chief.

        :param tensor_name: not used by this base implementation
        :param tensor_ref: used by TF
        :return: List[FileWriter]
        """
        if self.save_all_workers is False:
            if self.worker != self.chief_worker:
                return []
        return self._get_main_writer()

    def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
        """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.

        One writer is kept per mode in `tb_writers`; it is created on first
        use for the current mode.
        """
        if not self.tensorboard_dir:
            return None

        if self.mode not in self.tb_writers:
            self.tb_writers[self.mode] = FileWriter(
                trial_dir=self.tensorboard_dir,
                step=self.step,
                worker=get_tb_worker(),
                write_checksum=True,
                wtype="tensorboard",
                mode=self.mode,
            )
            return self.tb_writers[self.mode]

        # would be there if set_mode was called
        cached = self.tb_writers[self.mode]
        assert cached is not None
        return cached

    def _close_tb_writer(self):
        """Close and forget the tensorboard writer of the current mode, if any.

        Does nothing in dry-run mode.
        """
        if self.dry_run or self.mode not in self.tb_writers:
            return

        # Close before deleting so a failing close leaves the entry in place.
        self.tb_writers[self.mode].close()
        del self.tb_writers[self.mode]

    def close(self):
        """Public entry point that runs the hook's cleanup."""
        self._cleanup()
        return None

    def log_outstanding_timeline_metrics(self):
        """No-op in the base hook; called by `_cleanup` before the timeline
        writer is closed."""
        return None

    def _cleanup(self):
        """Close all writers and mark the training as ended for `out_dir`.

        Registered with `atexit` by the constructor and also reachable
        through `close()`.
        """
        self._close_writers()

        metrics_writer = self.metrics_writer
        if metrics_writer:
            metrics_writer.close()
        self.log_outstanding_timeline_metrics()
        self.timeline_writer.close()

        # close the Horovod file reader thread if it has been enabled
        hvd_reader = self.hvd_reader
        if hvd_reader and hvd_reader.enabled:
            hvd_reader.close()

        training_has_ended(self.out_dir)
        # Only the process that was marked as first removes the claim file.
        if self.first_process is True:
            remove_claim_file(self.out_dir)

    def _increment_step(self):
        """Persist the state, then advance the global and per-mode step counters.

        Also clears the per-step record of written tensor names and resets
        the cached set of collections to save for the step.
        """
        # Update the last_state to the last step number that was saved or seen
        self._write_state()

        self.step = self.step + 1
        self.mode_steps[self.mode] = self.mode_steps[self.mode] + 1
        self.written_tensor_name_for_step.clear()

        # Increment Global step number irrespective of what mode it is
        if self.mode != ModeKeys.GLOBAL:
            self.mode_steps[ModeKeys.GLOBAL] = self.step
        self._collections_to_save_for_step = None

    # Called in the internal AWS codebase to determine
    # if a particular tensor value should be saved
    @error_handling_agent.catch_smdebug_errors()
    def should_save_tensor_or_collection(self, tensor_name: str,
                                         collection_name: str) -> bool:
        """Decide whether a tensor value should be saved at the current step.

        For the "gradients" collection the tensor name is first rewritten to
        "gradients/<name before the first ':'>Grad". When the collection is
        saved this step, the tensor must match the collection's include
        regex, or the regex list must be empty. Otherwise the tensor is
        checked against every collection saved this step.
        """
        if self.prepared_collections is False:
            # always return false if an attempt to save a
            # tensor is made before the collections are prepared.
            # this can happen if the fn is called before callbacks are init.
            self.logger.warning(
                "Tensors cannot be saved with smdebug before callbacks are initialized."
            )
            return False

        if collection_name == "gradients":
            layer_name = tensor_name.partition(":")[0]
            tensor_name = "gradients/" + layer_name + "Grad"

        if not self._is_collection_being_saved_for_step(collection_name):
            return self._is_tensor_saved_for_step(tensor_name)

        coll = self.collection_manager.get(collection_name)
        return match_inc(tensor_name,
                         coll.include_regex) or coll.include_regex == []

    def _write_state(self):
        """Store the current counters in the state store.

        Nothing is written unless the state store reports an updated
        checkpoint. Mode steps are stored keyed by mode name.
        """
        if not self.state_store.is_checkpoint_updated():
            return

        snapshot = {
            TRAINING_RUN: self.training_run,
            LATEST_GLOBAL_STEP_SAVED: self.last_saved_step,
            LATEST_GLOBAL_STEP_SEEN: self.step,
            LATEST_MODE_STEP: {
                mode.name: step
                for mode, step in self.mode_steps.items()
            },
        }
        self.state_store.update_state(snapshot)

    def save_tensor(self,
                    tensor_name,
                    tensor_value,
                    collections_to_write=CollectionKeys.DEFAULT):
        """Queue a custom tensor to be written after the step.

        The value is stored in `custom_tensors_to_save` under `tensor_name`
        and written by `_save_custom_tensors_post_step`. A value rejected by
        `validate_custom_tensor_value` is skipped with a warning.

        :param tensor_name: name under which the tensor is saved
        :param tensor_value: value to save
        :param collections_to_write: a collection name or an iterable of
            collection names
        """
        if validate_custom_tensor_value(tensor_value,
                                        self._make_numpy_array) is False:
            # `warning` is used because `warn` is a deprecated alias.
            self.logger.warning(
                "The tensor value could not be converted into a numpy value")
            return
        if isinstance(collections_to_write, str):
            collections_to_write = [collections_to_write]
        # NOTE(review): every iteration assigns to the same key, so only the
        # last collection name is retained when several are passed. The
        # storage format is left unchanged because other code may read
        # `custom_tensors_to_save` -- confirm before changing it.
        for collection in collections_to_write:
            self.custom_tensors_to_save[tensor_name] = (tensor_value,
                                                        collection)

    def _save_custom_tensors_post_step(self):
        """Write every tensor queued by `save_tensor`, then empty the queue.

        Each tensor is added to its collection, which is created when it
        does not exist yet.
        """
        for name, queued in self.custom_tensors_to_save.items():
            value, collection_name = queued
            target = self.collection_manager.get(collection_name, create=True)
            target.add_tensor_name(name)
            self._write_raw_tensor(name, value, [target])
        self.custom_tensors_to_save.clear()

    def set_mode(self, mode):
        """Switch the hook to `mode`.

        A mode seen for the first time starts counting at `init_step`. The
        cached set of collections to save for the step is reset.

        :raises ValueError: when `mode` is not one of ALLOWED_MODES
        """
        if mode not in ALLOWED_MODES:
            raise ValueError("Invalid mode {}. Valid modes are {}.".format(
                mode, ",".join(ALLOWED_MODE_NAMES)))
        self.mode = mode

        if mode not in self.mode_steps:
            self.mode_steps[mode] = self.init_step

        self._collections_to_save_for_step = None

    def export_collections(self):
        """Export the collections to `<worker>_collections.json` in `out_dir`.

        Nothing is exported when:
          * `_get_num_workers()` is 1 and this process was marked as not
            being the first process (a warning is logged), or
          * only the chief worker saves and this worker is not the chief.

        When only the chief worker saves, the exported number of workers is
        forced to 1.
        """
        num_workers = self._get_num_workers()
        if num_workers == 1 and self.first_process is False:
            # `warning` is used because `warn` is a deprecated alias.
            self.logger.warning(
                f"Unsupported Distributed Training Strategy Detected. \
                Sagemaker-Debugger will only write from one process. \
                The process with pid: {os.getpid()} will not be writing any data. \n"
            )
            return
        if self.save_all_workers is False:
            if self.chief_worker != self.worker:
                return
            num_workers = 1  # Override
        self.collection_manager.set_num_workers(num_workers)
        collection_file_name = f"{self.worker}_collections.json"
        self.collection_manager.export(self.out_dir, collection_file_name)

    def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs):
        """Return the name under which a reduction of `tensor_name` is stored.

        Delegates to `get_reduction_tensor_name` with
        `remove_colon_index=True`.
        """
        reduced_name = get_reduction_tensor_name(tensor_name,
                                                 reduction_name,
                                                 abs,
                                                 remove_colon_index=True)
        return reduced_name

    def _write_reduction(self,
                         tensor_name,
                         tensor_value,
                         reduction_name,
                         abs,
                         tensor_ref=None):
        """Compute one reduction of a tensor and write it under its reduction name.

        A ValueError raised while computing or writing is logged as a
        warning and not propagated.
        """
        target_name = self._get_reduction_tensor_name(tensor_name,
                                                      reduction_name, abs)
        try:
            reduced = self._get_reduction_of_data(reduction_name, tensor_value,
                                                  tensor_name, abs)
            self._write_raw_tensor_simple(target_name,
                                          reduced,
                                          tensor_ref=tensor_ref)
        except ValueError as e:
            self.logger.warning(
                f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}"
            )

    def _write_reductions(self,
                          tensor_name,
                          tensor_value,
                          save_collections,
                          tensor_ref=None):
        """Write every reduction configured by the given collections.

        Collections named in SCALAR_COLLECTIONS are skipped. Each
        (reduction, abs) pair is written at most once even when several
        collections request it.
        """
        already_written = set()

        def write_missing(reduction_lists, use_abs):
            # Write each listed reduction not yet written with this abs flag.
            for names in reduction_lists:
                for reduction in names:
                    key = (reduction, use_abs)
                    if key in already_written:
                        continue
                    self._write_reduction(tensor_name,
                                          tensor_value,
                                          reduction,
                                          abs=use_abs,
                                          tensor_ref=tensor_ref)
                    already_written.add(key)

        for coll in save_collections:
            if coll.name in SCALAR_COLLECTIONS:
                continue
            config = coll.reduction_config
            write_missing((config.reductions, config.norms), False)
            write_missing((config.abs_reductions, config.abs_norms), True)

    def _write_scalar_summary(self, tensor_name, tensor_value, save_colls):
        """ Maybe write to TensorBoard.

        A summary is written only when a tensorboard writer is available,
        one of `save_colls` is named in SCALAR_COLLECTIONS, the hook is not
        in dry-run mode and the value squeezes to zero dimensions.
        """
        tb_writer = self._maybe_get_tb_writer()
        if not tb_writer:
            return
        if not any(coll.name in SCALAR_COLLECTIONS for coll in save_colls):
            return

        np_val = self._make_numpy_array(tensor_value)
        if self.dry_run:
            return

        if np_val.squeeze().ndim != 0:
            self.logger.debug(
                f"Value of {tensor_name} is not scalar, "
                f"so scalar summary could not be created")
            return

        self.logger.debug(
            f"Saving scalar summary {tensor_name} for global step {self.step}"
        )
        tb_writer.write_scalar_summary(tensor_name, np_val, self.step)

    def _write_histogram_summary(self, tensor_name, tensor_value,
                                 save_collections):
        """ Maybe write to TensorBoard.

        The histogram is written once, named after the first collection in
        `save_collections` that is not in NON_HISTOGRAM_COLLECTIONS and has
        `save_histogram` set to True. Nothing is written when no
        tensorboard writer is available, in dry-run mode, for boolean
        arrays, or for arrays that occupy zero bytes.
        """
        tb_writer = self._maybe_get_tb_writer()
        if tb_writer:
            for s_col in save_collections:
                if s_col.name in NON_HISTOGRAM_COLLECTIONS:
                    continue
                elif s_col.save_histogram is True:
                    np_value = self._make_numpy_array(tensor_value)
                    # np.bool_ is NumPy's boolean scalar type. The former
                    # `np.bool` alias was deprecated in NumPy 1.20 and
                    # removed in 1.24, where accessing it raises
                    # AttributeError.
                    if self.dry_run or np_value.dtype == np.bool_ or np_value.nbytes == 0:
                        return

                    hist_name = f"{s_col.name}/{tensor_name}"
                    self.logger.debug(
                        f"Saving {hist_name} for global step {self.step}")
                    tb_writer.write_histogram_summary(tdata=np_value,
                                                      tname=hist_name,
                                                      global_step=self.step)
                    break

    @error_handling_agent.catch_smdebug_errors()
    def record_trace_events(self,
                            timestamp,
                            training_phase="",
                            op_name="",
                            phase="X",
                            duration=1,
                            **kwargs):
        """
        Forward a trace event to the timeline writer.
        :param timestamp: start_time for the event (in seconds)
        :param training_phase: strings like, data_iterating, forward, backward, operations etc
        :param op_name: more details about phase like whether dataset or iterator
        :param phase: this is defaulted to 'X'
        :param duration: any duration manually computed (in seconds)
        :param kwargs: can be process id and thread id
        """
        self.timeline_writer.write_trace_events(
            timestamp=timestamp,
            training_phase=training_phase,
            op_name=op_name,
            phase=phase,
            duration=duration,
            **kwargs,
        )

    def _write_scalars(self):
        """
        This function writes all the scalar values saved in the scalar_cache to file
        and then empties the cache.
        If sm_metric is set to True for a scalar and a metrics writer exists, that
        scalar is also logged through the metrics writer. Scalars flagged for
        tensorboard or for the event file are written there as well.
        """
        if self._is_not_supported():
            # Do not log scalars if smdebug hook is not supported
            # Like when TFDistributionStrategy.UNSUPPORTED
            self.scalar_cache = []
            return

        for cached in self.scalar_cache:
            name, value, mode = cached.name, cached.value, cached.mode
            is_sm_metric = cached.sm_metric
            to_tensorboard, to_event_file = cached.write_tb, cached.write_event
            stamp = cached.timestamp

            if self.metrics_writer and is_sm_metric:
                self.metrics_writer.log_metric(
                    name + "_" + mode.name,
                    value,
                    timestamp=stamp,
                    iteration_number=self.mode_steps[mode],
                )

            if to_tensorboard:
                tb_writer = self._maybe_get_tb_writer()
                if tb_writer:
                    tb_writer.write_scalar_summary(name,
                                                   value,
                                                   self.step,
                                                   timestamp=stamp)

            if to_event_file:
                # Make sure a main writer exists without replacing one.
                self._initialize_writers(only_initialize_if_missing=True)
                self._write_raw_tensor_simple(name, value, timestamp=stamp)

        self.scalar_cache = []

    # Fix step number for saving scalar and tensor
    def save_scalar(self,
                    name,
                    value,
                    sm_metric=False,
                    timestamp: float = None):
        """
        Queue a scalar (a metric or any other single value) for logging.

        The scalar is not written here; it is appended to
        ``self.scalar_cache`` as a ``ScalarCache`` entry flagged for both
        TensorBoard and event-file output.
        :param name: Name of the scalar. ``CallbackHook.SCALAR_PREFIX`` is
        prepended to it
        :param value: Scalar value; after conversion to a numpy array it must
        hold exactly one element
        :param sm_metric: True/False. If set to True, the scalar value will be written to
        SageMaker
        :param timestamp: optional timestamp stored with the cached scalar
        :raises TypeError: if the converted value does not have exactly one element
        """
        name = CallbackHook.SCALAR_PREFIX + name
        as_array = self._make_numpy_array(value)
        if as_array.size != 1:
            raise TypeError(
                f"{name} has non scalar value of type: {type(value)}")
        self.scalar_cache.append(
            ScalarCache(
                name,
                as_array,
                self.mode,
                sm_metric,
                write_tb=True,
                write_event=True,
                timestamp=timestamp,
            ))

    def _write_raw_tensor(self,
                          tensor_name,
                          tensor_value,
                          save_collections,
                          tensor_ref=None):
        for s_col in save_collections:
            reduction_config = s_col.reduction_config
            if reduction_config.save_raw_tensor is True:
                self._write_raw_tensor_simple(tensor_name,
                                              tensor_value,
                                              tensor_ref=tensor_ref)
                break

    def _write_shape(self,
                     tensor_name,
                     tensor_value,
                     save_collections,
                     tensor_ref=None):
        writers = self._get_writers(tensor_name, tensor_ref=tensor_ref)
        for s_col in save_collections:
            reduction_config = s_col.reduction_config
            if self.dry_run is False and reduction_config.save_shape is True:
                numpy_tensor_value = self._make_numpy_array(tensor_value)
                this_size, this_shape = size_and_shape(numpy_tensor_value)
                # In TF Keras and Variables in all interfaces of TF, sometimes we output tensors with
                # more meaningful names than the origina name. Outputting
                # both Smdebug given name and original name in such cases
                if tensor_ref is not None and tensor_ref.tf_obj is not None:
                    original_name = tensor_ref.tf_obj.name
                else:
                    original_name = None

                for writer in writers:
                    writer.write_shape(
                        tensor_name,
                        this_shape,
                        self.mode,
                        self.mode_steps[self.mode],
                        original_name=original_name,
                    )
                break

    def _write_raw_tensor_simple(self,
                                 tensor_name,
                                 tensor_value,
                                 tensor_ref=None,
                                 timestamp=None):
        """
        Write the tensor value through every writer returned for it.

        The value is converted to a numpy array first. Nothing is written
        when ``self.dry_run`` is not False or the array has no elements.
        :param tensor_name: name under which the tensor is written
        :param tensor_value: value of the tensor
        :param tensor_ref: optional reference, used by TF
        :param timestamp: optional timestamp forwarded to the writers
        """
        # todo: if fp16, check perf of saving as fp16 in proto vs as fp32
        as_numpy = self._make_numpy_array(tensor_value)
        element_count, _ = size_and_shape(as_numpy)
        if self.dry_run is not False or not element_count > 0:
            return

        for tensor_writer in self._get_writers(tensor_name,
                                               tensor_ref=tensor_ref):
            tensor_writer.write_tensor(
                tdata=as_numpy,
                tname=tensor_name,
                mode=self.mode,
                mode_step=self.mode_steps[self.mode],
                timestamp=timestamp,
            )

    def _save_for_tensor(self,
                         tensor_name,
                         tensor_value,
                         check_before_write=True):
        """
        Identifies if this tensor should be saved for this step
        based on the save configs for the collections it belongs to.
        If this tensor is to be saved, calls write_for_tensor.

        This check can be disabled by passing check_before_write=False.
        Disabling this check is cleaner for TF, as for TF this method is never
        called if tensor should not be saved for this step.
        :param tensor_name: str
        The name of tensor. In TensorFlow's case, this is graph name of tensor
        and will be converted to internal name in write_for_tensor.
        :param tensor_value: dtype is tensor class of corresponding framework
            value of the tensor to be saved
        :param check_before_write: bool
            checks whether to save tensor for this step
        :return:
        """
        save_collections = self._get_collections_with_tensor(tensor_name)
        save_collections_for_tensor = save_collections.intersection(
            self._get_collections_to_save_for_step())
        if check_before_write and bool(save_collections_for_tensor) is False:
            return
        elif not check_before_write:
            # if not checking before write, means we want to write
            # regardless of whether the collection should be written for step
            save_collections_for_tensor = save_collections

        self._write_for_tensor(tensor_name, tensor_value,
                               save_collections_for_tensor)
        for s_col in save_collections_for_tensor:
            if s_col.name in SM_METRIC_COLLECTIONS:
                np_val = self._make_numpy_array(tensor_value)
                # Always log loss to SageMaker
                tensor_val = np.mean(np_val)
                scalar_obj = ScalarCache(
                    tensor_name,
                    tensor_val,
                    self.mode,
                    sm_metric=True,
                    write_tb=False,
                    write_event=False,
                )
                self.scalar_cache.append(scalar_obj)

    def _log_save(self, tensor_name, save_collections):
        """
        Emit a debug message describing a tensor save.

        The message names the tensor, the collection(s) it is saved from and
        the step: the mode-specific step and mode name, or the overall step
        when the current mode is ``ModeKeys.GLOBAL``.
        :param tensor_name: name of the tensor being saved
        :param save_collections: collections the tensor is saved from
        """
        names = ", ".join(collection.name for collection in save_collections)
        noun = "collections" if len(save_collections) > 1 else "collection"
        where = (
            f"for step {self.mode_steps[self.mode]} of mode {self.mode.name}"
            if self.mode != ModeKeys.GLOBAL else f"for step: {self.step}")
        self.logger.debug(f"Saving {tensor_name} from {noun} {names} {where}")

    def _write_for_tensor(self,
                          tensor_name,
                          tensor_value,
                          save_collections,
                          tensor_ref=None):
        """
        Write all data that we might want to for this tensor
        :param tensor_name: name of tensor
        :param tensor_value: value (could be in framework tensor dtype)
        :param save_collections: list of collections which are being saved for this step
        """
        self._log_save(tensor_name, save_collections)

        self._write_shape(tensor_name,
                          tensor_value,
                          save_collections,
                          tensor_ref=tensor_ref)

        # write reductions defined for collections this tensor may be part of
        self._write_reductions(tensor_name,
                               tensor_value,
                               save_collections,
                               tensor_ref=tensor_ref)

        # write histogram for this tensor if any collection this tensor
        # is part of has save_histogram as True
        self._write_histogram_summary(tensor_name, tensor_value,
                                      save_collections)

        # write raw tensor if save_raw_tensor in reduction config is True
        self._write_raw_tensor(tensor_name,
                               tensor_value,
                               save_collections,
                               tensor_ref=tensor_ref)

        # writes scalar summary if this value is a scalar (or 1x1 array)
        self._write_scalar_summary(tensor_name, tensor_value, save_collections)

    @staticmethod
    @abstractmethod
    def _get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs):
        """
        Returns the reduction of given tensor.

        Declared as an abstract static method with no implementation here;
        the actual computation has to be supplied by an implementation
        elsewhere.
        NOTE: the ``abs`` parameter shadows the builtin ``abs`` inside
        implementations of this method.
        :param reduction_name: str
            type of reduction
        :param tensor_value: tensor_data_type
            reduction to be performed on this original tensor value
        :param tensor_name: str
            name of original tensor
        :param abs: bool
            whether to take absolute value of tensor before performing reduction
        :return: the reduced value (its type is not specified here)
        """

    @staticmethod
    @abstractmethod
    def _make_numpy_array(tensor_value):
        """
        Convert the tensor value into a numpy array.

        Declared as an abstract static method with no implementation here;
        the conversion has to be supplied by an implementation elsewhere.
        :param tensor_value: mx.nd.NDArray, torch.Tensor, etc
        :return: numpy ndarray
        """

    def get_collection(self, name, create=True):
        """
        Fetch the collection called ``name`` from the collection manager.

        :param name: name of the collection
        :param create: forwarded to the manager's ``get`` as ``create``
        :return: whatever ``self.collection_manager.get`` returns
        """
        manager = self.collection_manager
        return manager.get(name, create=create)

    def get_collections(self):
        """
        Return the collections held by the collection manager.

        :return: whatever ``self.collection_manager.get_collections`` returns
        """
        manager = self.collection_manager
        return manager.get_collections()

    def add_collection(self, collection):
        """
        Register a collection with the collection manager.

        :param collection: the ``Collection`` instance to add
        :raises TypeError: if ``collection`` is not a ``Collection`` instance
        """
        if isinstance(collection, Collection):
            self.collection_manager.add(collection)
            return
        raise TypeError(
            f"collection must be an instance of Collection class. "
            f"value of type {collection.__class__} is not supported")
Example #2
0
    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        profiler_config_parser: ProfilerConfigParser,
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys,
                                                     SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the
        training process. This takes the form appropriate for the framework
        such as tf.train.SessionRunHook for TF, Callback for keras...

        ...

        Attributes
        ----------
        collection_manager: CollectionManager object
            manager holding the collections; the `default` and `all`
            collections are looked up on it when include_regex / save_all
            are used.
        default_include_collections: list of str
            collection names that are always added to include_collections.
        profiler_config_parser: ProfilerConfigParser object
            profiler configuration to use. Its load_config() is called here
            and it is handed to the timeline writer.
        init_step: int
            initial value of the step counter and of the GLOBAL mode step.
        out_dir : str
            represents a path into which outputs will be written to. The hook raises error if the 'out_dir' already
            exists. The implementation does not support merging the tensors generated in current job with tensors
            from previous job. Hence, ensure that the 'out_dir' does not exist.
        export_tensorboard: bool
        tensorboard_dir: str
            both are passed, together with out_dir, to get_tensorboard_dir to
            decide where tensorboard summaries go. If that yields None, no
            tensorboard summaries are exported.
        dry_run : bool
            when dry run is set, behavior is only described in the log file.
            tensors are not actually saved.
        save_config: SaveConfig object
            Takes save config object which is applied as default for all included tensors.
            A collection can optionally have its own saveconfig object
            which overrides this for its tensors.

        reduction_config: ReductionConfig object
            if passed, this reduction config object is used
            as default for all tensors included.
            A collection can have its own reduction config object
            which overrides this for its tensors. if this is not passed,
            tensor is saved in full.

        include_regex: list of str
            takes as input the list of string representing regular expressions. Tensors whose names match
            these regular expressions will be saved. These tensors will be available as part of the `default`
            collection.

        include_collections: list of str representing collection names
            takes as input the collections which should be saved.
            if this is None, only default_include_collections are used.

        save_all: bool
            a shortcut for saving all tensors in the model.
            they are all saved in the collection `all`
        include_workers: str
            "all" makes the hook save data from all workers; any other
            value leaves save_all_workers False.
        """
        error_handling_agent.set_hook(
            self)  # This should be the first line in the constructor.
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )

        self.dry_run = dry_run
        self.worker = None
        # when smdebug is used during an unsupported dist training process
        # we write data only from the process that has self.first_process set to True.
        self.first_process = None
        self.save_all_workers = include_workers == "all"
        self.chief_worker = DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        # The default collections are always included, without duplicates.
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections)))

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            # Without an explicit reduction config, tensors are saved in full.
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.init_step = init_step

        # The written_tensor_name_for_step dictionary stores
        # the names of each tensor saved for every step.
        # This is to detect name clashes.
        # If a name clash is detected, it is avoided by appending
        # an index to the tensor name.
        self.written_tensor_name_for_step = defaultdict(int)

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(
                CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (CollectionKeys.DEFAULT not in self.include_collections and
                collection_manager.get(CollectionKeys.DEFAULT).include_regex):
            # Logger.warn is a deprecated alias; use warning().
            self.logger.warning("The `default` collection was not passed to "
                                "include_collections. So it is not being saved")

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        self.profiler_config_parser = profiler_config_parser
        self.profiler_config_parser.load_config()

        self.timeline_writer = TimelineFileWriter(
            profiler_config_parser=profiler_config_parser)
        self.hvd_reader = None
        self.is_smdataparallel_profiling = False

        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info(f"Saving to {self.out_dir}")
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()
        self.custom_tensors_to_save = {}
Example #3
0
    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys, SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the
        training process. This takes the form appropriate for the framework
        such as tf.train.SessionRunHook for TF, Callback for keras...

        ...

        Attributes
        ----------
        collection_manager: CollectionManager object
            manager holding the collections; the `default` and `all`
            collections are looked up on it when include_regex / save_all
            are used, and the number of workers is set on it here.
        default_include_collections: list of str
            collection names that are always added to include_collections.
        init_step: int
            initial value of the step counter and of the GLOBAL mode step.
        out_dir : str
            represents a path into which outputs will be written to
        export_tensorboard: bool
        tensorboard_dir: str
            both are passed, together with out_dir, to get_tensorboard_dir to
            decide where tensorboard summaries go. If that yields None, no
            tensorboard summaries are exported.
        dry_run : bool
            when dry run is set, behavior is only described in the log file.
            tensors are not actually saved.
        save_config: SaveConfig object
            Takes save config object which is applied as default for all included tensors.
            A collection can optionally have its own saveconfig object
            which overrides this for its tensors.

        reduction_config: ReductionConfig object
            if passed, this reduction config object is used
            as default for all tensors included.
            A collection can have its own reduction config object
            which overrides this for its tensors. if this is not passed,
            tensor is saved in full.

        include_regex: list of str
            takes as input the list of string representing regular expressions. Tensors whose names match
            these regular expressions will be saved. These tensors will be available as part of the `default`
            collection.

        include_collections: list of str representing collection names
            takes as input the collections which should be saved.
            if this is None, only default_include_collections are used.

        save_all: bool
            a shortcut for saving all tensors in the model.
            they are all saved in the collection `all`
        include_workers: str
            "all" makes the hook save data from all workers; any other
            value leaves save_all_workers False.
        """
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )

        self.dry_run = dry_run
        self.worker = None
        self.save_all_workers = include_workers == "all"
        self.chief_worker = CONFIG_DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        # The default collections are always included, without duplicates.
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections))
        )

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            # Without an explicit reduction config, tensors are saved in full.
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.collection_manager.set_num_workers(self._get_num_workers())
        self.init_step = init_step

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (
            CollectionKeys.DEFAULT not in self.include_collections
            and collection_manager.get(CollectionKeys.DEFAULT).include_regex
        ):
            # Logger.warn is a deprecated alias; use warning().
            self.logger.warning(
                "The `default` collection was not passed to "
                "include_collections. So it is not being saved"
            )

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info(f"Saving to {self.out_dir}")
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()