Example #1
0
def get_temp_path(file_path):
    """Return the temporary path that corresponds to ``file_path``.

    When running as a SageMaker job the temp path is ``file_path`` with
    ``SAGEMAKER_TEMP_PATH_SUFFIX`` appended. Otherwise ``file_path`` is
    placed underneath ``NON_SAGEMAKER_TEMP_PATH_PREFIX``.

    Args:
        file_path: path of the final file, as a string.

    Returns:
        The temp path string.
    """
    if is_sagemaker_job():
        return file_path + SAGEMAKER_TEMP_PATH_SUFFIX

    # os.path.join discards the prefix when the second component is absolute,
    # so one leading "/" is dropped before joining.
    if file_path.startswith("/"):
        file_path = file_path[1:]
    return os.path.join(NON_SAGEMAKER_TEMP_PATH_PREFIX, file_path)
Example #2
0
def _strip_suffix(name, suffix):
    """Return ``name`` with a trailing ``suffix`` removed, or unchanged if absent."""
    if name.endswith(suffix):
        return name[: len(name) - len(suffix)]
    return name


def check_metrics_file(save_steps, saved_scalars=None):
    """
    Check the SageMaker metrics file to ensure that all the scalars saved using
    save_scalar(sm_metrics=True) or mentioned through SM_METRICS collections, have been saved.

    Does nothing outside a SageMaker job, or when the metrics directory
    environment variable is not set (a warning is logged in that case).

    Args:
        save_steps: mapping with "TRAIN" and "EVAL" keys, each holding the
            step numbers at which metrics are expected. A key is only read
            when at least one metric of that mode was found in the file.
        saved_scalars: optional iterable of scalar names; when given (and
            non-empty) at least one of them must appear in the metrics file.

    Raises:
        AssertionError: if no metric was logged, an iteration number is -1,
            none of ``saved_scalars`` was found, or a TRAIN/EVAL metric is
            missing one of the expected steps.
    """
    if not is_sagemaker_job():
        return
    metrics_dir = os.environ.get(DEFAULT_SAGEMAKER_METRICS_PATH)
    if not metrics_dir:
        logging.warning("SageMaker Metric Directory not specified")
        return

    import collections

    # need this to seek to the right file offset for test output verification
    # NOTE(review): this offset is a local that always starts at 0, so the
    # whole file is read on every call - confirm whether it was meant to
    # persist between calls.
    metrics_file_position = 0
    file_name = "{}/{}.json".format(metrics_dir, str(os.getpid()))
    scalarnames = set()
    train_metric = collections.defaultdict(list)
    eval_metric = collections.defaultdict(list)

    with open(file_name) as fp:
        # since SM metrics expects all metrics to be written in 1 file, seeking to
        # the right offset for the purpose of this test - so that the metrics logged in
        # the corresponding test are verified
        fp.seek(metrics_file_position)
        for line in fp:
            data = json.loads(line)
            # iteration number should not be -1
            assert data["IterationNumber"] != -1
            metric_name = data["MetricName"]
            # str.rstrip() strips a *set of characters*, not a suffix, and
            # would mangle names such as "GAIN_TRAIN"; strip the exact
            # mode suffix instead.
            if "TRAIN" in metric_name:
                train_metric[metric_name].append(data["IterationNumber"])
                scalarnames.add(_strip_suffix(metric_name, "_TRAIN"))
            elif "EVAL" in metric_name:
                eval_metric[metric_name].append(data["IterationNumber"])
                scalarnames.add(_strip_suffix(metric_name, "_EVAL"))
            else:
                # check the scalar saved using save_scalar()
                scalarnames.add(_strip_suffix(metric_name, "_GLOBAL"))
        metrics_file_position = fp.tell()
    assert scalarnames

    if saved_scalars:
        assert len(set(saved_scalars) & set(scalarnames)) > 0

    # check if all metrics have been written at the expected step number
    for train_data in train_metric:
        assert len(
            set(save_steps["TRAIN"])
            & set(train_metric[train_data])) == len(save_steps["TRAIN"])
    for eval_data in eval_metric:
        assert len(
            set(save_steps["EVAL"])
            & set(eval_metric[eval_data])) == len(save_steps["EVAL"])
Example #3
0
def create_hook_from_json_config(hook_cls,
                                 json_config_path,
                                 default_values=None):
    """Build a ``hook_cls`` instance from a JSON configuration.

    The JSON config is loaded through ``get_json_config_as_dict`` and reduced
    to hook parameters by ``collect_hook_config_params``; those parameters are
    passed to ``hook_cls`` as keyword arguments, and the configured
    collections are then added to the new hook's collection manager.

    If json_config_path is None, an environment variable must be set.

    Args:
        hook_cls: hook class (TF, PT or MXNet) to instantiate.
        json_config_path: path handed to ``get_json_config_as_dict``.
        default_values: forwarded to ``SaveConfig.from_dict``.

    Returns:
        The constructed hook.
    """
    config = get_json_config_as_dict(json_config_path=json_config_path)
    params = collect_hook_config_params(config)

    save_cfg = SaveConfig.from_dict(params.get("save_config_modes"),
                                    default_values)
    collections_to_include = get_include_collections(config)

    if is_sagemaker_job():
        # On SageMaker, TensorBoard output is emitted only when the JSON
        # config supplies a directory for it.
        tb_dir = get_tensorboard_dir_from_json_config()
        tb_enabled = tb_dir is not None
    else:
        # Elsewhere both settings come straight from the hook parameters.
        tb_dir = params[TENSORBOARD_DIR_KEY]
        tb_enabled = params[EXPORT_TENSORBOARD_KEY]

    new_hook = hook_cls(
        out_dir=params.get("out_dir"),
        export_tensorboard=tb_enabled,
        tensorboard_dir=tb_dir,
        dry_run=params.get("dry_run", False),
        reduction_config=params.get(CONFIG_RDN_CFG_KEY),
        save_config=save_cfg,
        include_regex=params.get(CONFIG_INCLUDE_REGEX_KEY),
        include_collections=collections_to_include,
        include_workers=params.get(CONFIG_INCLUDE_WORKERS_KEY, "one"),
        save_all=params.get(CONFIG_SAVE_ALL_KEY, False),
    )
    add_collections_to_manager(new_hook.collection_manager, config, params)
    return new_hook
Example #4
0
def check_metrics_file(saved_scalars):
    """
    Verify that the SageMaker metrics file of this process contains scalars
    saved using save_scalar(sm_metrics=True) or mentioned through SM_METRICS
    collections.

    Nothing is checked outside a SageMaker job, or when the metrics directory
    environment variable is unset (a warning is logged in that case).
    Otherwise the file must hold at least one metric, and at least one of the
    names in ``saved_scalars`` must be among the logged metric names.
    """
    if not is_sagemaker_job():
        return

    metrics_dir = os.environ.get(DEFAULT_SAGEMAKER_METRICS_PATH)
    if not metrics_dir:
        logging.warning("SageMaker Metric Directory not specified")
        return

    # One JSON record per line, in a file named after the current pid.
    metrics_path = "{}/{}.json".format(metrics_dir, str(os.getpid()))
    with open(metrics_path) as metrics_file:
        logged_names = {
            json.loads(record)["MetricName"] for record in metrics_file
        }

    assert logged_names
    assert set(saved_scalars) & logged_names
Example #5
0
def training_has_ended(trial_prefix):
    # Emit the end of training file only if the job is not running under SageMaker.
    if is_sagemaker_job():
        logger.info(
            f"The end of training job file will not be written for jobs running under SageMaker."
        )
        return
    try:
        check_dir_exists(trial_prefix)
        # if path does not exist, then we don't need to write a file
    except RuntimeError:
        # dir exists
        pass
    file_path = os.path.join(trial_prefix, END_OF_JOB_FILENAME)
    s3, bucket_name, key_name = is_s3(file_path)
    if s3:
        writer = TSAccessS3(bucket_name, key_name, binary=False)
    else:
        writer = TSAccessFile(file_path, "a+")
    writer.flush()
    try:
        writer.close()
    except OSError:
        """
Example #6
0
    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        profiler_config_parser: ProfilerConfigParser,
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys,
                                                     SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the
        training process. This takes the form appropriate for the framework
        such as tf.train.SessionRunHook for TF, Callback for keras...

        ...

        Parameters
        ----------
        collection_manager : CollectionManager
            manager of the collections. Its `default` and `all` collections
            are updated here when `include_regex` / `save_all` are used.
        default_include_collections : list of str
            collection names that are always merged into `include_collections`.
        profiler_config_parser : ProfilerConfigParser object
            profiler configuration to use. `load_config()` is called on it
            here and it is passed on to the timeline writer.
        init_step : int
            initial value of the step counter and of the GLOBAL mode step.
        out_dir : str
            represents a path into which outputs will be written to. The hook raises error if the 'out_dir' already
            exists. The implementation does not support merging the tensors generated in current job with tensors
            from previous job. Hence, ensure that the 'out_dir' does not exist.
        export_tensorboard : bool
        tensorboard_dir : str
            both are passed, together with the resolved `out_dir`, to
            `get_tensorboard_dir` to determine `self.tensorboard_dir`. When
            that is None, an info message is logged saying that no
            tensorboard summaries will be exported.
        dry_run : bool
            when dry run is set, behavior is only described in the log file.
            tensors are not actually saved.
        save_config: SaveConfig object
            Takes save config object which is applied as default for all included tensors.
            A collection can optionally have its own saveconfig object
            which overrides this for its tensors.

        reduction_config: ReductionConfig object
            if passed, this reduction config object is used
            as default for all tensors included.
            A collection can have its own configuration
            which overrides this for its tensors. if this is not passed,
            ReductionConfig(save_raw_tensor=True) is used, i.e. the
            tensor is saved in full.

        include_regex: list of str
            takes as input the list of string representing regular expressions. Tensors whose names match
            these regular expressions will be saved. These tensors will be available as part of the `default`
            collection, which is then added to the included collections.

        include_collections: list of str representing collection names
            takes as input the collections which should be saved.
            The names in `default_include_collections` are always added;
            if this is None, only those defaults are used.

        save_all: bool
            a shortcut for saving all tensors in the model.
            they are all saved in the collection `all`
        include_workers: str
            "all" makes the hook save data from all workers; any other
            value leaves `save_all_workers` False.
        """
        error_handling_agent.set_hook(
            self)  # This should be the first line in the constructor.
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )

        self.dry_run = dry_run
        self.worker = None
        # when smdebug is used during an unsupported dist training process
        # we write data only from the process that has self.first_process set to True.
        self.first_process = None
        self.save_all_workers = include_workers == "all"
        self.chief_worker = DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        # The defaults are always included; going through a set removes duplicates.
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections)))

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.init_step = init_step

        # The written_tensor_name_for_step dictionary stores
        # the names of each tensor saved for every step.
        # This is to detect name clashes.
        # If a name clash is detected, it is avoided by appending
        # an index to the tensor name.
        self.written_tensor_name_for_step = defaultdict(int)

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(
                CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (CollectionKeys.DEFAULT not in self.include_collections and
                collection_manager.get(CollectionKeys.DEFAULT).include_regex):
            # Logger.warn is a deprecated alias; use Logger.warning.
            self.logger.warning("The `default` collection was not passed to "
                                "include_collections. So it is not being saved")

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        self.profiler_config_parser = profiler_config_parser
        self.profiler_config_parser.load_config()

        self.timeline_writer = TimelineFileWriter(
            profiler_config_parser=profiler_config_parser)
        self.hvd_reader = None
        self.is_smdataparallel_profiling = False

        # A metrics writer is only created under SageMaker, and only when
        # SageMakerFileMetricsWriter is not None.
        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info("Saving to {}".format(self.out_dir))
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()
        self.custom_tensors_to_save = dict()
Example #7
0
    def __init__(
        self,
        collection_manager: CollectionManager,
        default_include_collections: List[str],
        init_step: int = 0,
        out_dir: Optional[str] = None,
        export_tensorboard: bool = False,
        tensorboard_dir: Optional[str] = None,
        dry_run: bool = False,
        reduction_config: Optional[ReductionConfig] = None,
        save_config: Optional[Union[SaveConfig, Dict[ModeKeys, SaveConfigMode]]] = None,
        include_regex: Optional[List[str]] = None,
        include_collections: Optional[List[str]] = None,
        save_all: bool = False,
        include_workers: str = "one",
    ):
        """
        A class used to represent the hook which gets attached to the
        training process. This takes the form appropriate for the framework
        such as tf.train.SessionRunHook for TF, Callback for keras...

        ...

        Parameters
        ----------
        collection_manager : CollectionManager
            manager of the collections. Its number of workers is set here
            from `self._get_num_workers()`, and its `default` and `all`
            collections are updated when `include_regex` / `save_all` are used.
        default_include_collections : list of str
            collection names that are always merged into `include_collections`.
        init_step : int
            initial value of the step counter and of the GLOBAL mode step.
        out_dir : str
            represents a path into which outputs will be written to
        export_tensorboard : bool
        tensorboard_dir : str
            both are passed, together with the resolved `out_dir`, to
            `get_tensorboard_dir` to determine `self.tensorboard_dir`. When
            that is None, an info message is logged saying that no
            tensorboard summaries will be exported.
        dry_run : bool
            when dry run is set, behavior is only described in the log file.
            tensors are not actually saved.
        save_config: SaveConfig object
            Takes save config object which is applied as default for all included tensors.
            A collection can optionally have its own saveconfig object
            which overrides this for its tensors.

        reduction_config: ReductionConfig object
            if passed, this reduction config object is used
            as default for all tensors included.
            A collection can have its own configuration
            which overrides this for its tensors. if this is not passed,
            ReductionConfig(save_raw_tensor=True) is used, i.e. the
            tensor is saved in full.

        include_regex: list of str
            takes as input the list of string representing regular expressions. Tensors whose names match
            these regular expressions will be saved. These tensors will be available as part of the `default`
            collection, which is then added to the included collections.

        include_collections: list of str representing collection names
            takes as input the collections which should be saved.
            The names in `default_include_collections` are always added;
            if this is None, only those defaults are used.

        save_all: bool
            a shortcut for saving all tensors in the model.
            they are all saved in the collection `all`
        include_workers: str
            "all" makes the hook save data from all workers; any other
            value leaves `save_all_workers` False.
        """
        self.out_dir = verify_and_get_out_dir(out_dir)
        self.tensorboard_dir = get_tensorboard_dir(
            export_tensorboard=export_tensorboard,
            tensorboard_dir=tensorboard_dir,
            out_dir=self.out_dir,
        )

        self.dry_run = dry_run
        self.worker = None
        self.save_all_workers = include_workers == "all"
        self.chief_worker = CONFIG_DEFAULT_WORKER_NAME

        if include_collections is None:
            include_collections = default_include_collections
        else:
            include_collections = flatten(include_collections)
        # The defaults are always included; going through a set removes duplicates.
        self.include_collections = list(
            set(include_collections).union(set(default_include_collections))
        )

        self.save_all = save_all
        self.save_config = SaveConfig.parse(save_config)
        if reduction_config is None:
            reduction_config = ReductionConfig(save_raw_tensor=True)
        self.reduction_config = reduction_config
        self.include_regex = include_regex
        self.collection_manager = collection_manager
        self.collection_manager.set_num_workers(self._get_num_workers())
        self.init_step = init_step

        self.logger = logger

        if self.tensorboard_dir is None:
            self.logger.info(
                "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
            )

        if include_regex is not None:
            collection_manager.get(CollectionKeys.DEFAULT).include(include_regex)
            if CollectionKeys.DEFAULT not in self.include_collections:
                self.include_collections.append(CollectionKeys.DEFAULT)

        if self.save_all:
            collection_manager.get(CollectionKeys.ALL).include(".*")
            if CollectionKeys.ALL not in self.include_collections:
                self.include_collections.append(CollectionKeys.ALL)

        if (
            CollectionKeys.DEFAULT not in self.include_collections
            and collection_manager.get(CollectionKeys.DEFAULT).include_regex
        ):
            # Logger.warn is a deprecated alias; use Logger.warning.
            self.logger.warning(
                "The `default` collection was not passed to "
                "include_collections. So it is not being saved"
            )

        self._collections_to_save = set()
        self._collections_to_save_for_step = None
        self.prepared_collections = False
        self.tensor_to_collections = {}

        self.step = init_step
        self.last_saved_step = None
        self.mode = ModeKeys.GLOBAL
        self.mode_steps = {ModeKeys.GLOBAL: init_step}
        self.writer = None

        # A metrics writer is only created under SageMaker, and only when
        # SageMakerFileMetricsWriter is not None.
        if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
            self.metrics_writer = SageMakerFileMetricsWriter()
        else:
            self.metrics_writer = None

        # Maps ModeKeys to FileWriter objects
        self.tb_writers = {}

        # Cache scalars that are being saved through save_scalar() calls
        self.scalar_cache = []

        self.logger.info("Saving to {}".format(self.out_dir))
        atexit.register(self._cleanup)

        # Check if there is any last saved state. Initialize the hook based last saved state.
        self.training_run = 0
        self._initialize_to_last_saved_state()