def dummy_trial_creator(trial_dir, num_workers, job_ended):
    """Create a minimal fake trial directory for tests.

    Creates ``trial_dir`` (parents included, no error if it already exists),
    exports a freshly constructed ``CollectionManager`` once per worker as
    ``worker_<i>_collections.json`` and, when ``job_ended`` is truthy, touches
    the ``training_job_end.ts`` marker file inside ``trial_dir``.
    """
    Path(trial_dir).mkdir(parents=True, exist_ok=True)

    manager = CollectionManager()
    for worker_index in range(num_workers):
        manager.export(trial_dir, f"worker_{worker_index}_collections.json")

    if not job_ended:
        return
    end_marker = os.path.join(trial_dir, "training_job_end.ts")
    Path(end_marker).touch()
def test_collection_defaults_to_hook_config():
    """Hook-level configs must fill in what a collection left unset.

    The collection ``foo`` only sets an EVAL save config (save_interval=20),
    while the hook sets a TRAIN save config (save_interval=10) and a reduction
    config. After ``_prepare_collections`` the collection is expected to have
    picked up the hook's TRAIN save config and its reduction config.
    """
    manager = CollectionManager()
    manager.create_collection("foo")
    manager.get("foo").include_regex = "*"
    manager.get("foo").save_config = {ModeKeys.EVAL: SaveConfigMode(save_interval=20)}

    out_dir = "/tmp/test_collections/" + str(datetime.datetime.now())
    hook = Hook(
        out_dir=out_dir,
        save_config={ModeKeys.TRAIN: SaveConfigMode(save_interval=10)},
        include_collections=["foo"],
        reduction_config=ReductionConfig(save_raw_tensor=True),
    )
    hook.collection_manager = manager

    # Before preparation the collection knows nothing about the hook's settings.
    before = manager.get("foo")
    assert before.save_config.mode_save_configs[ModeKeys.TRAIN] is None
    assert manager.get("foo").reduction_config is None

    hook._prepare_collections()

    # The collection is looked up again after preparation on purpose.
    train_config = manager.get("foo").save_config.mode_save_configs[ModeKeys.TRAIN]
    assert train_config.save_interval == 10
    assert manager.get("foo").reduction_config.save_raw_tensor is True
def test_json_params():
    """Parse ``all_params.json`` and check the derived hook/collection settings."""
    config = get_json_config_as_dict(
        json_config_path="tests/core/json_configs/all_params.json"
    )
    hook_params = collect_hook_config_params(config)
    include_collections = get_include_collections(config)

    manager = CollectionManager()
    add_collections_to_manager(manager, config, hook_params)

    assert hook_params["include_workers"] == "one"
    assert hook_params["save_all"] is True

    expected_collections = ("weights", "gradients")
    for name in expected_collections:
        assert manager.get(name).save_histogram is False
    for name in expected_collections:
        assert name in include_collections
    assert len(include_collections) == 2

    # Compared by equality rather than identity.
    assert hook_params["export_tensorboard"] == True
    assert hook_params["tensorboard_dir"] == "/tmp/tensorboard"
def _read_collections(self, collection_files):
    """Load the collection manager from the first collection file.

    Only ``collection_files[0]`` is read: it is fetched through
    ``self.s3_handler``, decoded as UTF-8 and parsed with
    ``CollectionManager.load_from_string``. Sets ``self.collection_manager``
    and ``self.num_workers``.
    """
    # With a single argument os.path.join hands the path back unchanged.
    key = os.path.join(collection_files[0])
    request = ReadObjectRequest(self._get_s3_location(key))
    raw_bytes = self.s3_handler.get_objects([request])[0]
    self.collection_manager = CollectionManager.load_from_string(
        raw_bytes.decode("utf-8")
    )
    self.num_workers = self.collection_manager.get_num_workers()
def help_test_multiple_trials(num_steps=20, num_tensors=10):
    """Write a synthetic trial under S3 and return a trial object for it.

    A random UUID names the trial below ``s3://smdebug-testing/outputs/``. A
    ``default`` collection listing ``foo_0 .. foo_<num_tensors - 1>`` is
    exported, then ``generate_data`` is called once for every step in
    ``range(num_steps)``.

    Args:
        num_steps: number of steps to generate data for.
        num_tensors: number of ``foo_<i>`` tensors per step.

    Returns:
        tuple: ``(trial_obj, trial_name)`` where ``trial_obj`` is an
        ``S3Trial`` built from the bucket and prefix reported by ``is_s3``.
    """
    trial_name = str(uuid.uuid4())
    bucket = "smdebug-testing"
    path = "s3://" + os.path.join(bucket, "outputs/")

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = [f"foo_{i}" for i in range(num_tensors)]
    # One export suffices; repeating it with identical arguments would
    # presumably only rewrite the same collections file.
    c.export(path + trial_name, DEFAULT_COLLECTIONS_FILE_NAME)

    for step in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=step,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            rank=0,
        )

    # is_s3 returns three values; the last two are the bucket and the prefix.
    _, bucket, prefix = is_s3(os.path.join(path, trial_name))
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    return trial_obj, trial_name
def test_invalid_collection_config_exception():
    """A collection with nothing configured must be rejected, a valid one accepted.

    ``foo`` is first created without any configuration, so
    ``_prepare_collections`` is expected to raise
    ``InvalidCollectionConfiguration``. Once ``include_regex`` is set, the same
    call must no longer raise that exception.
    """
    manager = CollectionManager()
    manager.create_collection("foo")

    hook = Hook(
        out_dir="/tmp/test_collections/" + str(datetime.datetime.now()),
        save_config={ModeKeys.TRAIN: SaveConfigMode(save_interval=10)},
        include_collections=["foo"],
        reduction_config=ReductionConfig(save_raw_tensor=True),
    )
    hook.collection_manager = manager

    rejected = False
    try:
        hook._prepare_collections()
    except InvalidCollectionConfiguration:
        rejected = True
    assert rejected, "Invalid Collection Name did not raise error"

    manager.get("foo").include_regex = "*"
    unexpected_error = None
    try:
        hook._prepare_collections()
    except InvalidCollectionConfiguration as err:
        unexpected_error = err
    assert unexpected_error is None, "Valid Collection Name raised an error"
def generate_data(
    path,
    trial,
    step,
    tname_prefix,
    num_tensors,
    worker,
    shape,
    dtype=np.float32,
    rank=None,
    mode=None,
    mode_step=None,
    export_colls=True,
    data=None,
):
    """Write ``num_tensors`` tensors for one step of a synthetic trial.

    Tensors are named ``<tname_prefix>_<i>`` and written through a
    ``FileWriter`` rooted at ``os.path.join(path, trial)``. When ``data`` is
    ``None`` an array of ones of the given ``shape``/``dtype`` multiplied by
    ``step`` is created on first use and reused for every tensor of the step.

    When ``export_colls`` is truthy, a collections file is exported afterwards
    with a ``default`` and a ``gradients`` collection, each listing all the
    tensor names written.

    ``rank`` is accepted but not used inside this function.
    """
    trial_dir = os.path.join(path, trial)

    with FileWriter(trial_dir=trial_dir, step=step, worker=worker) as writer:
        for index in range(num_tensors):
            if data is None:
                data = np.ones(shape=shape, dtype=dtype) * step
            writer.write_tensor(
                tdata=data,
                tname=f"{tname_prefix}_{index}",
                mode=mode,
                mode_step=mode_step,
            )

    if not export_colls:
        return

    manager = CollectionManager()
    for collection_name in ("default", "gradients"):
        manager.add(collection_name)
        # Each collection receives its own freshly built list of names.
        manager.get(collection_name).tensor_names = [
            f"{tname_prefix}_{index}" for index in range(num_tensors)
        ]
    manager.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
def test_mode_data():
    """Check ``tensor_names`` filtering by step and by mode.

    Ten global steps are written: even steps store ``arr_1`` in TRAIN mode,
    odd steps store ``arr_2`` in EVAL mode, each with mode step ``s // 2``.
    The trial must then report the right tensor names overall, per global
    step, per mode, and per (mode step, mode) pair.
    """
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    # NOTE(review): the first assignment is immediately replaced by the second.
    # Possibly ["arr_1", "arr_2"] was intended -- confirm before changing.
    c.get("default").tensor_names = ["arr_1"]
    c.get("default").tensor_names = ["arr_2"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)

    trial = create_trial(trial_dir)
    worker = socket.gethostname()

    for s in range(0, 10):
        if s % 2 == 0:
            tname, mode = "arr_1", modes.TRAIN
        else:
            tname, mode = "arr_2", modes.EVAL
        # The context manager closes the writer even if write_tensor raises.
        with FileWriter(trial_dir=trial_dir, step=s, worker=worker) as fw:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname=tname,
                mode=mode,
                mode_step=s // 2,
            )

    assert trial.tensor_names() == ["arr_1", "arr_2"]
    assert trial.tensor_names(step=0) == ["arr_1"]
    assert trial.tensor_names(step=1) == ["arr_2"]
    assert trial.tensor_names(step=0, mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(step=0, mode=modes.EVAL) == ["arr_2"]
    assert trial.tensor_names(mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(mode=modes.EVAL) == ["arr_2"]
def __init__(
    self,
    collection_manager: CollectionManager,
    default_include_collections: List[str],
    profiler_config_parser: ProfilerConfigParser,
    init_step: int = 0,
    out_dir: Optional[str] = None,
    export_tensorboard: bool = False,
    tensorboard_dir: Optional[str] = None,
    dry_run: bool = False,
    reduction_config: Optional[ReductionConfig] = None,
    save_config: Optional[Union[SaveConfig, Dict[ModeKeys, SaveConfigMode]]] = None,
    include_regex: Optional[List[str]] = None,
    include_collections: Optional[List[str]] = None,
    save_all: bool = False,
    include_workers: str = "one",
):
    """
    A class used to represent the hook which gets attached to the
    training process. This takes the form appropriate for the framework
    such as tf.train.SessionRunHook for TF, Callback for keras...

    ...

    Attributes
    ----------
    collection_manager : CollectionManager
        manager holding the collections the hook works with. Its `default`
        and `all` collections are updated here when `include_regex` or
        `save_all` are given.
    default_include_collections : list of str
        collection names that are always added to `include_collections`.
    profiler_config_parser : ProfilerConfigParser object
        required. `load_config()` is called on it here and it is handed to
        the TimelineFileWriter created by the constructor.
    init_step : int
        initial value of the step counter and of the GLOBAL mode step.
    out_dir : str
        represents a path into which outputs will be written to.
        The hook raises error if the 'out_dir' already exists.
        The implementation does not support merging the tensors
        generated in current job with tensors from previous job.
        Hence, ensure that the 'out_dir' does not exist.
    export_tensorboard : bool
    tensorboard_dir : str
        both are forwarded to `get_tensorboard_dir`; when that yields None
        an info message says tensorboard summaries will not be exported.
    dry_run : bool
        when dry run is set, behavior is only described in the log file.
        tensors are not actually saved.
    save_config : SaveConfig object or dict of ModeKeys to SaveConfigMode
        parsed with `SaveConfig.parse` and applied as default for all
        included tensors. A collection can optionally have its own
        save config which overrides this for its tensors.
    reduction_config : ReductionConfig object
        if passed, this reduction config object is used as default for all
        tensors included. A collection can have its own reduction config
        which overrides this for its tensors.
        if this is not passed, `ReductionConfig(save_raw_tensor=True)` is
        used, i.e. tensors are saved in full.
    include_regex : list of str
        takes as input the list of string representing regular expressions.
        Tensors whose names match these regular expressions will be saved.
        These tensors will be available as part of the `default` collection,
        which is then added to `include_collections` if missing.
    include_collections : list of str representing collection names
        takes as input the collections which should be saved.
        if this is None, `default_include_collections` is used; otherwise
        the flattened list is merged with `default_include_collections`.
    save_all : bool
        a shortcut for saving all tensors in the model.
        they are all saved in the collection `all`
    include_workers : str
        when equal to "all" the hook saves data from all workers
        (`save_all_workers` is True); any other value leaves it False.
    """
    error_handling_agent.set_hook(self)  # This should be the first line in the constructor.
    self.out_dir = verify_and_get_out_dir(out_dir)
    self.tensorboard_dir = get_tensorboard_dir(
        export_tensorboard=export_tensorboard,
        tensorboard_dir=tensorboard_dir,
        out_dir=self.out_dir,
    )

    self.dry_run = dry_run
    self.worker = None
    # when smdebug is used during an unsupported dist training process
    # we write data only from the process that has self.first_process set to True.
    self.first_process = None
    self.save_all_workers = include_workers == "all"
    self.chief_worker = DEFAULT_WORKER_NAME

    if include_collections is None:
        include_collections = default_include_collections
    else:
        include_collections = flatten(include_collections)
    # The defaults are always part of the final list, without duplicates.
    self.include_collections = list(
        set(include_collections).union(set(default_include_collections))
    )

    self.save_all = save_all
    self.save_config = SaveConfig.parse(save_config)
    if reduction_config is None:
        reduction_config = ReductionConfig(save_raw_tensor=True)
    self.reduction_config = reduction_config
    self.include_regex = include_regex
    self.collection_manager = collection_manager
    self.init_step = init_step

    # The written_tensor_name_for_step dictionary stores
    # the names of each tensor saved for every step.
    # This is to detect name clashes.
    # If a name clash is detected, it is avoided by appending
    # an index to the tensor name.
    self.written_tensor_name_for_step = defaultdict(int)

    self.logger = logger
    if self.tensorboard_dir is None:
        self.logger.info(
            "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
        )

    if include_regex is not None:
        collection_manager.get(CollectionKeys.DEFAULT).include(include_regex)
        if CollectionKeys.DEFAULT not in self.include_collections:
            self.include_collections.append(CollectionKeys.DEFAULT)

    if self.save_all:
        collection_manager.get(CollectionKeys.ALL).include(".*")
        if CollectionKeys.ALL not in self.include_collections:
            self.include_collections.append(CollectionKeys.ALL)

    if (
        CollectionKeys.DEFAULT not in self.include_collections
        and collection_manager.get(CollectionKeys.DEFAULT).include_regex
    ):
        # Logger.warn is a deprecated alias of Logger.warning.
        self.logger.warning(
            "The `default` collection was not passed to "
            "include_collections. So it is not being saved"
        )

    self._collections_to_save = set()
    self._collections_to_save_for_step = None
    self.prepared_collections = False
    self.tensor_to_collections = {}

    self.step = init_step
    self.last_saved_step = None
    self.mode = ModeKeys.GLOBAL
    self.mode_steps = {ModeKeys.GLOBAL: init_step}
    self.writer = None

    self.profiler_config_parser = profiler_config_parser
    self.profiler_config_parser.load_config()

    self.timeline_writer = TimelineFileWriter(profiler_config_parser=profiler_config_parser)
    self.hvd_reader = None
    self.is_smdataparallel_profiling = False

    if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
        self.metrics_writer = SageMakerFileMetricsWriter()
    else:
        self.metrics_writer = None

    # Maps ModeKeys to FileWriter objects
    self.tb_writers = {}

    # Cache scalars that are being saved through save_scalar() calls
    self.scalar_cache = []

    self.logger.info(f"Saving to {self.out_dir}")
    atexit.register(self._cleanup)

    # Check if there is any last saved state. Initialize the hook based last saved state.
    self.training_run = 0
    self._initialize_to_last_saved_state()
    self.custom_tensors_to_save = dict()
def __init__(
    self,
    collection_manager: CollectionManager,
    default_include_collections: List[str],
    init_step: int = 0,
    out_dir: Optional[str] = None,
    export_tensorboard: bool = False,
    tensorboard_dir: Optional[str] = None,
    dry_run: bool = False,
    reduction_config: Optional[ReductionConfig] = None,
    save_config: Optional[Union[SaveConfig, Dict[ModeKeys, SaveConfigMode]]] = None,
    include_regex: Optional[List[str]] = None,
    include_collections: Optional[List[str]] = None,
    save_all: bool = False,
    include_workers: str = "one",
):
    """
    A class used to represent the hook which gets attached to the
    training process. This takes the form appropriate for the framework
    such as tf.train.SessionRunHook for TF, Callback for keras...

    ...

    Attributes
    ----------
    collection_manager : CollectionManager
        manager holding the collections the hook works with. Its number of
        workers is set from `self._get_num_workers()`, and its `default`
        and `all` collections are updated here when `include_regex` or
        `save_all` are given.
    default_include_collections : list of str
        collection names that are always added to `include_collections`.
    init_step : int
        initial value of the step counter and of the GLOBAL mode step.
    out_dir : str
        represents a path into which outputs will be written to
    export_tensorboard : bool
    tensorboard_dir : str
        both are forwarded to `get_tensorboard_dir`; when that yields None
        an info message says tensorboard summaries will not be exported.
    dry_run : bool
        when dry run is set, behavior is only described in the log file.
        tensors are not actually saved.
    save_config : SaveConfig object or dict of ModeKeys to SaveConfigMode
        parsed with `SaveConfig.parse` and applied as default for all
        included tensors. A collection can optionally have its own
        save config which overrides this for its tensors.
    reduction_config : ReductionConfig object
        if passed, this reduction config object is used as default for all
        tensors included. A collection can have its own reduction config
        which overrides this for its tensors.
        if this is not passed, `ReductionConfig(save_raw_tensor=True)` is
        used, i.e. tensors are saved in full.
    include_regex : list of str
        takes as input the list of string representing regular expressions.
        Tensors whose names match these regular expressions will be saved.
        These tensors will be available as part of the `default` collection,
        which is then added to `include_collections` if missing.
    include_collections : list of str representing collection names
        takes as input the collections which should be saved.
        if this is None, `default_include_collections` is used; otherwise
        the flattened list is merged with `default_include_collections`.
    save_all : bool
        a shortcut for saving all tensors in the model.
        they are all saved in the collection `all`
    include_workers : str
        when equal to "all" the hook saves data from all workers
        (`save_all_workers` is True); any other value leaves it False.
    """
    self.out_dir = verify_and_get_out_dir(out_dir)
    self.tensorboard_dir = get_tensorboard_dir(
        export_tensorboard=export_tensorboard,
        tensorboard_dir=tensorboard_dir,
        out_dir=self.out_dir,
    )

    self.dry_run = dry_run
    self.worker = None
    self.save_all_workers = include_workers == "all"
    self.chief_worker = CONFIG_DEFAULT_WORKER_NAME

    if include_collections is None:
        include_collections = default_include_collections
    else:
        include_collections = flatten(include_collections)
    # The defaults are always part of the final list, without duplicates.
    self.include_collections = list(
        set(include_collections).union(set(default_include_collections))
    )

    self.save_all = save_all
    self.save_config = SaveConfig.parse(save_config)
    if reduction_config is None:
        reduction_config = ReductionConfig(save_raw_tensor=True)
    self.reduction_config = reduction_config
    self.include_regex = include_regex
    self.collection_manager = collection_manager
    self.collection_manager.set_num_workers(self._get_num_workers())
    self.init_step = init_step

    self.logger = logger
    if self.tensorboard_dir is None:
        self.logger.info(
            "tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries."
        )

    if include_regex is not None:
        collection_manager.get(CollectionKeys.DEFAULT).include(include_regex)
        if CollectionKeys.DEFAULT not in self.include_collections:
            self.include_collections.append(CollectionKeys.DEFAULT)

    if self.save_all:
        collection_manager.get(CollectionKeys.ALL).include(".*")
        if CollectionKeys.ALL not in self.include_collections:
            self.include_collections.append(CollectionKeys.ALL)

    if (
        CollectionKeys.DEFAULT not in self.include_collections
        and collection_manager.get(CollectionKeys.DEFAULT).include_regex
    ):
        # Logger.warn is a deprecated alias of Logger.warning.
        self.logger.warning(
            "The `default` collection was not passed to "
            "include_collections. So it is not being saved"
        )

    self._collections_to_save = set()
    self._collections_to_save_for_step = None
    self.prepared_collections = False
    self.tensor_to_collections = {}

    self.step = init_step
    self.last_saved_step = None
    self.mode = ModeKeys.GLOBAL
    self.mode_steps = {ModeKeys.GLOBAL: init_step}
    self.writer = None

    if is_sagemaker_job() and SageMakerFileMetricsWriter is not None:
        self.metrics_writer = SageMakerFileMetricsWriter()
    else:
        self.metrics_writer = None

    # Maps ModeKeys to FileWriter objects
    self.tb_writers = {}

    # Cache scalars that are being saved through save_scalar() calls
    self.scalar_cache = []

    self.logger.info(f"Saving to {self.out_dir}")
    atexit.register(self._cleanup)

    # Check if there is any last saved state. Initialize the hook based last saved state.
    self.training_run = 0
    self._initialize_to_last_saved_state()
def write_dummy_collection_file(trial):
    """Export a small collections file for ``trial``.

    The exported manager holds a ``default`` collection plus one collection
    named after ``trial``; ``trial`` is also used as the export location.
    """
    manager = CollectionManager()
    manager.create_collection("default")
    trial_collection = Collection(trial)
    manager.add(trial_collection)
    manager.export(trial, DEFAULT_COLLECTIONS_FILE_NAME)
def test_manager():
    """Exercise creating, adding and looking up collections on a manager."""
    manager = CollectionManager()

    manager.create_collection("default")
    manager.get("default").include("loss")
    manager.get("default").add_tensor_name("assaas")

    # Collections can be added both as objects and by name.
    manager.add(Collection("trial1"))
    manager.add("trial2")
    manager.get("trial2").include("total_loss")

    assert len(manager.collections) == 3
    assert manager.get("default") == manager.collections["default"]
    assert "loss" in manager.get("default").include_regex
    assert len(manager.get("default").tensor_names) > 0
    assert "total_loss" in manager.collections["trial2"].include_regex
def test_manager_export_load():
    """A manager exported to disk and loaded back must compare equal."""
    original = CollectionManager()
    original.create_collection("default")
    original.get("default").include("loss")
    original.add(Collection("trial1"))
    original.add("trial2")
    original.get("trial2").include("total_loss")
    original.export("/tmp/dummy_trial", DEFAULT_COLLECTIONS_FILE_NAME)

    collections_dir = get_path_to_collections("/tmp/dummy_trial")
    exported_file = os.path.join(collections_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    reloaded = CollectionManager.load(exported_file)

    assert original == reloaded
def _read_collections(self, collection_files):
    """Load the collection manager from the first collection file.

    Only ``collection_files[0]`` is read, via ``CollectionManager.load``.
    Sets ``self.collection_manager`` and ``self.num_workers``.
    """
    manager = CollectionManager.load(collection_files[0])
    self.collection_manager = manager
    self.num_workers = self.collection_manager.get_num_workers()
def test_mode_data():
    """Check step/mode bookkeeping of a trial while data is being written.

    Ten global steps of the tensor ``arr`` are written, even steps in TRAIN
    mode and odd steps in EVAL mode, each with mode step ``s // 2``. After
    every write the availability of steps is checked; afterwards the trial's
    steps, modes, global/mode step mapping and tensor values are verified.
    The trial directory is removed at the end of a successful run.
    """
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = ["arr"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)

    tr = create_trial(trial_dir)
    worker = socket.gethostname()

    for s in range(0, 10):
        is_train_step = s % 2 == 0
        # The context manager closes the writer even if write_tensor raises;
        # the availability checks below run only after the writer is closed.
        with FileWriter(trial_dir=trial_dir, step=s, worker=worker) as fw:
            fw.write_tensor(
                tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                tname="arr",
                mode=modes.TRAIN if is_train_step else modes.EVAL,
                mode_step=s // 2,
            )

        if is_train_step:
            assert tr.has_passed_step(s // 2, mode=modes.TRAIN) == StepState.AVAILABLE
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.NOT_YET_AVAILABLE
        else:
            assert tr.has_passed_step(s // 2, mode=modes.EVAL) == StepState.AVAILABLE

        assert tr.has_passed_step(s) == StepState.AVAILABLE
        assert tr.has_passed_step(s + 1) == StepState.NOT_YET_AVAILABLE
        assert tr.has_passed_step(s + 1, mode=modes.TRAIN) == StepState.NOT_YET_AVAILABLE

    assert len(tr.tensor_names()) == 1
    assert len(tr.steps()) == 10
    assert len(tr.steps(mode=modes.TRAIN)) == 5
    assert len(tr.steps(mode=modes.EVAL)) == 5
    assert len(tr.modes()) == 2

    # Global step -> (mode, mode step).
    for i in range(10):
        if i % 2 == 0:
            assert tr.mode(i) == modes.TRAIN
        else:
            assert tr.mode(i) == modes.EVAL
        assert tr.mode_step(i) == i // 2

    # (mode, mode step) -> global step.
    for i in range(5):
        assert tr.global_step(modes.TRAIN, i) == (i * 2)
        assert tr.global_step(modes.EVAL, i) == (i * 2) + 1

    assert len(tr.tensor("arr").steps()) == 10
    assert len(tr.tensor("arr").steps(mode=modes.TRAIN)) == 5
    assert len(tr.tensor("arr").steps(mode=modes.EVAL)) == 5

    for i in range(10):
        assert tr.tensor("arr").value(i) is not None
        if i < 5:
            assert tr.tensor("arr").value(i, mode=modes.TRAIN) is not None
            assert tr.tensor("arr").value(i, mode=modes.EVAL) is not None

    shutil.rmtree(trial_dir)