def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py if only_initialize_if_missing and self.writer: return if self.dry_run: return if self.first_process is False: return elif self.first_process is None: if self._get_num_workers() == 1: if is_first_process(self.out_dir): self.first_process = True self.logger.info( f"Hook is writing from the hook with pid: {os.getpid()}\n" ) else: if self.first_process is None: self.logger.warn( f"Unsupported Distributed Training Strategy Detected. \ Sagemaker-Debugger will only write from one process. \ The process with pid: {os.getpid()} will not be writing any data. \n" ) self.first_process = False return if self.save_all_workers is False: if self.worker != self.chief_worker: return self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)
def test_is_first_process(dir):
    # S3 destinations never need local-lock arbitration, so they are always
    # reported as the first process.
    assert is_first_process("s3://this/is/a/valid/path")

    # Exercise the local-path arbitration repeatedly to catch flakiness.
    for _ in range(10):
        helper_test_is_first_process(dir)
def write_tf_dataloader_flag(self, flag_filename):
    """If dataloader metrics collection is enabled, then write a .tmp file with the
    provided flag_filename such that is has this path:
    <local_path>/<node_id>/<flag_filename>. We simply create the file but never
    close the writer, since we don't want the file to be uploaded to s3.

    If flag_filename is TF_DATALOADER_START_FLAG_FILENAME, we are signaling that
    dataloader metrics should be collected now. If flag_filename is
    TF_DATALOADER_END_FLAG_FILENAME, we are signaling that dataloader metrics
    should not be collected anymore.

    In AWS TF, we will collect dataloader metrics when only
    TF_DATALOADER_START_FLAG_FILENAME exists and not collect dataloader metrics
    when neither or both flags exist.

    Return True if writing the flag was successful, False if unsuccessful.
    """
    if not self.profiling_enabled or not self.config.dataloader_profiling_config.is_enabled():
        # Fix: the docstring promises a boolean, but this path previously
        # fell through with an implicit `return None`. Returning False keeps
        # the contract and is backward compatible (both are falsy).
        return False
    tf_dataloader_flag_path = os.path.join(
        self.config.local_path, get_node_id_from_resource_config(), flag_filename
    )
    # is_first_process creates the flag file as a side effect of its
    # lock-acquisition attempt (is_dir=False => treat path as a file).
    success = is_first_process(tf_dataloader_flag_path, is_dir=False)
    if not os.path.isfile(tf_dataloader_flag_path):
        self.logger.error(f"Could not write flag to: {tf_dataloader_flag_path}!")
        return False
    return success