def _has_event_file_been_skipped(self, missing_event_file_name: str) -> bool:
    """
    Checks if a missing event file will ever be downloaded.

    if the event file is present --> return False
    if the worker has written the next event file --> return True
    if neither of the above --> return False

    :param missing_event_file_name: name of the event file that was not found
    :return: True if the file should be skipped, False otherwise
    """
    self.logger.info(f" Index Reader: Event File {missing_event_file_name} not found.")
    missing_worker = parse_worker_name_from_file(missing_event_file_name)
    missing_step = IndexFileLocationUtils.parse_step_from_index_file_name(
        missing_event_file_name
    )
    event_files = self.list_event_files(missing_event_file_name)
    for event_file in event_files:
        if missing_worker == parse_worker_name_from_file(event_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(event_file)
            if missing_step == step:
                # The missing step file may have been written to disk
                # before we performed the list operation.
                return False
            self.logger.warning(
                f" Index Reader: Event File {missing_event_file_name} was written but not found. "
                f"However, Event File {event_file} was found."
            )
            self.logger.warning(f"IndexReader: Skipping {missing_event_file_name}")
            return True
    return False
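# --- Illustrative sketch (not part of the reader) ---
# A minimal, self-contained model of the skip heuristic above. The two
# helpers are simplified stand-ins (assumptions) for
# parse_worker_name_from_file and
# IndexFileLocationUtils.parse_step_from_index_file_name, which operate on
# index file names of the form {step}_{worker}.json.
import os


def _worker(file_name: str) -> str:
    # "000000000012_worker_0.json" -> "worker_0"
    base = os.path.basename(file_name)
    return base.split("_", 1)[1].rsplit(".", 1)[0]


def _step(file_name: str) -> int:
    # "000000000012_worker_0.json" -> 12
    return int(os.path.basename(file_name).split("_", 1)[0])


def _sketch_has_been_skipped(missing: str, listed: list) -> bool:
    for f in listed:
        if _worker(f) == _worker(missing):
            if _step(f) == _step(missing):
                return False  # the missing file showed up after all
            return True  # same worker has moved on to another step
    return False  # no evidence either way; keep waiting


# worker_0 already wrote step 13, so step 12 will never arrive:
assert _sketch_has_been_skipped(
    "000000000012_worker_0.json", ["000000000013_worker_0.json"]
)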
def test_get_prefix_from_index_file():
    local_index_filepath = (
        "/opt/ml/testing/run_1/index/000000000/000000000000_worker_0.json"
    )
    prefix = IndexFileLocationUtils.get_prefix_from_index_file(local_index_filepath)
    assert prefix == "/opt/ml/testing/run_1"

    s3_index_filepath = (
        "s3://bucket-that-does-not-exist/run_1/index/000000000/000000000000_worker_0.json"
    )
    prefix = IndexFileLocationUtils.get_prefix_from_index_file(s3_index_filepath)
    assert prefix == "s3://bucket-that-does-not-exist/run_1"
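# The test above pins down the contract: everything before the "/index/"
# path component is the trial prefix, for both local and s3:// paths. A
# one-line sketch of that behavior (an assumption about the real
# implementation, which may differ):
def _sketch_get_prefix(index_file_path: str) -> str:
    return index_file_path.split("/index/", 1)[0]


assert (
    _sketch_get_prefix(
        "s3://bucket-that-does-not-exist/run_1/index/000000000/000000000000_worker_0.json"
    )
    == "s3://bucket-that-does-not-exist/run_1"
)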
def list_index_files(self):
    index_dirname = IndexFileLocationUtils.get_index_path(self.path)
    # index files are JSON or CSV files, i.e. files ending in ".json" or ".csv"
    index_files_regex = r"(.+)\.(json|csv)$"
    index_files = list_files_in_directory(index_dirname, file_regex=index_files_regex)
    return sorted(index_files)
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: key after which to resume reading, or None to read from the start
    :param range_steps: optional range of steps to restrict the read to
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    index_files = self.list_index_files()
    steps = []
    workers = []
    responses = []
    if start_after_key is not None:
        start_after_index = bisect_left(index_files, start_after_key)
    else:
        start_after_index = 0
    index_files = index_files[start_after_index:]  # ignore files we have already read
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if (
                range_steps is not None and step_in_range(range_steps, step)
            ) or range_steps is None:
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                self.logger.debug(
                    f"Sagemaker-Debugger: Read {os.path.getsize(index_file)} bytes "
                    f"from file {index_file}"
                )
                with open(index_file) as f:
                    responses.append(f.read().encode())
            self.index_file_cache.add(index_file, start_after_key)
    if len(index_files) > 0:
        start_after_key = index_files[-1]  # last file that we have read
    return responses, steps, start_after_key, workers
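# A runnable sketch of just the resume logic above. Note a subtlety:
# bisect_left returns the position of start_after_key itself when the key is
# present, so the slice still contains the last file read; it is the
# index_file_cache that prevents that file from being parsed twice.
from bisect import bisect_left

_index_files = sorted(
    [
        "index/000000000/000000000000_worker_0.json",
        "index/000000000/000000000001_worker_0.json",
        "index/000000000/000000000002_worker_0.json",
    ]
)
_start_after_key = "index/000000000/000000000001_worker_0.json"
_remaining = _index_files[bisect_left(_index_files, _start_after_key):]
assert _remaining[0] == _start_after_key  # last-read file reappears in the slice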
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: key after which to resume reading, or None to read from the start
    :param range_steps: optional range of steps to restrict the read to
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    object_requests = []
    steps = []
    workers = []
    index_files, start_after_key = self.list_index_files(start_after_key)
    self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if (
                range_steps is not None and step_in_range(range_steps, step)
            ) or range_steps is None:
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                object_requests.append(
                    ReadObjectRequest(f"s3://{self.bucket_name}/{index_file}")
                )
            self.index_file_cache.add(index_file, start_after_key)
    responses = self.s3_handler.get_objects(object_requests)
    return responses, steps, start_after_key, workers
def list_index_files(self, start_after_key=None):
    index_files, last_index_token = list_s3_objects(
        self.bucket_name,
        IndexFileLocationUtils.get_index_path(self.prefix_name),
        start_after_key,
    )
    return index_files, last_index_token
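# A hedged sketch of what list_s3_objects presumably wraps: S3 ListObjectsV2
# with StartAfter for resumption. This is an assumption made for
# illustration, not the helper's actual implementation.
import boto3


def _sketch_list_s3_objects(bucket, prefix, start_after_key=None):
    s3 = boto3.client("s3")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    if start_after_key is not None:
        kwargs["StartAfter"] = start_after_key
    keys = []
    for page in s3.get_paginator("list_objects_v2").paginate(**kwargs):
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    # the last key seen becomes the token for the next listing
    last_index_token = keys[-1] if keys else start_after_key
    return keys, last_index_token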
def _update_last_index_token(self, new_index_token: str) -> None:
    """
    Updates last_index_token in the following scenarios:

    1. last_complete_step >= last_index_token_step:
       the token is not pointing to the latest completed step.
    2. number of steps available (complete or incomplete)
       - (last_complete_step + 1) > window_size_limit:
       we maintain a window to stop querying for older steps
       that have not completed. If the total number of steps
       we are querying for completion exceeds window_size_limit,
       we advance last_index_token and last_complete_step
       by (window_size_limit // 2).

    :param new_index_token: the most recently read index key
    :return: None
    """
    if self.last_index_token is None:
        last_index_token_step = 0
    else:
        last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
            self.last_index_token
        )

    # Case 1: not satisfied while some workers in a distributed
    # training job have not yet written the step
    if self.last_complete_step >= last_index_token_step:
        prefix = IndexFileLocationUtils.get_prefix_from_index_file(new_index_token)
        # sort lexicographically and select the last worker
        last_worker = sorted(self.worker_set)[-1]
        # converts worker_name to a serialized worker name
        # if it is a TF device; otherwise has no effect
        last_worker_serialized = serialize_tf_device(last_worker)
        self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
            prefix, self.last_complete_step, last_worker_serialized
        )
        self.logger.debug(f"Updated last index token to:{self.last_index_token}")

    # Case 2: satisfied if the number of incomplete steps
    # exceeds INCOMPLETE_STEP_WAIT_WINDOW
    available_step = self._global_to_mode.keys()
    if (
        len(available_step) - (self.last_complete_step + 1)
        > self._incomplete_wait_for_step_window
    ):
        prefix = IndexFileLocationUtils.get_prefix_from_index_file(new_index_token)
        last_worker = sorted(self.worker_set)[-1]
        # converts worker_name to a serialized worker name
        # if it is a TF device; otherwise has no effect
        last_worker_serialized = serialize_tf_device(last_worker)
        self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
            prefix,
            self.last_complete_step + (self._incomplete_wait_for_step_window // 2),
            last_worker_serialized,
        )
        self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
            self.last_index_token
        )
        self.logger.info(
            f"Waiting for: {len(available_step) - (self.last_complete_step + 1)} Steps. \n"
            f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
            f"Marking the last {self._incomplete_wait_for_step_window // 2} "
            f"incomplete steps as complete. \n"
            f"Updating last_index_token to: {self.last_index_token}. \n"
            f"Updating last_complete_step to: {self.last_complete_step}. "
        )
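# Worked example of the Case 2 window arithmetic with hypothetical numbers:
# with a wait window of 1000, 1500 known steps, and last_complete_step = 10,
# 1489 steps are pending (1500 - 11). That exceeds the window, so the reader
# advances by window // 2 = 500 and marks those steps complete.
_window = 1000
_num_available_steps = 1500
_last_complete_step = 10

if _num_available_steps - (_last_complete_step + 1) > _window:
    _last_complete_step += _window // 2

assert _last_complete_step == 510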
def test_index():
    numpy_tensor = [
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([[1.0, 2.0, 4.0], [3.0, 4.0, 5.0]], dtype=np.float32),
    ]
    runid = "default"
    logdir = "."
    step = 0
    worker = "worker_0"
    run_dir = os.path.join(logdir, runid)
    writer = FileWriter(trial_dir=run_dir, step=step, worker=worker, verbose=True)
    for i in range(len(numpy_tensor)):
        n = "tensor" + str(i)
        writer.write_tensor(tdata=numpy_tensor[i], tname=n)
    writer.flush()
    writer.close()

    efl = TensorFileLocation(step_num=step, worker_name=worker)
    eventfile = efl.get_file_location(trial_dir=run_dir)
    indexfile = IndexFileLocationUtils.get_index_key_for_step(run_dir, step, worker)

    with open(indexfile) as idx_file:
        index_data = json.load(idx_file)
    tensor_payload = index_data["tensor_payload"]

    with open(eventfile, "rb") as fo:
        for i, tensor in enumerate(tensor_payload):
            # seek to the byte range recorded in the index and extract
            # the serialized tensor from the event file
            start_idx = int(tensor["start_idx"])
            fo.seek(start_idx, 0)
            length = int(tensor["length"])
            line = fo.read(length)
            with open("test.txt", "wb") as zoo:
                zoo.write(line)
            testfile_reader = FileReader("./test.txt")
            tensor_values = list(testfile_reader.read_tensors())
            assert np.allclose(
                tensor_values[0][2], numpy_tensor[i]
            ), "indexwriter not working"

    shutil.rmtree(run_dir)
    os.remove("test.txt")
def dummy_step_creator(trial_dir, global_step, mode, mode_step, worker_name):
    static_step_data = (
        '{"meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": ""}, '
        '"tensor_payload": ['
        '{"tensorname": "gradients/dummy:0", "start_idx": 0, "length": 1}'
        "]}"
    )

    step = json.loads(static_step_data)
    step["meta"]["mode"] = mode
    step["meta"]["mode_step"] = mode_step

    index_file_location = IndexFileLocationUtils.get_index_key_for_step(
        trial_dir, global_step, worker_name
    )
    Path(os.path.dirname(index_file_location)).mkdir(parents=True, exist_ok=True)
    with open(index_file_location, "w") as f:
        json.dump(step, f)
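# Example use of the helper above to fabricate a small trial layout for
# tests: one TRAIN step and one EVAL step under a temporary directory.
import tempfile

with tempfile.TemporaryDirectory() as trial_dir:
    dummy_step_creator(trial_dir, global_step=0, mode="TRAIN", mode_step=0,
                       worker_name="worker_0")
    dummy_step_creator(trial_dir, global_step=1, mode="EVAL", mode_step=0,
                       worker_name="worker_0")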
def list_index_files(self):
    index_dirname = IndexFileLocationUtils.get_index_path(self.path)
    index_files = list_files_in_directory(index_dirname)
    return sorted(index_files)