def _has_event_file_been_skipped(self, missing_event_file_name: str) -> bool:
    """
    Checks whether a missing event file will ever be downloaded.

    if the event file is present --> return False
    if the worker has already written the next event file --> return True
    if neither of the above --> return False

    :param missing_event_file_name: name of the event file that was not found
    :return: True if the event file has been skipped
    """
    self.logger.info(f" Index Reader: Event File {missing_event_file_name} not found.")
    missing_worker = parse_worker_name_from_file(missing_event_file_name)
    missing_step = IndexFileLocationUtils.parse_step_from_index_file_name(
        missing_event_file_name
    )
    event_files = self.list_event_files(missing_event_file_name)
    for event_file in event_files:
        if missing_worker == parse_worker_name_from_file(event_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(event_file)
            if missing_step == step:
                # The missing event file may have been written to disk
                # before we performed the list operation.
                return False
            self.logger.warning(
                f" Index Reader: Event File {missing_event_file_name} was written but not found. "
                f"However, Event File {event_file} was found."
            )
            self.logger.warning(f"IndexReader: Skipping {missing_event_file_name}")
            return True
    return False
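# A minimal, self-contained sketch of the skip decision above, assuming
# hypothetical `worker_of` and `step_of` callables in place of
# parse_worker_name_from_file and
# IndexFileLocationUtils.parse_step_from_index_file_name.
from typing import Callable, List


def was_skipped(
    missing_file: str,
    listed_files: List[str],
    worker_of: Callable[[str], str],
    step_of: Callable[[str], int],
) -> bool:
    missing_worker = worker_of(missing_file)
    missing_step = step_of(missing_file)
    for f in listed_files:
        if worker_of(f) != missing_worker:
            continue
        if step_of(f) == missing_step:
            return False  # the file appeared after all; it was just slow to land
        return True  # the worker moved on to another step, so the file was skipped
    return False  # no evidence either way; the file may still be written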
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: str
    :param range_steps: str
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    index_files = self.list_index_files()
    steps = []
    workers = []
    responses = []
    if start_after_key is not None:
        start_after_index = bisect_left(index_files, start_after_key)
    else:
        start_after_index = 0
    index_files = index_files[start_after_index:]  # ignore files we have already read
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if range_steps is None or step_in_range(range_steps, step):
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                self.logger.debug(
                    f"Sagemaker-Debugger: Read {os.path.getsize(index_file)} bytes "
                    f"from file {index_file}"
                )
                with open(index_file) as f:
                    responses.append(f.read().encode())
                self.index_file_cache.add(index_file, start_after_key)
    if len(index_files) > 0:
        start_after_key = index_files[-1]  # last file that we have read
    return responses, steps, start_after_key, workers
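# A minimal sketch of the resume logic above: bisect_left locates the position
# of the last key we consumed in the sorted listing, so the slice drops every
# file that sorts before it. The file names below are illustrative.
from bisect import bisect_left

index_files = [
    "000000000000_worker_0.json",
    "000000000001_worker_0.json",
    "000000000002_worker_0.json",
]
start_after_key = "000000000000_worker_0.json"
unread = index_files[bisect_left(index_files, start_after_key):]
# bisect_left keeps start_after_key itself in the slice; the
# index_file_cache.has_not_read check above is what prevents re-reading it.
assert unread == index_files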
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.

    :param start_after_key: str
    :param range_steps:
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    object_requests = []
    steps = []
    workers = []
    index_files, start_after_key = self.list_index_files(start_after_key)
    self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if range_steps is None or step_in_range(range_steps, step):
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                object_requests.append(
                    ReadObjectRequest(f"s3://{self.bucket_name}/" + index_file)
                )
                self.index_file_cache.add(index_file, start_after_key)
    responses = self.s3_handler.get_objects(object_requests)
    return responses, steps, start_after_key, workers
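# A minimal usage sketch for either read_index_files variant above, assuming
# the smdebug import path below; the trial path is the one used by the test
# further down in this file.
import json

from smdebug.core.index_reader import S3IndexReader  # assumed module path

reader = S3IndexReader("s3://smdebug-testing/resources/one-index-file")
responses, steps, last_key, workers = reader.read_index_files(start_after_key=None)
for step, worker, payload in zip(steps, workers, responses):
    index_dict = json.loads(payload)  # each response is one serialized index file
    print(step, worker, sorted(index_dict))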
def test_invalid_file_found_exception():
    filename = "/tmp/ts-logs/index/000000001/000000001230_worker_2.json.tmp"
    try:
        parse_worker_name_from_file(filename)
        assert False, "Expected IndexReaderException for a .tmp file"
    except IndexReaderException:
        pass
def test_parse_worker_name_from_index_file():
    filename = "/tmp/ts-logs/index/000000001/000000001230_worker_2.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "worker_2"

    filename = "/tmp/ts-logs/index/000000000499__job-worker_replica-0_task-1_device-GPU-6.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:6"

    path = "s3://smdebug-testing/resources/one-index-file"
    _, bucket, prefix = is_s3(path)
    index_reader = S3IndexReader(path)
    index_files, _ = index_reader.list_index_files()

    filename = index_files[0]
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:4"
def run(self):
    while True:
        event_in_queue = self._queue.get()
        if isinstance(event_in_queue, EventWithIndex):
            # The queue item was written by write_summary_with_index,
            # so it carries index metadata alongside the event.
            event = event_in_queue.event
        else:
            event = event_in_queue
        if event is self._sentinel_event:
            self._queue.task_done()
            break
        try:
            # Write the event and record its (start, length) position.
            positions = self._ev_writer.write_event(event)
            # Write the index entry if one was requested for this event.
            if isinstance(event_in_queue, EventWithIndex):
                eventfile = self._ev_writer.name()
                eventfile = get_relative_event_file_path(eventfile)
                tensorlocation = TensorLocation(
                    tname=event_in_queue.tensorname,
                    mode=event_in_queue.get_mode(),
                    mode_step=event_in_queue.mode_step,
                    event_file_name=eventfile,
                    start_idx=positions[0],
                    length=positions[1],
                    worker=parse_worker_name_from_file(eventfile),
                )
                self._ev_writer.index_writer.add_index(tensorlocation)
            # Flush the event writer every so often.
            now = time.time()
            if now > self._next_event_flush_time:
                self._ev_writer.flush()
                # Schedule the next flush self._flush_secs seconds from now.
                self._next_event_flush_time = now + self._flush_secs
        finally:
            self._queue.task_done()
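# A minimal sketch of why (start_idx, length) is recorded in the index above:
# a reader can later pull a single tensor's record straight out of the event
# file without scanning it. The function name and path are illustrative.
def read_event_record(event_file_path: str, start_idx: int, length: int) -> bytes:
    with open(event_file_path, "rb") as f:
        f.seek(start_idx)  # jump to the offset returned by write_event
        return f.read(length)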