def _has_event_file_been_skipped(self, missing_event_file_name: str) -> bool:
     """
     Checks if an event file will ever be downloaded.
     if event_file present --> return False
     if the worker has written the next event file --> return True
     if none of the above --> return False
     :param missing_event_file_name:
     :return:
     """
     self.logger.info(f" Index Reader: Event File {missing_event_file_name} not found.")
     missing_worker = parse_worker_name_from_file(missing_event_file_name)
     missing_step = IndexFileLocationUtils.parse_step_from_index_file_name(
         missing_event_file_name
     )
     event_files = self.list_event_files(missing_event_file_name)
     for event_file in event_files:
         if missing_worker == parse_worker_name_from_file(event_file):
             step = IndexFileLocationUtils.parse_step_from_index_file_name(event_file)
             if missing_step == step:
                 """
                     The missing step file may have been written to disk before
                     we perform the list operation.
                 """
                 return False
             self.logger.warn(
                 f" Index Reader: Event File {missing_event_file_name} was written but not found "
                 f"\nHowever Event File {event_file} found."
             )
             self.logger.warn(f"IndexReader: Skipping {missing_event_file_name} ")
             return True
     return False
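
A minimal sketch of how a caller might combine this check with a retry loop; `reader` stands for any object exposing `_has_event_file_been_skipped` as above, and `fetch` is a hypothetical callable that returns the file's bytes or raises FileNotFoundError (neither is confirmed smdebug API):

import time

def wait_for_event_file(reader, fetch, event_file_name, poll_secs=2.0, max_tries=10):
    # Poll `fetch` until the file arrives or the reader reports it skipped.
    for _ in range(max_tries):
        try:
            return fetch(event_file_name)
        except FileNotFoundError:
            if reader._has_event_file_been_skipped(event_file_name):
                return None  # worker wrote a later file; this one will never appear
            time.sleep(poll_secs)  # may still be in flight; retry
    return None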
 def read_index_files(
     self, start_after_key: str, range_steps=None
 ) -> Tuple[List[bytes], list, str, List[str]]:
     """
         Read files like `trial_{datetime}/index/000/{step}_{worker}.json.
     :param start_after_key: str
     :param range_steps: str
     :return: Tuple( responses, steps, start_after_key, workers)
     """
     index_files = self.list_index_files()
     steps = []
     workers = []
     responses = []
     if start_after_key is not None:
         start_after_index = bisect_left(index_files, start_after_key)
     else:
         start_after_index = 0
     index_files = index_files[start_after_index:]  # ignore files we have already read
     for index_file in index_files:
         if self.index_file_cache.has_not_read(index_file):
             step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
             if (
                 range_steps is not None and step_in_range(range_steps, step)
             ) or range_steps is None:
                 steps.append(step)
                 workers.append(parse_worker_name_from_file(index_file))
                 self.logger.debug(
                     f"Sagemaker-Debugger: Read {os.path.getsize(index_file)} bytes from file {index_file}"
                 )
                 with open(index_file) as f:
                     responses.append(f.read().encode())
             self.index_file_cache.add(index_file, start_after_key)
     if len(index_files) > 0:
         start_after_key = index_files[-1]  # Last file that we have read
     return responses, steps, start_after_key, workers
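
The returned lists are parallel: responses[i] holds the raw bytes of the index file whose step and worker are steps[i] and workers[i]. A small sketch of pairing them back up, assuming each index file body is JSON (the exact schema is not shown here):

import json

def decode_index_responses(responses, steps, workers):
    # Pair each raw index-file payload with its step and worker.
    for raw, step, worker in zip(responses, steps, workers):
        yield step, worker, json.loads(raw.decode())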
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read index files like `trial_{datetime}/index/000/{step}_{worker}.json`.
    :param start_after_key: S3 key after which to resume listing, or None
    :param range_steps: optional range of steps to restrict reading to
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    object_requests = []
    steps = []
    workers = []
    index_files, start_after_key = self.list_index_files(start_after_key)
    self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if range_steps is None or step_in_range(range_steps, step):
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                object_requests.append(
                    ReadObjectRequest(f"s3://{self.bucket_name}/" + index_file)
                )
            self.index_file_cache.add(index_file, start_after_key)

    responses = self.s3_handler.get_objects(object_requests)
    return responses, steps, start_after_key, workers
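
Because the method returns an updated start_after_key, callers can read incrementally: feeding the key from one call into the next skips index files that have already been listed. A hedged sketch of that loop; `reader` and `handle_batch` are illustrative stand-ins, not smdebug API:

def poll_index(reader, handle_batch, rounds=5):
    start_after_key = None  # first call reads from the beginning
    for _ in range(rounds):
        responses, steps, start_after_key, workers = reader.read_index_files(
            start_after_key
        )
        if responses:
            handle_batch(responses, steps, workers)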
def test_invalid_file_found_exception():
    # A ".tmp" suffix marks an in-progress file, so parsing must raise.
    filename = "/tmp/ts-logs/index/000000001/000000001230_worker_2.json.tmp"
    try:
        parse_worker_name_from_file(filename)
        assert False, "expected IndexReaderException for a .tmp file"
    except IndexReaderException:
        pass
def test_parse_worker_name_from_index_file():
    filename = "/tmp/ts-logs/index/000000001/000000001230_worker_2.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "worker_2"

    filename = "/tmp/ts-logs/index/000000000499__job-worker_replica-0_task-1_device-GPU-6.json"
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:6"

    path = "s3://smdebug-testing/resources/one-index-file"

    _, bucket, prefix = is_s3(path)

    index_reader = S3IndexReader(path)
    index_files, _ = index_reader.list_index_files()

    filename = index_files[0]
    worker_name = parse_worker_name_from_file(filename)
    assert worker_name == "/job:worker/replica:0/task:1/device:GPU:4"
    def run(self):
        while True:
            event_in_queue = self._queue.get()

            if isinstance(event_in_queue, EventWithIndex):
                # EventWithIndex wraps an event that was written via
                # write_summary_with_index; unwrap it to get the raw event.
                event = event_in_queue.event
            else:
                event = event_in_queue

            if event is self._sentinel_event:
                self._queue.task_done()
                break
            try:
                # write event
                positions = self._ev_writer.write_event(event)

                # write index
                if isinstance(event_in_queue, EventWithIndex):
                    eventfile = self._ev_writer.name()
                    eventfile = get_relative_event_file_path(eventfile)
                    tensorlocation = TensorLocation(
                        tname=event_in_queue.tensorname,
                        mode=event_in_queue.get_mode(),
                        mode_step=event_in_queue.mode_step,
                        event_file_name=eventfile,
                        start_idx=positions[0],
                        length=positions[1],
                        worker=parse_worker_name_from_file(eventfile),
                    )
                    self._ev_writer.index_writer.add_index(tensorlocation)
                # Flush the event writer every so often.
                now = time.time()
                if now > self._next_event_flush_time:
                    self._ev_writer.flush()
                    # Schedule the next flush self._flush_secs seconds from now.
                    self._next_event_flush_time = now + self._flush_secs
            finally:
                self._queue.task_done()
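
The loop above exits only when it dequeues the sentinel object, so the producer side has to enqueue it during shutdown. A minimal sketch of what that might look like, assuming the same `_queue` and `_sentinel_event` attributes the loop uses; the method name `close` and the final flush are assumptions, not the library's confirmed API:

    def close(self):
        # Enqueue the sentinel so run() breaks out of its loop, then wait
        # until every queued event (including the sentinel) is processed.
        self._queue.put(self._sentinel_event)
        self._queue.join()
        self._ev_writer.flush()  # assumed: one final flush before shutdown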