def _has_event_file_been_skipped(self, missing_event_file_name: str) -> bool:
    """
    Checks whether a missing event file will ever be downloaded.
        if the event file is present --> return False
        if the worker has already written the next event file --> return True
        if neither of the above --> return False
    :param missing_event_file_name: name of the event file that was not found
    :return: True if the event file has been skipped and will never appear, else False
    """
    self.logger.info(f" Index Reader: Event File {missing_event_file_name} not found.")
    missing_worker = parse_worker_name_from_file(missing_event_file_name)
    missing_step = IndexFileLocationUtils.parse_step_from_index_file_name(
        missing_event_file_name
    )
    event_files = self.list_event_files(missing_event_file_name)
    for event_file in event_files:
        if missing_worker == parse_worker_name_from_file(event_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(event_file)
            if missing_step == step:
                # The missing step file may have been written to disk between
                # the failed read and this list operation.
                return False
            self.logger.warning(
                f" Index Reader: Event File {missing_event_file_name} was written but not found. "
                f"\nHowever, Event File {event_file} was found."
            )
            self.logger.warning(f"IndexReader: Skipping {missing_event_file_name}")
            return True
    return False
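For intuition, here is the skip decision in isolation: a minimal sketch, assuming file names of the form `{step}_{worker}.json` as in the test below. `_parse_step` and `_parse_worker` are simplified stand-ins for the real parsing utilities, not the library's code.

def _parse_step(name):
    # "<step>_<worker>.json" -> int(step)
    return int(name.rsplit("/", 1)[-1].split("_", 1)[0])

def _parse_worker(name):
    # "<step>_<worker>.json" -> worker
    return name.rsplit("/", 1)[-1].split("_", 1)[1].rsplit(".", 1)[0]

missing = "run_1/index/000000000/000000000012_worker_0.json"
listed = ["run_1/index/000000000/000000000013_worker_0.json"]

# worker_0 has already written step 13, so step 12 will never appear.
skipped = any(
    _parse_worker(f) == _parse_worker(missing)
    and _parse_step(f) != _parse_step(missing)
    for f in listed
)
print(skipped)  # True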
Example #2
def test_get_prefix_from_index_file():
    local_index_filepath = "/opt/ml/testing/run_1/index/000000000/000000000000_worker_0.json"
    prefix = IndexFileLocationUtils.get_prefix_from_index_file(local_index_filepath)

    assert prefix == "/opt/ml/testing/run_1"

    s3_index_filepath = (
        "s3://bucket-that-does-not-exist/run_1/index/000000000/000000000000_worker_0.json"
    )
    prefix = IndexFileLocationUtils.get_prefix_from_index_file(s3_index_filepath)

    assert prefix == "s3://bucket-that-does-not-exist/run_1"
Example #3
def list_index_files(self):
    index_dirname = IndexFileLocationUtils.get_index_path(self.path)
    # index files are JSON or CSV files, i.e. names ending in ".json" or ".csv"
    index_files_regex = r"(.+)\.(json|csv)$"
    index_files = list_files_in_directory(index_dirname,
                                          file_regex=index_files_regex)
    return sorted(index_files)
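To see what the regex admits, a quick standalone check with hypothetical file names:

import re

index_files_regex = r"(.+)\.(json|csv)$"
names = ["000000000000_worker_0.json", "steps.csv", "events.tfevents", "notes.txt"]
print([n for n in names if re.match(index_files_regex, n)])
# ['000000000000_worker_0.json', 'steps.csv']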
Example #4
def read_index_files(
    self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
    """
    Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.
    :param start_after_key: str, the last index file key already read
    :param range_steps: optional range of steps to restrict the read to
    :return: Tuple(responses, steps, start_after_key, workers)
    """
    index_files = self.list_index_files()
    steps = []
    workers = []
    responses = []
    if start_after_key is not None:
        start_after_index = bisect_left(index_files, start_after_key)
    else:
        start_after_index = 0
    index_files = index_files[start_after_index:]  # ignore files we have already read
    for index_file in index_files:
        if self.index_file_cache.has_not_read(index_file):
            step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
            if range_steps is None or step_in_range(range_steps, step):
                steps.append(step)
                workers.append(parse_worker_name_from_file(index_file))
                self.logger.debug(
                    f"Sagemaker-Debugger: Read {os.path.getsize(index_file)} bytes from file {index_file}"
                )
                with open(index_file) as f:
                    responses.append(f.read().encode())
            self.index_file_cache.add(index_file, start_after_key)
    if len(index_files) > 0:
        start_after_key = index_files[-1]  # last file that we have read
    return responses, steps, start_after_key, workers
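The `start_after_key` resume logic is worth isolating: `bisect_left` on the sorted list returns the position of the first file not less than the key, so slicing from there skips everything before it (the boundary file itself is then deduplicated by `index_file_cache`). A self-contained illustration:

from bisect import bisect_left

index_files = sorted([
    "index/000/000000000000_worker_0.json",
    "index/000/000000000001_worker_0.json",
    "index/000/000000000002_worker_0.json",
])
start_after_key = "index/000/000000000001_worker_0.json"
print(index_files[bisect_left(index_files, start_after_key):])
# ['index/000/000000000001_worker_0.json', 'index/000/000000000002_worker_0.json']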
Example #5
    def read_index_files(
        self, start_after_key: str, range_steps=None
    ) -> Tuple[List[bytes], list, str, List[str]]:
        """
        Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.
        :param start_after_key: str, the last index file key already read
        :param range_steps: optional range of steps to restrict the read to
        :return: Tuple(responses, steps, start_after_key, workers)
        """
        object_requests = []
        steps = []
        workers = []
        index_files, start_after_key = self.list_index_files(start_after_key)
        self.logger.debug(f'Loaded Index Files: {",".join(index_files)}')
        for index_file in index_files:
            if self.index_file_cache.has_not_read(index_file):
                step = IndexFileLocationUtils.parse_step_from_index_file_name(index_file)
                if range_steps is None or step_in_range(range_steps, step):
                    steps.append(step)
                    workers.append(parse_worker_name_from_file(index_file))
                    object_requests.append(
                        ReadObjectRequest(f"s3://{self.bucket_name}/{index_file}")
                    )
                self.index_file_cache.add(index_file, start_after_key)

        responses = self.s3_handler.get_objects(object_requests)
        return responses, steps, start_after_key, workers
Example #6
    def list_index_files(self, start_after_key=None):
        index_files, last_index_token = list_s3_objects(
            self.bucket_name,
            IndexFileLocationUtils.get_index_path(self.prefix_name),
            start_after_key,
        )

        return index_files, last_index_token
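For reference, a rough stand-in for `list_s3_objects` built on boto3's `list_objects_v2`; the assumption that the helper paginates with `StartAfter` and returns the last key seen as the continuation token is mine, not taken from the library:

import boto3

def list_s3_objects(bucket, prefix, start_after_key=None):
    s3 = boto3.client("s3")
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    if start_after_key is not None:
        kwargs["StartAfter"] = start_after_key
    response = s3.list_objects_v2(**kwargs)
    keys = [obj["Key"] for obj in response.get("Contents", [])]
    # Use the last key seen as the token to resume from next time.
    last_index_token = keys[-1] if keys else start_after_key
    return keys, last_index_token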
Example #7
    def _update_last_index_token(self, new_index_token: str) -> None:
        """
        Updates last_index_token in the following scenarios:
            1. last_complete_step >= last_index_token_step:
                the token is not yet pointing to the latest completed step.
            2. (number of steps available, complete or incomplete) - (last_complete_step + 1) > window_size_limit:
                we maintain a window so that we stop querying for older steps
                that have never completed. If the number of steps we are still
                querying for completion exceeds window_size_limit, we advance
                last_index_token and last_complete_step by (window_size_limit // 2).
        :param new_index_token: the most recently read index file key
        :return: None
        """
        if self.last_index_token is None:
            last_index_token_step = 0
        else:
            last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)

        # Case 1: This case is not satisfied until every worker in a
        # distributed training job has written the step
        if self.last_complete_step >= last_index_token_step:
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            # sort lexicographically and select the last worker
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix, self.last_complete_step, last_worker_serialized)
            self.logger.debug(
                f"Updated last index token to:{self.last_index_token}")

        # Case 2: This case is satisfied if the number of incomplete steps
        # is greater than the INCOMPLETE_STEP_WAIT_WINDOW
        available_steps = self._global_to_mode.keys()
        if (len(available_steps) - (self.last_complete_step + 1) >
                self._incomplete_wait_for_step_window):
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix,
                self.last_complete_step +
                (self._incomplete_wait_for_step_window // 2),
                last_worker_serialized,
            )
            self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)
            self.logger.info(
                f"Waiting for: {len(available_steps) - (self.last_complete_step + 1)} Steps. \n"
                f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
                f"Marking the last {self._incomplete_wait_for_step_window // 2} incomplete steps as complete. \n"
                f"Updating last_index_token to: {self.last_index_token}. \n"
                f"Updating last_complete_step to: {self.last_complete_step}. ")
Example #8
def test_index():
    numpy_tensor = [
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([[1.0, 2.0, 4.0], [3.0, 4.0, 5.0]], dtype=np.float32),
    ]
    runid = "default"
    logdir = "."
    step = 0
    worker = "worker_0"
    run_dir = os.path.join(logdir, runid)
    writer = FileWriter(trial_dir=run_dir,
                        step=step,
                        worker=worker,
                        verbose=True)
    for i in range(len(numpy_tensor)):
        n = "tensor" + str(i)
        writer.write_tensor(tdata=numpy_tensor[i], tname=n)
    writer.flush()
    writer.close()
    efl = TensorFileLocation(step_num=step, worker_name=worker)
    eventfile = efl.get_file_location(trial_dir=run_dir)
    indexfile = IndexFileLocationUtils.get_index_key_for_step(
        run_dir, step, worker)

    fo = open(eventfile, "rb")
    with open(indexfile) as idx_file:
        index_data = json.load(idx_file)
        tensor_payload = index_data["tensor_payload"]
        i = 0
        for tensor in tensor_payload:
            start_idx = int(tensor["start_idx"])
            fo.seek(start_idx, 0)
            length = int(tensor["length"])
            line = fo.read(length)
            with open("test.txt", "wb") as zoo:
                zoo.write(line)
            testfile_reader = FileReader("./test.txt")
            tensor_values = list(testfile_reader.read_tensors())
            assert np.allclose(
                tensor_values[0][2],
                numpy_tensor[i]), "indexwriter not working"
            i = i + 1

    fo.close()
    shutil.rmtree(run_dir)
    os.remove("test.txt")
Example #9
def dummy_step_creator(trial_dir, global_step, mode, mode_step, worker_name):
    static_step_data = (
        '{"meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": ""}, '
        '"tensor_payload": ['
        '{"tensorname": "gradients/dummy:0", "start_idx": 0, "length": 1}'
        "]}")

    step = json.loads(static_step_data)
    step["meta"]["mode"] = mode
    step["meta"]["mode_step"] = mode_step

    index_file_location = IndexFileLocationUtils.get_index_key_for_step(
        trial_dir, global_step, worker_name)
    Path(os.path.dirname(index_file_location)).mkdir(parents=True,
                                                     exist_ok=True)
    with open(index_file_location, "w") as f:
        json.dump(step, f)
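A typical invocation (the paths here are hypothetical) that fabricates an index entry for global step 5, written by worker_0 in TRAIN mode:

dummy_step_creator(
    trial_dir="/tmp/test_trial",   # hypothetical trial directory
    global_step=5,
    mode="TRAIN",
    mode_step=5,
    worker_name="worker_0",
)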
Example #10
def list_index_files(self):
    index_dirname = IndexFileLocationUtils.get_index_path(self.path)
    index_files = list_files_in_directory(index_dirname)
    return sorted(index_files)
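This variant applies no regex filter, so every file under the index directory is returned. `list_files_in_directory` is a library helper; a minimal stand-in consistent with how it is called in these examples might look like:

import os
import re

def list_files_in_directory(dirname, file_regex=None):
    # Walk the tree and optionally keep only names matching file_regex.
    files = []
    for root, _, names in os.walk(dirname):
        for name in names:
            if file_regex is None or re.match(file_regex, name):
                files.append(os.path.join(root, name))
    return files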