Example #1
    def _update_last_index_token(self, new_index_token: str) -> None:
        """
        This function updates the last_index_token in the following scenarios:
            1. last_complete_step >= last_index_token_step :
                this means that the token isn't pointing to the latest completed step
            2. number of steps available ( complete or incomplete ) - (last_completed_step+1) > window_size_limit:
                we maintain a window to stop querying for older steps that have not completed.
                if the total number of steps, we are querying for completion is greater than our window_size_limit
                we update the last_index_token and last_complete_step by (window_size_limit // 2)
        :param new_index_token:
        :return:None
        """
        if self.last_index_token is None:
            last_index_token_step = 0
        else:
            last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)

        # Case 1: this case is not satisfied until every worker in a
        # distributed training job has written the step
        if self.last_complete_step >= last_index_token_step:
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            # sort lexicographically and select the last worker
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix, self.last_complete_step, last_worker_serialized)
            self.logger.debug(
                f"Updated last index token to:{self.last_index_token}")

        # Case 2: This case is satisfied if the number of incomplete steps
        # is greater than the INCOMPLETE_STEP_WAIT_WINDOW
        available_step = self._global_to_mode.keys()
        if (len(available_step) - (self.last_complete_step + 1) >
                self._incomplete_wait_for_step_window):
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            last_worker = sorted(list(self.worker_set))[-1]
            # below converts worker_name to serialized workerName
            # if it's a tf device, else no effect
            last_worker_serialized = serialize_tf_device(last_worker)
            self.last_index_token = IndexFileLocationUtils.get_index_key_for_step(
                prefix,
                self.last_complete_step +
                (self._incomplete_wait_for_step_window // 2),
                last_worker_serialized,
            )
            self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)
            self.logger.info(
                f"Waiting for: {len(available_step) - (self.last_complete_step + 1)} Steps. \n"
                f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
                f"Marking the last {self._incomplete_wait_for_step_window // 2} incomplete steps as complete"
                f"Updating last_index_token to: {self.last_index_token}. \n"
                f"Updating last_complete_step to: {self.last_complete_step}. ")
Example #2
def test_index():
    numpy_tensor = [
        np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
        np.array([[1.0, 2.0, 4.0], [3.0, 4.0, 5.0]], dtype=np.float32),
    ]
    runid = "default"
    logdir = "."
    step = 0
    worker = "worker_0"
    run_dir = os.path.join(logdir, runid)
    writer = FileWriter(trial_dir=run_dir,
                        step=step,
                        worker=worker,
                        verbose=True)
    for i in range(len(numpy_tensor)):
        n = "tensor" + str(i)
        writer.write_tensor(tdata=numpy_tensor[i], tname=n)
    writer.flush()
    writer.close()
    efl = TensorFileLocation(step_num=step, worker_name=worker)
    eventfile = efl.get_file_location(trial_dir=run_dir)
    indexfile = IndexFileLocationUtils.get_index_key_for_step(
        run_dir, step, worker)

    fo = open(eventfile, "rb")
    with open(indexfile) as idx_file:
        index_data = json.load(idx_file)
        tensor_payload = index_data["tensor_payload"]
        for i, tensor in enumerate(tensor_payload):
            start_idx = int(tensor["start_idx"])
            length = int(tensor["length"])
            # Seek to the record's offset in the event file and read it
            fo.seek(start_idx, 0)
            line = fo.read(length)
            # Round-trip the raw record through a scratch file
            with open("test.txt", "wb") as zoo:
                zoo.write(line)
            testfile_reader = FileReader("./test.txt")
            tensor_values = list(testfile_reader.read_tensors())
            assert np.allclose(tensor_values[0][2],
                               numpy_tensor[i]), "indexwriter not working"

    fo.close()
    shutil.rmtree(run_dir)
    os.remove("test.txt")
Example #3
def dummy_step_creator(trial_dir, global_step, mode, mode_step, worker_name):
    # Create a minimal index file for the given step so that tests can
    # simulate a worker having written that step
    static_step_data = (
        '{"meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": ""}, '
        '"tensor_payload": ['
        '{"tensorname": "gradients/dummy:0", "start_idx": 0, "length": 1}'
        "]}")

    step = json.loads(static_step_data)
    step["meta"]["mode"] = mode
    step["meta"]["mode_step"] = mode_step

    index_file_location = IndexFileLocationUtils.get_index_key_for_step(
        trial_dir, global_step, worker_name)
    Path(os.path.dirname(index_file_location)).mkdir(parents=True,
                                                     exist_ok=True)
    with open(index_file_location, "w") as f:
        json.dump(step, f)
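
A hypothetical usage of the helper above, writing dummy index files for three consecutive global steps under a scratch trial directory (all argument values are illustrative):

for step in range(3):
    dummy_step_creator(
        trial_dir="./dummy_trial",
        global_step=step,
        mode="TRAIN",
        mode_step=step,
        worker_name="worker_0",
    )
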