Example no. 1
0
    def _update_last_index_token(self, new_index_token: str) -> None:
        """
        Advance ``self.last_index_token`` (and possibly ``self.last_complete_step``).

        Two scenarios trigger an update:
            1. ``last_complete_step >= last_index_token_step``:
               the stored token lags behind the latest completed step, so
               re-point it at ``last_complete_step``.
            2. number of steps available (complete or incomplete) -
               (last_complete_step + 1) > window_size_limit:
               we maintain a window to stop querying for older steps that
               have not completed; when the count of steps still being
               polled exceeds the window, both the token and
               ``last_complete_step`` advance by (window_size_limit // 2).

        :param new_index_token: index file name used only to derive the key prefix
        :return: None
        """

        def _token_for_step(step: int) -> str:
            # Build the index key for `step` under new_index_token's prefix,
            # targeting the lexicographically last worker. serialize_tf_device
            # converts a TF device name to its serialized workerName form and
            # has no effect otherwise.
            prefix = IndexFileLocationUtils.get_prefix_from_index_file(
                new_index_token)
            last_worker = sorted(self.worker_set)[-1]
            return IndexFileLocationUtils.get_index_key_for_step(
                prefix, step, serialize_tf_device(last_worker))

        if self.last_index_token is None:
            last_index_token_step = 0
        else:
            last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)

        # Case 1: This case is not satisfied when all workers in a
        # distributed training job have not written a step
        if self.last_complete_step >= last_index_token_step:
            self.last_index_token = _token_for_step(self.last_complete_step)
            self.logger.debug(
                f"Updated last index token to:{self.last_index_token}")

        # Case 2: This case is satisfied if the number of incomplete steps
        # is greater than the INCOMPLETE_STEP_WAIT_WINDOW
        available_step = self._global_to_mode.keys()
        if (len(available_step) - (self.last_complete_step + 1) >
                self._incomplete_wait_for_step_window):
            self.last_index_token = _token_for_step(
                self.last_complete_step +
                (self._incomplete_wait_for_step_window // 2))
            # last_complete_step is intentionally updated BEFORE the log
            # below, so the "Waiting for" count reflects the new window.
            self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
                self.last_index_token)
            self.logger.info(
                f"Waiting for: {len(available_step) - (self.last_complete_step + 1)} Steps. \n"
                f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
                # BUG FIX: the original omitted the separator here, fusing
                # "...as complete" and "Updating..." into one word in the log.
                f"Marking the last {self._incomplete_wait_for_step_window // 2} incomplete steps as complete. \n"
                f"Updating last_index_token to: {self.last_index_token}. \n"
                f"Updating last_complete_step to: {self.last_complete_step}. ")
def test_tf_device_name_serialize_and_deserialize():
    """Round-trip TF device names through serialize/deserialize helpers."""
    import tensorflow.compat.v1 as tf

    # Fall back to a canonical GPU name on machines with no visible GPU.
    gpu_name = tf.test.gpu_device_name()
    if not gpu_name:
        gpu_name = "/device:GPU:0"

    # Each name must survive a serialize -> deserialize round trip unchanged.
    for name in (gpu_name, "/replica:0/task:0/device:GPU:0"):
        assert deserialize_tf_device(serialize_tf_device(name)) == name
Example no. 3
0
def test_tf_device_name_serialize_and_deserialize():
    """Round-trip TF device names through serialize/deserialize helpers."""
    logger = get_logger()
    import tensorflow.compat.v1 as tf

    # Fall back to a canonical GPU name on machines with no visible GPU,
    # warning the user so the cuInit noise above is explained.
    gpu_name = tf.test.gpu_device_name()
    if not gpu_name:
        logger.warning(
            "There is no GPU Support on this machine. Please ignore the cuInit errors generated above"
        )
        gpu_name = "/device:GPU:0"

    # Each name must survive a serialize -> deserialize round trip unchanged.
    for name in (gpu_name, "/replica:0/task:0/device:GPU:0"):
        assert deserialize_tf_device(serialize_tf_device(name)) == name
Example no. 4
0
    def _add_to_device_map(self, tensor):
        """Record tensor's device(s) in self.device_map, keyed by device name
        with the serialized device name as the value. CPU devices, empty
        device strings, and already-recorded devices are skipped."""
        # In TF 2.x eager mode, we cannot rely on input tensors to
        # populate this device map as these tensors cannot be saved.
        # Due to this, while executing MirroredStrategy on multiple GPUs,
        # weights and biases in the form of values.MirroredVariable are the
        # first tensors to reach this point. Since MirroredVariable was not
        # processed here, MirroredStrategy distributed training jobs failed
        # on GPU — hence the DistributedValues unpacking below, applied in
        # TF 2.x eager mode alone.
        if is_tf_version_2x() and tf.executing_eagerly():
            from tensorflow.python.distribute import values

            if isinstance(tensor, values.DistributedValues):
                candidates = list(tensor._values)
            else:
                candidates = []
        else:
            candidates = [tensor]

        for candidate in candidates:
            device = candidate.device
            if device and "CPU" not in device and device not in self.device_map:
                self.device_map[device] = serialize_tf_device(device)
Example no. 5
0
 def _add_to_device_map(self, tensor):
     """Record tensor's device in self.device_map, keyed by device name with
     the serialized device name as the value. CPU devices, empty device
     strings, and already-recorded devices are skipped."""
     device = tensor.device
     if not device or "CPU" in device or device in self.device_map:
         return
     self.device_map[device] = serialize_tf_device(device)