def _index_key_for_step(self, new_index_token: str, step: int) -> str:
    """Build the index key for ``step`` from ``new_index_token``'s prefix.

    Uses the lexicographically last worker in ``self.worker_set`` so the
    key points at the final worker's index file for that step.
    """
    prefix = IndexFileLocationUtils.get_prefix_from_index_file(
        new_index_token)
    # sort lexicographically and select the last worker
    last_worker = sorted(self.worker_set)[-1]
    # below converts worker_name to serialized workerName
    # if it's a tf device, else no effect
    last_worker_serialized = serialize_tf_device(last_worker)
    return IndexFileLocationUtils.get_index_key_for_step(
        prefix, step, last_worker_serialized)

def _update_last_index_token(self, new_index_token: str) -> None:
    """Advance ``last_index_token`` (and possibly ``last_complete_step``).

    Two scenarios trigger an update:

    1. ``last_complete_step >= last_index_token_step``: the token is not
       pointing to the latest completed step, so re-point it there.
    2. The number of steps available (complete or incomplete) minus
       ``last_complete_step + 1`` exceeds the incomplete-step wait window:
       we maintain a window to stop querying for older steps that have not
       completed. When exceeded, skip ahead by half the window, marking
       those incomplete steps as complete.

    :param new_index_token: index file name whose prefix is used to build
        the updated token.
    :return: None
    """
    if self.last_index_token is None:
        last_index_token_step = 0
    else:
        last_index_token_step = IndexFileLocationUtils.parse_step_from_index_file_name(
            self.last_index_token)

    # Case 1: This case is not satisfied when all workers in a
    # distributed training job have not written a step
    if self.last_complete_step >= last_index_token_step:
        self.last_index_token = self._index_key_for_step(
            new_index_token, self.last_complete_step)
        self.logger.debug(
            f"Updated last index token to:{self.last_index_token}")

    # Case 2: This case is satisfied if the number of incomplete steps
    # is greater than the INCOMPLETE_STEP_WAIT_WINDOW
    available_step = self._global_to_mode.keys()
    if (len(available_step) - (self.last_complete_step + 1)
            > self._incomplete_wait_for_step_window):
        self.last_index_token = self._index_key_for_step(
            new_index_token,
            self.last_complete_step
            + (self._incomplete_wait_for_step_window // 2),
        )
        # Re-derive last_complete_step from the token we just wrote so the
        # two always agree.
        self.last_complete_step = IndexFileLocationUtils.parse_step_from_index_file_name(
            self.last_index_token)
        self.logger.info(
            f"Waiting for: {len(available_step) - (self.last_complete_step + 1)} Steps. \n"
            f"INCOMPLETE_STEP_WAIT_WINDOW: {self._incomplete_wait_for_step_window}. \n"
            # BUGFIX: the original message was missing the "\n" separator
            # here, fusing this sentence with the next one in the log line.
            f"Marking the last {self._incomplete_wait_for_step_window // 2} incomplete steps as complete. \n"
            f"Updating last_index_token to: {self.last_index_token}. \n"
            f"Updating last_complete_step to: {self.last_complete_step}. ")
def test_tf_device_name_serialize_and_deserialize():
    """Round-trip TF device names through serialize/deserialize helpers."""
    import tensorflow.compat.v1 as tf

    # Prefer the machine's real GPU device name; gpu_device_name() returns
    # an empty (falsy) string when no GPU is present, so fall back to a
    # canonical one.
    gpu_name = tf.test.gpu_device_name() or "/device:GPU:0"
    for name in (gpu_name, "/replica:0/task:0/device:GPU:0"):
        round_tripped = deserialize_tf_device(serialize_tf_device(name))
        assert round_tripped == name
def test_tf_device_name_serialize_and_deserialize():
    """Round-trip TF device names, warning when no GPU is available."""
    logger = get_logger()
    import tensorflow.compat.v1 as tf

    device = tf.test.gpu_device_name()
    if not device:
        # No GPU on this host: warn and substitute a canonical GPU name.
        logger.warning(
            "There is no GPU Support on this machine. Please ignore the cuInit errors generated above"
        )
        device = "/device:GPU:0"

    for name in (device, "/replica:0/task:0/device:GPU:0"):
        assert deserialize_tf_device(serialize_tf_device(name)) == name
def _add_to_device_map(self, tensor):
    """Register non-CPU devices seen on ``tensor`` in ``self.device_map``.

    In TF 2.x eager mode, we cannot rely on input tensors to populate this
    device map as these tensors cannot be saved. Due to this, while
    executing MirroredStrategy on multiple GPUs, weights and biases in the
    form of values.MirroredVariable are the first tensors to reach this
    point. Since MirroredVariable was not processed here, MirroredStrategy
    distributed training jobs failed on GPU. Hence DistributedValues are
    unpacked and processed for TF 2.x eager mode alone.
    """
    to_check = []
    if is_tf_version_2x() and tf.executing_eagerly():
        from tensorflow.python.distribute import values

        # NOTE(review): in eager mode only DistributedValues (e.g.
        # MirroredVariable) are unpacked; plain eager tensors are skipped —
        # confirm this matches the intended eager-mode behavior.
        if isinstance(tensor, values.DistributedValues):
            to_check = list(tensor._values)
    else:
        to_check = [tensor]

    for t in to_check:
        device = t.device
        if device and "CPU" not in device and device not in self.device_map:
            self.device_map[device] = serialize_tf_device(device)
def _add_to_device_map(self, tensor):
    """Record ``tensor``'s device in ``self.device_map`` if it is a new,
    non-CPU device; the value stored is the serialized device name."""
    device = tensor.device
    if not device or "CPU" in device:
        return
    if device not in self.device_map:
        self.device_map[device] = serialize_tf_device(device)