def get_tpu_system_metadata(self): """Retrieves TPU system metadata given a TPUClusterResolver.""" cluster_spec = self.cluster_spec() cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None tpu_system_metadata = ( tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access self.master(), cluster_def=cluster_def, query_topology=False))
def get_tpu_system_metadata(tpu_cluster_resolver):
  """Retrieves TPU system metadata given a TPUClusterResolver."""
  master = tpu_cluster_resolver.master()

  # pylint: disable=protected-access
  cluster_spec = tpu_cluster_resolver.cluster_spec()
  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(
          master,
          cluster_def=cluster_def,
          query_topology=False))

  return tpu_system_metadata
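# Usage sketch (added for illustration; not from the original source). It
# assumes TensorFlow is importable and a TPU is reachable; the metadata fields
# read here (`num_cores`, `num_hosts`) are the ones consumed elsewhere in
# this file.
def _example_print_tpu_facts():
  import tensorflow as tf
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  metadata = get_tpu_system_metadata(resolver)
  print('cores: %d, hosts: %d' % (metadata.num_cores, metadata.num_hosts))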
def _get_tpu_system_metadata(self):
  """Gets the (maybe cached) TPU system metadata."""
  master = self._get_master_address()
  tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
  if tpu_system_metadata is not None:
    return tpu_system_metadata

  cluster_def = None
  if (self._config.session_config and
      self._config.session_config.cluster_def.job):
    cluster_def = self._config.session_config.cluster_def

  # pylint: disable=protected-access
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(
          master,
          cluster_def=cluster_def,
          query_topology=self.model_parallelism_enabled))

  self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
  return tpu_system_metadata
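# Design note (added): querying TPU system metadata contacts the master, so
# the method above memoizes the result per master address in
# `_lazy_tpu_system_metadata_dict`. A minimal standalone sketch of the same
# caching pattern, with `query_fn` standing in for the (protected)
# `_query_tpu_system_metadata` call:
class _PerMasterCache(object):
  """Caches query results so each master address is queried at most once."""

  def __init__(self, query_fn):
    self._query_fn = query_fn
    self._cache = {}

  def get(self, master):
    cached = self._cache.get(master)
    if cached is not None:
      return cached
    result = self._query_fn(master)
    self._cache[master] = result
    return result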
def get_tpu_system_metadata(self): """Returns the metadata of the TPU system. Users can call this method to get some facts of the TPU system, like total number of cores, number of TPU workers and the devices. E.g. ```python resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') tpu_system_medata = resolver.get_tpu_system_metadata() num_hosts = tpu_system_medata.num_hosts ``` Returns: A `tf.tpu.experimental.TPUSystemMetadata` object. """ cluster_spec = self.cluster_spec() cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None tpu_system_metadata = ( tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access self.master(), cluster_def=cluster_def, query_topology=False)) return tpu_system_metadata
def __init__(self,
             table_to_config_dict,
             feature_to_table_dict,
             batch_size,
             mode,
             master,
             optimization_parameters=None):
  """API for using TPU for embedding lookups.

  Args:
    table_to_config_dict: A dictionary mapping from string of table name to
      `TableConfig`. Table refers to an embedding table, e.g. `params`
      argument to `tf.nn.embedding_lookup_sparse()`.
    feature_to_table_dict: A dictionary mapping from string of feature name
      to string of table name. Feature refers to ids to lookup in embedding
      table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
    batch_size: An `int` representing the global batch size.
    mode: `TRAINING` or `INFERENCE`.
    master: A `string` representing the TensorFlow master to use.
    optimization_parameters: `AdagradParameters`, `AdamParameters`,
      `StochasticGradientDescentParameters`. Must be set in training and must
      be `None` in inference.

  Raises:
    ValueError: if any input is invalid.
  """
  _validate_table_to_config_dict(table_to_config_dict)
  # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
  self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
  self._combiners = _create_combiners(self._table_to_config_dict)

  _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
  self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
  self._table_to_features_dict = _create_table_to_features_dict(
      self._feature_to_table_dict)

  self._batch_size = batch_size

  self._master = master
  self._tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
  if self._tpu_system_metadata.num_cores == 0:
    raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                     'TPUs.'.format(self._master))
  self._num_hosts = self._tpu_system_metadata.num_hosts
  self._hosts = [
      device.name for device in self._tpu_system_metadata.devices
      if 'device:CPU:' in device.name
  ]
  self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
  self._num_cores = self._tpu_system_metadata.num_cores

  _validate_batch_size(self._batch_size, self._num_cores)
  self._batch_size_per_core = self._batch_size // self._num_cores

  # TODO(shizhiw): remove `mode`?
  if mode == TRAINING:
    _validate_optimization_parameters(optimization_parameters)
    self._optimization_parameters = optimization_parameters
  elif mode == INFERENCE:
    if optimization_parameters is not None:
      raise ValueError('`optimization_parameters` should be `None` '
                       'for inference mode.')
    self._optimization_parameters = (
        StochasticGradientDescentParameters(1.))
  else:
    raise ValueError('`mode` only supports {} and {}; got {}.'.format(
        TRAINING, INFERENCE, mode))
  self._mode = mode

  # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
  # and create special handler for inference that inherits from
  # StochasticGradientDescentHandler with more user-friendly error message
  # on get_slot().
  self._optimizer_handler = _get_optimization_handler(
      self._optimization_parameters)

  self._config_proto = self._create_config_proto()
def __init__(self,
             table_to_config_dict,
             feature_to_table_dict,
             batch_size,
             mode,
             master,
             optimization_parameters=None,
             cluster_def=None,
             pipeline_execution_with_tensor_core=False):
  """API for using TPU for embedding lookups.

  Args:
    table_to_config_dict: A dictionary mapping from string of table name to
      `TableConfig`. Table refers to an embedding table, e.g. `params`
      argument to `tf.nn.embedding_lookup_sparse()`.
    feature_to_table_dict: A dictionary mapping from string of feature name
      to string of table name. Feature refers to ids to lookup in embedding
      table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
    batch_size: An `int` representing the global batch size.
    mode: `TRAINING` or `INFERENCE`.
    master: A `string` representing the TensorFlow master to use.
    optimization_parameters: `AdagradParameters`, `AdamParameters`,
      `StochasticGradientDescentParameters`. Must be set in training and must
      be `None` in inference.
    cluster_def: A `ClusterDef` object describing the TPU cluster.
    pipeline_execution_with_tensor_core: setting this to `True` makes training
      faster, but the trained model will be different if step N and step N+1
      involve the same set of embedding IDs. Please see
      `tpu_embedding_configuration.proto` for details.

  Raises:
    ValueError: if any input is invalid.
  """
  _validate_table_to_config_dict(table_to_config_dict)
  # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
  self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)

  _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
  self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
  self._table_to_features_dict = _create_table_to_features_dict(
      self._feature_to_table_dict)
  self._combiners = _create_combiners(self._table_to_config_dict,
                                      self._table_to_features_dict)

  self._batch_size = batch_size

  self._master = master
  self._cluster_def = cluster_def
  self._tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
          self._master, cluster_def=self._cluster_def))
  if self._tpu_system_metadata.num_cores == 0:
    raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                     'TPUs.'.format(self._master))
  self._num_hosts = self._tpu_system_metadata.num_hosts
  master_job_name = tpu_system_metadata_lib.master_job(self._master,
                                                       self._cluster_def)
  self._hosts = sorted([
      device.name for device in self._tpu_system_metadata.devices
      if 'device:CPU:' in device.name and
      (master_job_name is None or master_job_name in device.name)])
  self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
  self._num_cores = self._tpu_system_metadata.num_cores

  _validate_batch_size(self._batch_size, self._num_cores)
  self._batch_size_per_core = self._batch_size // self._num_cores

  # TODO(shizhiw): remove `mode`?
  if mode == TRAINING:
    _validate_optimization_parameters(optimization_parameters)
    self._optimization_parameters = optimization_parameters
  elif mode == INFERENCE:
    if optimization_parameters is not None:
      raise ValueError('`optimization_parameters` should be `None` '
                       'for inference mode.')
    self._optimization_parameters = (
        StochasticGradientDescentParameters(1.))
  else:
    raise ValueError('`mode` only supports {} and {}; got {}.'
                     .format(TRAINING, INFERENCE, mode))
  self._mode = mode

  # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
  # and create special handler for inference that inherits from
  # StochasticGradientDescentHandler with more user-friendly error message
  # on get_slot().
  self._optimizer_handler = _get_optimization_handler(
      self._optimization_parameters)

  self._pipeline_execution_with_tensor_core = (
      pipeline_execution_with_tensor_core)
  self._config_proto = self._create_config_proto()
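# Usage sketch (added for illustration; not from the original source). The
# `TableConfig` and `AdagradParameters` constructor arguments shown here are
# assumptions about this file's helpers, and the master address is
# hypothetical; only the dictionary shapes are taken from the docstring above.
table_to_config = {
    'video': TableConfig(vocabulary_size=1000, dimension=64),
}
feature_to_table = {
    'watched': 'video',  # feature 'watched' looks up ids in table 'video'.
}
embedding = TPUEmbedding(
    table_to_config_dict=table_to_config,
    feature_to_table_dict=feature_to_table,
    batch_size=128,  # global batch size; divided evenly across TPU cores.
    mode=TRAINING,
    master='grpc://10.0.0.2:8470',  # hypothetical TPU master address.
    optimization_parameters=AdagradParameters(learning_rate=0.1))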