Example No. 1
0
 def get_tpu_system_metadata(self):
     """Retrieves TPU system metadata given a TPUClusterResolver.

     Returns:
       The `TPUSystemMetadata` object produced by querying the TPU system
       behind `self.master()`.
     """
     cluster_spec = self.cluster_spec()
     cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
     tpu_system_metadata = (
         tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
             self.master(),
             cluster_def=cluster_def,
             query_topology=False))
     # BUG FIX: the original computed the metadata and silently discarded it;
     # return it so callers can use the result (matches the other overloads
     # of this function elsewhere in the file).
     return tpu_system_metadata
Example No. 2
0
def get_tpu_system_metadata(tpu_cluster_resolver):
    """Retrieves TPU system metadata given a TPUClusterResolver."""
    # Resolve the master address first, then the cluster description, keeping
    # the original evaluation order.
    master_address = tpu_cluster_resolver.master()
    spec = tpu_cluster_resolver.cluster_spec()
    cluster_def = None
    if spec:
        cluster_def = spec.as_cluster_def()

    # pylint: disable=protected-access
    return tpu_system_metadata_lib._query_tpu_system_metadata(
        master_address, cluster_def=cluster_def, query_topology=False)
Example No. 3
0
def get_tpu_system_metadata(tpu_cluster_resolver):
  """Queries and returns TPU system metadata for `tpu_cluster_resolver`."""
  address = tpu_cluster_resolver.master()

  cluster_spec = tpu_cluster_resolver.cluster_spec()
  # pylint: disable=protected-access
  metadata = tpu_system_metadata_lib._query_tpu_system_metadata(
      address,
      cluster_def=cluster_spec.as_cluster_def() if cluster_spec else None,
      query_topology=False)
  return metadata
Example No. 4
0
    def _get_tpu_system_metadata(self):
        """Returns TPU system metadata, cached per master address."""
        master = self._get_master_address()
        # Serve from the per-master cache when a previous query succeeded.
        cached = self._lazy_tpu_system_metadata_dict.get(master)
        if cached is not None:
            return cached

        session_config = self._config.session_config
        cluster_def = None
        if session_config and session_config.cluster_def.job:
            cluster_def = session_config.cluster_def

        # pylint: disable=protected-access
        metadata = tpu_system_metadata_lib._query_tpu_system_metadata(
            master,
            cluster_def=cluster_def,
            query_topology=self.model_parallelism_enabled)

        self._lazy_tpu_system_metadata_dict[master] = metadata
        return metadata
Example No. 5
0
  def _get_tpu_system_metadata(self):
    """Returns TPU system metadata, memoized per master address."""
    address = self._get_master_address()
    hit = self._lazy_tpu_system_metadata_dict.get(address)
    if hit is not None:
      # Cache hit: reuse the metadata from the earlier query.
      return hit

    config = self._config.session_config
    cluster_def = (
        config.cluster_def if config and config.cluster_def.job else None)

    # pylint: disable=protected-access
    result = tpu_system_metadata_lib._query_tpu_system_metadata(
        address,
        cluster_def=cluster_def,
        query_topology=self.model_parallelism_enabled)

    self._lazy_tpu_system_metadata_dict[address] = result
    return result
Example No. 6
0
    def get_tpu_system_metadata(self):
        """Returns the metadata of the TPU system.

        Users can call this method to get some facts of the TPU system, like
        total number of cores, number of TPU workers and the devices. E.g.
        ```python

        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
        tpu_system_metadata = resolver.get_tpu_system_metadata()
        num_hosts = tpu_system_metadata.num_hosts
        ```

        Returns:
          A `tf.tpu.experimental.TPUSystemMetadata` object.
        """
        spec = self.cluster_spec()
        cluster_def = spec.as_cluster_def() if spec else None
        # pylint: disable=protected-access
        return tpu_system_metadata_lib._query_tpu_system_metadata(
            self.master(), cluster_def=cluster_def, query_topology=False)
Example No. 7
0
    def __init__(self,
                 table_to_config_dict,
                 feature_to_table_dict,
                 batch_size,
                 mode,
                 master,
                 optimization_parameters=None):
        """API for using TPU for embedding lookups.

    Args:
      table_to_config_dict: A dictionary mapping from string of table name to
        `TableConfig`. Table refers to an embedding table, e.g. `params`
        argument to `tf.nn.embedding_lookup_sparse()`.
      feature_to_table_dict: A dictionary mapping from string of feature name
        to string of table name. Feature refers to ids to lookup in embedding
        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
      batch_size: An `int` representing the global batch size.
      mode: `TRAINING` or `INFERENCE`.
      master: A `string` representing the TensorFlow master to use.
      optimization_parameters: `AdagradParameters`, `AdamParameters`,
        `StochasticGradientDescentParameters`. Must be set in training and must
        be `None` in inference.

    Raises:
      ValueError: if any input is invalid.
    """
        _validate_table_to_config_dict(table_to_config_dict)
        # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
        self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
        self._combiners = _create_combiners(self._table_to_config_dict)

        _validate_feature_to_table_dict(table_to_config_dict,
                                        feature_to_table_dict)
        self._feature_to_table_dict = _create_ordered_dict(
            feature_to_table_dict)
        # Reverse mapping: table name -> features that look up in that table.
        self._table_to_features_dict = _create_table_to_features_dict(
            self._feature_to_table_dict)

        self._batch_size = batch_size

        self._master = master
        # Query the TPU system once up front so the host/core counts below
        # reflect the actual hardware reachable through `master`.
        self._tpu_system_metadata = (
            tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
        if self._tpu_system_metadata.num_cores == 0:
            # Fail fast: embedding lookups require TPU cores.
            raise ValueError(
                'TPUEmbedding needs TPUs, but master {} does not have '
                'TPUs.'.format(self._master))
        self._num_hosts = self._tpu_system_metadata.num_hosts
        # Host devices are the CPU devices reported by the TPU system.
        self._hosts = [
            device.name for device in self._tpu_system_metadata.devices
            if 'device:CPU:' in device.name
        ]
        self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
        self._num_cores = self._tpu_system_metadata.num_cores

        _validate_batch_size(self._batch_size, self._num_cores)
        # Global batch is divided evenly across cores (validated above).
        self._batch_size_per_core = self._batch_size // self._num_cores

        # TODO(shizhiw): remove `mode`?
        if mode == TRAINING:
            _validate_optimization_parameters(optimization_parameters)
            self._optimization_parameters = optimization_parameters
        elif mode == INFERENCE:
            if optimization_parameters is not None:
                raise ValueError('`optimization_parameters` should be `None` '
                                 'for inference mode.')
            # Inference never applies gradients; these SGD parameters are a
            # placeholder so the optimizer-handler machinery still works.
            self._optimization_parameters = (
                StochasticGradientDescentParameters(1.))
        else:
            raise ValueError('`mode` only supports {} and {}; got {}.'.format(
                TRAINING, INFERENCE, mode))
        self._mode = mode

        # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
        # and create special handler for inference that inherits from
        # StochasticGradientDescentHandler with more user-friendly error message
        # on get_slot().
        self._optimizer_handler = _get_optimization_handler(
            self._optimization_parameters)

        self._config_proto = self._create_config_proto()
Example No. 8
0
  def __init__(self,
               table_to_config_dict,
               feature_to_table_dict,
               batch_size,
               mode,
               master,
               optimization_parameters=None,
               cluster_def=None,
               pipeline_execution_with_tensor_core=False):
    """API for using TPU for embedding lookups.

    Args:
      table_to_config_dict: A dictionary mapping from string of table name to
        `TableConfig`. Table refers to an embedding table, e.g. `params`
        argument to `tf.nn.embedding_lookup_sparse()`.
      feature_to_table_dict: A dictionary mapping from string of feature name
        to string of table name. Feature refers to ids to lookup in embedding
        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
      batch_size: An `int` representing the global batch size.
      mode: `TRAINING` or `INFERENCE`.
      master: A `string` representing the TensorFlow master to use.
      optimization_parameters: `AdagradParameters`, `AdamParameters`,
        `StochasticGradientDescentParameters`. Must be set in training and must
        be `None` in inference.
      cluster_def: A ClusterDef object describing the TPU cluster.
      pipeline_execution_with_tensor_core: setting this to `True` makes training
        faster, but trained model will be different if step N and step N+1
        involve the same set of embedding IDs. Please see
        `tpu_embedding_configuration.proto` for details.

    Raises:
      ValueError: if any input is invalid.
    """
    _validate_table_to_config_dict(table_to_config_dict)
    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)

    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
    # Reverse mapping: table name -> features that look up in that table.
    self._table_to_features_dict = _create_table_to_features_dict(
        self._feature_to_table_dict)
    self._combiners = _create_combiners(self._table_to_config_dict,
                                        self._table_to_features_dict)

    self._batch_size = batch_size

    self._master = master
    self._cluster_def = cluster_def
    # Query the TPU system once up front so the host/core counts below
    # reflect the actual hardware reachable through `master`.
    self._tpu_system_metadata = (
        tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
            self._master, cluster_def=self._cluster_def))
    if self._tpu_system_metadata.num_cores == 0:
      # Fail fast: embedding lookups require TPU cores.
      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                       'TPUs.'.format(self._master))
    self._num_hosts = self._tpu_system_metadata.num_hosts
    master_job_name = tpu_system_metadata_lib.master_job(self._master,
                                                         self._cluster_def)
    # Host devices are the CPU devices of the master job (all CPU devices if
    # the master job cannot be determined), sorted for deterministic order.
    self._hosts = sorted([
        device.name for device in self._tpu_system_metadata.devices
        if 'device:CPU:' in device.name and (master_job_name is None or
                                             master_job_name in device.name)])
    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
    self._num_cores = self._tpu_system_metadata.num_cores

    _validate_batch_size(self._batch_size, self._num_cores)
    # Global batch is divided evenly across cores (validated above).
    self._batch_size_per_core = self._batch_size // self._num_cores

    # TODO(shizhiw): remove `mode`?
    if mode == TRAINING:
      _validate_optimization_parameters(optimization_parameters)
      self._optimization_parameters = optimization_parameters
    elif mode == INFERENCE:
      if optimization_parameters is not None:
        raise ValueError('`optimization_parameters` should be `None` '
                         'for inference mode.')
      # Inference never applies gradients; these SGD parameters are a
      # placeholder so the optimizer-handler machinery still works.
      self._optimization_parameters = (
          StochasticGradientDescentParameters(1.))
    else:
      raise ValueError('`mode` only supports {} and {}; got {}.'
                       .format(TRAINING, INFERENCE, mode))
    self._mode = mode

    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
    # and create special handler for inference that inherits from
    # StochasticGradientDescentHandler with more user-friendly error message
    # on get_slot().
    self._optimizer_handler = _get_optimization_handler(
        self._optimization_parameters)
    self._pipeline_execution_with_tensor_core = (
        pipeline_execution_with_tensor_core)

    self._config_proto = self._create_config_proto()