Example #1
def get_tpu_system_metadata(tpu_cluster_resolver):
    """Retrieves TPU system metadata given a TPUClusterResolver."""
    master = tpu_cluster_resolver.master()

    # pylint: disable=protected-access
    cluster_def = (tpu_cluster_resolver.cluster_spec()
                   or server_lib.ClusterSpec({})).as_cluster_def()
    tpu_system_metadata = (tpu_system_metadata_lib._query_tpu_system_metadata(
        master, cluster_def=cluster_def, query_topology=True))

    return tpu_system_metadata
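A minimal usage sketch for this helper. The resolver construction below is an assumption for illustration: `TPUClusterResolver` is taken from `tf.distribute.cluster_resolver`, and the TPU worker address is a placeholder.

import tensorflow as tf

# Placeholder address; in practice this comes from your TPU deployment.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu='grpc://10.0.0.2:8470')
metadata = get_tpu_system_metadata(resolver)
# The returned metadata carries core and host counts, as used in the
# later examples on this page.
print(metadata.num_cores, metadata.num_hosts)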
Example #2
def get_tpu_system_metadata(tpu_cluster_resolver):
    """Retrieves TPU system metadata given a TPUClusterResolver."""
    master = tpu_cluster_resolver.master()

    # pylint: disable=protected-access
    cluster_spec = tpu_cluster_resolver.cluster_spec()
    cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
    tpu_system_metadata = (tpu_system_metadata_lib._query_tpu_system_metadata(
        master, cluster_def=cluster_def, query_topology=False))

    return tpu_system_metadata
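Note the contrast with Example #1: when no cluster spec is available, this variant passes `cluster_def=None` instead of an empty `server_lib.ClusterSpec({})`, and it skips the topology query (`query_topology=False` rather than `True`).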
Example #3
def get_tpu_system_metadata(tpu_cluster_resolver):
  """Retrieves TPU system metadata given a TPUClusterResolver."""
  master = tpu_cluster_resolver.master()

  # pylint: disable=protected-access
  cluster_spec = tpu_cluster_resolver.cluster_spec()
  cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(
          master,
          cluster_def=cluster_def,
          query_topology=False))

  return tpu_system_metadata
Example #4
def get_tpu_system_metadata(tpu_cluster_resolver):
  """Retrieves TPU system metadata given a TPUClusterResolver."""
  master = tpu_cluster_resolver.master()

  # pylint: disable=protected-access
  cluster_def = (tpu_cluster_resolver.cluster_spec()
                 or server_lib.ClusterSpec({})).as_cluster_def()
  tpu_system_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(
          master,
          cluster_def=cluster_def,
          query_topology=True))

  return tpu_system_metadata
Example #5
    def _get_tpu_system_metadata(self):
        """Gets the (maybe cached) TPU system metadata."""
        master = self._get_master_address()
        tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
        if tpu_system_metadata is not None:
            return tpu_system_metadata

        # pylint: disable=protected-access
        tpu_system_metadata = (
            tpu_system_metadata_lib._query_tpu_system_metadata(
                master, query_topology=self.model_parallelism_enabled))

        self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
        return tpu_system_metadata
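Examples #5 through #8 memoize the metadata per master address so repeated calls avoid re-querying the TPU system. A standalone sketch of that caching pattern, with `query_fn` standing in for the real `_query_tpu_system_metadata` call:

class MetadataCache(object):
  """Caches one expensive lookup per master address."""

  def __init__(self, query_fn):
    self._query_fn = query_fn  # placeholder for the real query call
    self._cache = {}

  def get(self, master):
    # Query once per master, then serve the cached result.
    if master not in self._cache:
      self._cache[master] = self._query_fn(master)
    return self._cache[master]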
Example #6
  def _get_tpu_system_metadata(self):
    """Gets the (maybe cached) TPU system metadata."""
    master = self._get_master_address()
    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
    if tpu_system_metadata is not None:
      return tpu_system_metadata

    # pylint: disable=protected-access
    tpu_system_metadata = (
        tpu_system_metadata_lib._query_tpu_system_metadata(
            master, query_topology=self.model_parallelism_enabled))

    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
    return tpu_system_metadata
Example #7
  def _get_tpu_system_metadata(self):
    """Gets the (maybe cached) TPU system metadata."""
    master = self._get_master_address()
    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
    if tpu_system_metadata is not None:
      return tpu_system_metadata

    cluster_def = None
    if (self._config.session_config and
        self._config.session_config.cluster_def.job):
      cluster_def = self._config.session_config.cluster_def

    # pylint: disable=protected-access
    tpu_system_metadata = (
        tpu_system_metadata_lib._query_tpu_system_metadata(
            master,
            cluster_def=cluster_def,
            query_topology=self.model_parallelism_enabled))

    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
    return tpu_system_metadata
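Unlike Examples #5 and #6, this variant also threads a `cluster_def` into the query, taking it from the run config's `session_config` when that config defines at least one job.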
Example #8
  def _get_tpu_system_metadata(self):
    """Gets the (maybe cached) TPU system metadata."""
    master = self._get_master_address()
    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
    if tpu_system_metadata is not None:
      return tpu_system_metadata

    cluster_def = None
    if (self._config.session_config and
        self._config.session_config.cluster_def.job):
      cluster_def = self._config.session_config.cluster_def

    # pylint: disable=protected-access
    tpu_system_metadata = (
        tpu_system_metadata_lib._query_tpu_system_metadata(
            master,
            cluster_def=cluster_def,
            query_topology=self.model_parallelism_enabled))

    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
    return tpu_system_metadata
Example #9
    def __init__(self,
                 table_to_config_dict,
                 feature_to_table_dict,
                 batch_size,
                 mode,
                 master,
                 optimization_parameters=None):
        """API for using TPU for embedding lookups.

        Args:
          table_to_config_dict: A dictionary mapping from string of table name to
            `TableConfig`. Table refers to an embedding table, e.g. `params`
            argument to `tf.nn.embedding_lookup_sparse()`.
          feature_to_table_dict: A dictionary mapping from string of feature name
            to string of table name. Feature refers to ids to lookup in embedding
            table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
          batch_size: An `int` representing the global batch size.
          mode: `TRAINING` or `INFERENCE`.
          master: A `string` representing the TensorFlow master to use.
          optimization_parameters: `AdagradParameters`, `AdamParameters`, or
            `StochasticGradientDescentParameters`. Must be set in training and must
            be `None` in inference.

        Raises:
          ValueError: if any input is invalid.
        """
        _validate_table_to_config_dict(table_to_config_dict)
        # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
        self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
        self._combiners = _create_combiners(self._table_to_config_dict)

        _validate_feature_to_table_dict(table_to_config_dict,
                                        feature_to_table_dict)
        self._feature_to_table_dict = _create_ordered_dict(
            feature_to_table_dict)
        self._table_to_features_dict = _create_table_to_features_dict(
            self._feature_to_table_dict)

        self._batch_size = batch_size

        self._master = master
        self._tpu_system_metadata = (
            tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
        if self._tpu_system_metadata.num_cores == 0:
            raise ValueError(
                'TPUEmbedding needs TPUs, but master {} does not have '
                'TPUs.'.format(self._master))
        self._num_hosts = self._tpu_system_metadata.num_hosts
        self._hosts = [
            device.name for device in self._tpu_system_metadata.devices
            if 'device:CPU:' in device.name
        ]
        self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
        self._num_cores = self._tpu_system_metadata.num_cores

        _validate_batch_size(self._batch_size, self._num_cores)
        self._batch_size_per_core = self._batch_size // self._num_cores

        self._init_ops = []

        # TODO(shizhiw): remove `mode`?
        if mode == TRAINING:
            _validate_optimization_parameters(optimization_parameters)
            self._optimization_parameters = optimization_parameters
        elif mode == INFERENCE:
            if optimization_parameters is not None:
                raise ValueError('`optimization_parameters` should be `None` '
                                 'for inference mode.')
            self._optimization_parameters = (
                StochasticGradientDescentParameters(1.))
        else:
            raise ValueError('`mode` only supports {} and {}; got {}.'.format(
                TRAINING, INFERENCE, mode))
        self._mode = mode

        # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
        # and create special handler for inference that inherits from
        # StochasticGradientDescentHandler with more user-friendly error message
        # on get_slot().
        self._optimizer_handler = _get_optimization_handler(
            self._optimization_parameters)

        dummy_table_variables_init_op = self._create_dummy_table_variables()
        self._init_ops.append(dummy_table_variables_init_op)

        self._config_proto = self._create_config_proto()
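The constructor splits the global batch size evenly across cores (`self._batch_size // self._num_cores`), so `_validate_batch_size` presumably enforces divisibility. A plausible sketch of that check; the real validator is not shown in these excerpts:

def _validate_batch_size(batch_size, num_cores):
  # Assumed behavior: reject global batch sizes that do not split evenly.
  if batch_size % num_cores:
    raise ValueError(
        'batch_size must be divisible by the number of TPU cores; got '
        'batch_size={} and num_cores={}.'.format(batch_size, num_cores))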
Example #10
  def __init__(self,
               table_to_config_dict,
               feature_to_table_dict,
               batch_size,
               mode,
               master,
               optimization_parameters=None):
    """API for using TPU for embedding lookups.

    Args:
      table_to_config_dict: A dictionary mapping from string of table name to
        `TableConfig`. Table refers to an embedding table, e.g. `params`
        argument to `tf.nn.embedding_lookup_sparse()`.
      feature_to_table_dict: A dictionary mapping from string of feature name
        to string of table name. Feature refers to ids to lookup in embedding
        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
      batch_size: An `int` representing the global batch size.
      mode: `TRAINING` or `INFERENCE`.
      master: A `string` representing the TensorFlow master to use.
      optimization_parameters: `AdagradParameters`, `AdamParameters`, or
        `StochasticGradientDescentParameters`. Must be set in training and must
        be `None` in inference.

    Raises:
      ValueError: if any input is invalid.
    """
    _validate_table_to_config_dict(table_to_config_dict)
    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
    self._combiners = _create_combiners(self._table_to_config_dict)

    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
    self._table_to_features_dict = _create_table_to_features_dict(
        self._feature_to_table_dict)

    self._batch_size = batch_size

    self._master = master
    self._tpu_system_metadata = (
        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
    if self._tpu_system_metadata.num_cores == 0:
      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                       'TPUs.'.format(self._master))
    self._num_hosts = self._tpu_system_metadata.num_hosts
    self._hosts = [device.name for device in self._tpu_system_metadata.devices
                   if 'device:CPU:' in device.name]
    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
    self._num_cores = self._tpu_system_metadata.num_cores

    _validate_batch_size(self._batch_size, self._num_cores)
    self._batch_size_per_core = self._batch_size // self._num_cores

    self._init_ops = []

    # TODO(shizhiw): remove `mode`?
    if mode == TRAINING:
      _validate_optimization_parameters(optimization_parameters)
      self._optimization_parameters = optimization_parameters
    elif mode == INFERENCE:
      if optimization_parameters is not None:
        raise ValueError('`optimization_parameters` should be `None` '
                         'for inference mode.')
      self._optimization_parameters = (
          StochasticGradientDescentParameters(1.))
    else:
      raise ValueError('`mode` only supports {} and {}; got {}.'
                       .format(TRAINING, INFERENCE, mode))
    self._mode = mode

    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
    # and create special handler for inference that inherits from
    # StochasticGradientDescentHandler with more user-friendly error message
    # on get_slot().
    self._optimizer_handler = _get_optimization_handler(
        self._optimization_parameters)

    dummy_table_variables_init_op = self._create_dummy_table_variables()
    self._init_ops.append(dummy_table_variables_init_op)

    self._config_proto = self._create_config_proto()
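A hypothetical instantiation of the class above, for orientation only. The table sizes, feature names, and master address are placeholders, and the `TableConfig` and `AdagradParameters` constructor signatures are assumptions about the surrounding embedding module:

# All values below are illustrative placeholders.
table_to_config = {
    'video': TableConfig(vocabulary_size=1000000, dimension=64),
}
feature_to_table = {'watched_video_id': 'video'}

embedding = TPUEmbedding(
    table_to_config_dict=table_to_config,
    feature_to_table_dict=feature_to_table,
    batch_size=128,  # global batch size; split across cores internally
    mode=TRAINING,
    master='grpc://10.0.0.2:8470',  # placeholder TPU master
    optimization_parameters=AdagradParameters(learning_rate=0.1,
                                              initial_accumulator=1.0))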