def master_job(self):
  """Resolves the job name on which TPU computations should be placed.

  A `tpu_job_name` set explicitly on the TPUConfig always wins. Otherwise the
  name is looked up from the master address for the current mode (the
  evaluation master in EVAL mode, the regular master in every other mode)
  together with the session config's cluster definition, if there is one.

  Returns:
    A string containing the job name, or None if no job should be specified.

  Raises:
    ValueError: If the job name cannot be inferred automatically, so the user
      has to specify a tpu_job_name, or if the user-specified job names are
      inappropriate. The message of the underlying error is kept and extended
      with a hint to set `tpu_job_name` in the TPUConfig.
  """
  config = self._config

  # An explicit user choice short-circuits all inference.
  explicit_job_name = config.tpu_config.tpu_job_name
  if explicit_job_name:
    return explicit_job_name

  # The TPU job is determined by the run_config. Right now this method is
  # required because tpu_config is not part of the RunConfig.
  if self._assert_mode() == model_fn_lib.ModeKeys.EVAL:
    master = config.evaluation_master
  else:
    master = config.master

  session_config = config.session_config
  cluster_def = session_config.cluster_def if session_config else None

  try:
    return tpu_system_metadata_lib.master_job(master, cluster_def)
  except ValueError as e:
    # Re-raise with the original message plus an actionable hint.
    raise ValueError(
        str(e) + ' Please specify a tpu_job_name as part of '
        'your TPUConfig.')
def master_job(self):
  """Returns the job name to use to place TPU computations on.

  When the TPUConfig carries a non-empty `tpu_job_name`, that value is
  returned as is. Otherwise the lookup is delegated to
  `tpu_system_metadata_lib.master_job`, using the evaluation master in EVAL
  mode (the regular master otherwise) and the session config's cluster
  definition when a session config is present.

  Returns:
    A string containing the job name, or None if no job should be specified.

  Raises:
    ValueError: If the user needs to specify a tpu_job_name, because we are
      unable to infer the job name automatically, or if the user-specified
      job names are inappropriate. Nothing is caught here, so any error from
      the lookup propagates unchanged.
  """
  tpu_job_name = self._config.tpu_config.tpu_job_name
  if tpu_job_name:
    # If the user specifies the tpu_job_name, use that.
    return tpu_job_name

  # The TPU job is determined by the run_config. Right now this method is
  # required because tpu_config is not part of the RunConfig.
  run_config = self._config
  in_eval = self._assert_mode() == model_fn_lib.ModeKeys.EVAL
  if in_eval:
    master = run_config.evaluation_master
  else:
    master = run_config.master

  cluster_def = None
  session_config = run_config.session_config
  if session_config:
    cluster_def = session_config.cluster_def

  return tpu_system_metadata_lib.master_job(master, cluster_def)
def __init__(self,
             table_to_config_dict,
             feature_to_table_dict,
             batch_size,
             mode,
             master,
             optimization_parameters=None,
             cluster_def=None,
             pipeline_execution_with_tensor_core=False):
  """API for using TPU for embedding lookups.

  Validates the table and feature configuration, queries the TPU system
  metadata for `master`, derives host and core counts from it, checks the
  batch size against the core count, resolves the optimization parameters
  for `mode` and finally builds the configuration proto.

  Args:
    table_to_config_dict: A dictionary mapping from string of table name to
      `TableConfig`. Table refers to an embedding table, e.g. `params`
      argument to `tf.nn.embedding_lookup_sparse()`.
    feature_to_table_dict: A dictionary mapping from string of feature name
      to string of table name. Feature refers to ids to lookup in embedding
      table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
    batch_size: An `int` representing the global batch size.
    mode: `TRAINING` or `INFERENCE`.
    master: A `string` representing the TensorFlow master to use.
    optimization_parameters: `AdagradParameters`, `AdamParameters`,
      `StochasticGradientDescentParameters`. Must be set in training and must
      be `None` in inference.
    cluster_def: A ClusterDef object describing the TPU cluster.
    pipeline_execution_with_tensor_core: setting this to `True` makes training
      faster, but trained model will be different if step N and step N+1
      involve the same set of embedding IDs. Please see
      `tpu_embedding_configuration.proto` for details.

  Raises:
    ValueError: if any input is invalid.
  """
  _validate_table_to_config_dict(table_to_config_dict)
  # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
  self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)

  _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
  self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
  self._table_to_features_dict = _create_table_to_features_dict(
      self._feature_to_table_dict)
  self._combiners = _create_combiners(self._table_to_config_dict,
                                      self._table_to_features_dict)

  self._batch_size = batch_size
  self._master = master
  self._cluster_def = cluster_def

  metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata(  # pylint: disable=protected-access
          master, cluster_def=cluster_def))
  self._tpu_system_metadata = metadata
  if metadata.num_cores == 0:
    raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                     'TPUs.'.format(master))
  self._num_hosts = metadata.num_hosts

  # Hosts are the CPU devices; when a master job name is known, only the
  # devices whose name contains it are kept.
  job_name = tpu_system_metadata_lib.master_job(master, cluster_def)
  host_names = []
  for device in metadata.devices:
    if 'device:CPU:' not in device.name:
      continue
    if job_name is None or job_name in device.name:
      host_names.append(device.name)
  host_names.sort()
  self._hosts = host_names

  self._num_cores_per_host = metadata.num_of_cores_per_host
  self._num_cores = metadata.num_cores
  _validate_batch_size(batch_size, self._num_cores)
  self._batch_size_per_core = batch_size // self._num_cores

  # TODO(shizhiw): remove `mode`?
  is_training = (mode == TRAINING)
  if not (is_training or mode == INFERENCE):
    raise ValueError('`mode` only supports {} and {}; got {}.'
                     .format(TRAINING, INFERENCE, mode))
  if is_training:
    _validate_optimization_parameters(optimization_parameters)
    self._optimization_parameters = optimization_parameters
  else:
    if optimization_parameters is not None:
      raise ValueError('`optimization_parameters` should be `None` '
                       'for inference mode.')
    # Inference still gets an optimization-parameters object: SGD
    # parameters constructed with 1.
    self._optimization_parameters = StochasticGradientDescentParameters(1.)
  self._mode = mode

  # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
  # and create special handler for inference that inherits from
  # StochasticGradientDescentHandler with more user-friendly error message
  # on get_slot().
  self._optimizer_handler = _get_optimization_handler(
      self._optimization_parameters)
  self._pipeline_execution_with_tensor_core = (
      pipeline_execution_with_tensor_core)

  self._config_proto = self._create_config_proto()
def __init__(self,
             table_to_config_dict,
             feature_to_table_dict,
             batch_size,
             mode,
             master,
             optimization_parameters=None,
             cluster_def=None,
             pipeline_execution_with_tensor_core=False):
  """API for using TPU for embedding lookups.

  The constructor works in this order: validate and order the table/feature
  dictionaries, query TPU system metadata for `master`, collect the host
  devices and core counts, validate the batch size, pick the optimization
  parameters that match `mode`, and create the configuration proto.

  Args:
    table_to_config_dict: A dictionary mapping from string of table name to
      `TableConfig`. Table refers to an embedding table, e.g. `params`
      argument to `tf.nn.embedding_lookup_sparse()`.
    feature_to_table_dict: A dictionary mapping from string of feature name
      to string of table name. Feature refers to ids to lookup in embedding
      table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
    batch_size: An `int` representing the global batch size.
    mode: `TRAINING` or `INFERENCE`.
    master: A `string` representing the TensorFlow master to use.
    optimization_parameters: `AdagradParameters`, `AdamParameters`,
      `StochasticGradientDescentParameters`. Must be set in training and must
      be `None` in inference.
    cluster_def: A ClusterDef object describing the TPU cluster.
    pipeline_execution_with_tensor_core: setting this to `True` makes training
      faster, but trained model will be different if step N and step N+1
      involve the same set of embedding IDs. Please see
      `tpu_embedding_configuration.proto` for details.

  Raises:
    ValueError: if any input is invalid.
  """
  _validate_table_to_config_dict(table_to_config_dict)
  # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
  ordered_tables = _create_ordered_dict(table_to_config_dict)
  self._table_to_config_dict = ordered_tables

  _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
  ordered_features = _create_ordered_dict(feature_to_table_dict)
  self._feature_to_table_dict = ordered_features
  table_to_features = _create_table_to_features_dict(ordered_features)
  self._table_to_features_dict = table_to_features
  self._combiners = _create_combiners(ordered_tables, table_to_features)

  self._batch_size = batch_size
  self._master = master
  self._cluster_def = cluster_def

  query_metadata = (
      tpu_system_metadata_lib._query_tpu_system_metadata)  # pylint: disable=protected-access
  system_metadata = query_metadata(master, cluster_def=cluster_def)
  self._tpu_system_metadata = system_metadata

  core_count = system_metadata.num_cores
  if core_count == 0:
    raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
                     'TPUs.'.format(master))
  self._num_hosts = system_metadata.num_hosts

  master_job_name = tpu_system_metadata_lib.master_job(master, cluster_def)

  def _matches_master_job(device_name):
    # With no known job name every device qualifies; otherwise the job name
    # has to occur somewhere in the device name.
    return master_job_name is None or master_job_name in device_name

  self._hosts = sorted(
      device.name
      for device in system_metadata.devices
      if 'device:CPU:' in device.name and _matches_master_job(device.name))

  self._num_cores_per_host = system_metadata.num_of_cores_per_host
  self._num_cores = system_metadata.num_cores
  _validate_batch_size(batch_size, self._num_cores)
  self._batch_size_per_core = batch_size // self._num_cores

  # TODO(shizhiw): remove `mode`?
  if mode == TRAINING:
    _validate_optimization_parameters(optimization_parameters)
    effective_parameters = optimization_parameters
  elif mode == INFERENCE:
    if optimization_parameters is not None:
      raise ValueError('`optimization_parameters` should be `None` '
                       'for inference mode.')
    # Inference uses SGD parameters constructed with 1. in place of
    # user-supplied ones.
    effective_parameters = StochasticGradientDescentParameters(1.)
  else:
    raise ValueError('`mode` only supports {} and {}; got {}.'
                     .format(TRAINING, INFERENCE, mode))
  self._optimization_parameters = effective_parameters
  self._mode = mode

  # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
  # and create special handler for inference that inherits from
  # StochasticGradientDescentHandler with more user-friendly error message
  # on get_slot().
  self._optimizer_handler = _get_optimization_handler(effective_parameters)
  self._pipeline_execution_with_tensor_core = (
      pipeline_execution_with_tensor_core)

  self._config_proto = self._create_config_proto()