def wrapped_model_fn(model_fn, run_config):
    """Builds a replacement `model_fn` that adds the TPU training path.

    The signature of `model_fn` is validated once, up front. The returned
    function forwards to `model_fn` untouched for every mode other than
    TRAIN. For TRAIN it creates the infeed enqueue/dequeue pair, runs the
    converted train step through `_train_on_tpu_shards`, and returns an
    `EstimatorSpec` whose `train_op` groups a numerics-checked read of
    every trainable variable.

    Args:
        model_fn: The user model function, called as
            `model_fn(features, labels, mode)` for non-TRAIN modes.
        run_config: Run configuration handed to the infeed builder, the
            shard training helper, the train-step converter and the
            infeed session hook.

    Returns:
        A function taking `(features, labels, mode)`.
    """
    # Verifies the model_fn signature according to Estimator framework.
    estimator_lib._verify_model_fn_args(model_fn, params=None)  # pylint: disable=protected-access

    def _model_fn(features, labels, mode):
        """Dispatches to the TPU training path or to the plain model_fn."""
        # TODO(jhseu): Move to EVAL and PREDICT to TPU.
        if mode != model_fn_lib.ModeKeys.TRAIN:
            return model_fn(features, labels, mode)

        infeed_fns = _create_infeed_enqueue_ops_and_dequeue_fn(
            run_config, features, labels)
        dequeue_fn, enqueue_fn = infeed_fns

        train_step = _convert_model_fn_to_train_step(
            model_fn, dequeue_fn, mode, run_config)
        loss = _train_on_tpu_shards(run_config, train_step=train_step)

        # Gets the variables back from TPU nodes. This means the variables
        # updated by TPU will now be *synced* to host memory.
        update_ops = []
        for var in variables.trainable_variables():
            checked_value = array_ops.check_numerics(
                var.read_value(), 'Gradient for %s is NaN' % var.name)
            update_ops.append(checked_value.op)

        # Hooks are created in the same order as before: infeed first, then
        # the periodic loss/step logger.
        infeed_hook = TpuInfeedSessionHook(run_config, enqueue_fn)
        logged_tensors = {
            'loss': array_ops.identity(loss),
            'step': training.get_global_step(),
        }
        logging_hook = training.LoggingTensorHook(
            logged_tensors, every_n_secs=30)

        reported_loss = array_ops.identity(loss)
        train_op = control_flow_ops.group(*update_ops)
        return model_fn_lib.EstimatorSpec(
            mode,
            loss=reported_loss,
            training_hooks=[infeed_hook, logging_hook],
            train_op=train_op)

    return _model_fn
def __init__(self,
             model_fn=None,
             model_dir=None,
             config=None,
             params=None,
             use_tpu=True):
    """Constructs a `TpuEstimator`.

    Args:
        model_fn: Model function passed on to the parent constructor. When
            `use_tpu` is truthy its signature is verified and it is wrapped
            by `wrapped_model_fn` first.
        model_dir: Forwarded unchanged to the parent constructor.
        config: A `tpu_config.RunConfig` instance. Required.
        params: Optional hyper parameters, forwarded to the parent
            constructor. If it contains `_BATCH_SIZE_KEY` and `use_tpu` is
            truthy, that value must be an int divisible by
            `config.tpu_config.num_shards`.
        use_tpu: Whether to wrap `model_fn` with TPU support. Stored as
            `self.use_tpu`.

    Raises:
        ValueError: If `config` is missing or not a `tpu_config.RunConfig`,
            or if the batch size in `params` is not an int or is not
            divisible by the number of shards.
    """
    if config is None or not isinstance(config, tpu_config.RunConfig):
        raise ValueError(
            '`config` must be provided with type `tpu_config.RunConfig`')

    if use_tpu and params is not None and _BATCH_SIZE_KEY in params:
        if not isinstance(params[_BATCH_SIZE_KEY], int):
            raise ValueError(
                '`{}` in params must be an int'.format(_BATCH_SIZE_KEY))
        # From here on a deep copy is used, so the object passed to the
        # parent constructor is not the caller's own.
        params = copy.deepcopy(params)
        # The specified batch size is the batch size for the entire
        # computation. Only divisibility by the shard count is validated
        # here; no per-shard value is computed in this constructor.
        if params[_BATCH_SIZE_KEY] % config.tpu_config.num_shards != 0:
            raise ValueError(
                'batch size {} must be divisible by number of shards {}'
                .format(params[_BATCH_SIZE_KEY], config.tpu_config.num_shards))

    if use_tpu:
        # `config` was already type-checked above, so the former second
        # isinstance check here could never fire and has been dropped.
        # Verifies the model_fn signature according to Estimator framework.
        estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
        # We cannot store config and params in this constructor as parent
        # constructor might change them, such as assigning a temp dir for
        # config.model_dir.
        # NOTE(review): the `wrapped_model_fn` visible in this file takes
        # `(model_fn, run_config)`; confirm this one-argument call matches
        # the definition that is actually in scope.
        model_function = wrapped_model_fn(model_fn)
    else:
        model_function = model_fn

    super(TpuEstimator, self).__init__(
        model_fn=model_function,
        model_dir=model_dir,
        config=config,
        params=params)
    self.use_tpu = use_tpu
def __init__(self,
             model_fn=None,
             model_dir=None,
             config=None,
             params=None,
             use_tpu=True):
    """Constructs a `TpuEstimator`.

    Args:
        model_fn: Model function given to the parent constructor. With a
            truthy `use_tpu` it is signature-checked and wrapped by
            `wrapped_model_fn` before being passed on.
        model_dir: Passed through to the parent constructor.
        config: Must be a `tpu_config.RunConfig` instance; `None` is
            rejected.
        params: Optional hyper parameters passed through to the parent
            constructor. When `use_tpu` is truthy and `_BATCH_SIZE_KEY` is
            present, its value must be an int that
            `config.tpu_config.num_shards` divides evenly.
        use_tpu: Selects the TPU-wrapped model function; kept on the
            instance as `self.use_tpu`.

    Raises:
        ValueError: On a missing or wrongly typed `config`, a non-int batch
            size, or a batch size not divisible by the number of shards.
    """
    if config is None or not isinstance(config, tpu_config.RunConfig):
        raise ValueError(
            '`config` must be provided with type `tpu_config.RunConfig`')

    if use_tpu and params is not None and _BATCH_SIZE_KEY in params:
        if not isinstance(params[_BATCH_SIZE_KEY], int):
            raise ValueError(
                '`{}` in params must be an int'.format(_BATCH_SIZE_KEY))
        # Continue with a deep copy rather than the caller's object.
        params = copy.deepcopy(params)
        # The specified batch size is the batch size for the entire
        # computation. This constructor only checks that it splits evenly
        # across shards; it does not compute the per-shard value.
        if params[_BATCH_SIZE_KEY] % config.tpu_config.num_shards != 0:
            raise ValueError(
                'batch size {} must be divisible by number of shards {}'
                .format(params[_BATCH_SIZE_KEY], config.tpu_config.num_shards))

    if use_tpu:
        # The type of `config` is guaranteed by the check at the top of this
        # constructor; the duplicate check that used to sit here was
        # unreachable and has been removed.
        # Verifies the model_fn signature according to Estimator framework.
        estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
        # We cannot store config and params in this constructor as parent
        # constructor might change them, such as assigning a temp dir for
        # config.model_dir.
        # NOTE(review): `wrapped_model_fn` as defined in this file expects
        # `(model_fn, run_config)`; verify that a one-argument call is valid
        # for the definition in scope here.
        model_function = wrapped_model_fn(model_fn)
    else:
        model_function = model_fn

    super(TpuEstimator, self).__init__(
        model_fn=model_function,
        model_dir=model_dir,
        config=config,
        params=params)
    self.use_tpu = use_tpu
def __init__(self, model_fn, model_dir=None, config=None, params=None):
    """Initialises configuration, model directory, session and model_fn.

    Args:
        model_fn: Model function; its arguments are verified against
            `params` before it is stored.
        model_dir: Optional model directory. If the run config also carries
            a model directory, the two must be equal.
        config: Optional `RunConfig`. A default `RunConfig()` is created
            when omitted.
        params: Optional hyper parameters; stored as `{}` when falsy.

    Raises:
        ValueError: If `config` is not a `RunConfig`, or if `model_dir` and
            the config's model directory are both set but differ.
    """
    # Create a run configuration.
    if config is not None:
        if not isinstance(config, RunConfig):
            raise ValueError("config must be an instance of RunConfig, "
                             "received {}.".format(config))
        self._config = config
    else:
        self._config = RunConfig()
        logging.info("Using default config.")

    # A directory given in both places has to agree.
    both_set = (model_dir is not None) and (
        self._config.model_dir is not None)
    if both_set and model_dir != self._config.model_dir:
        raise ValueError(
            "model_dir are set both in constructor and RunConfig, but with "
            "different values. In constructor: '{}', in RunConfig: "
            "'{}' ".format(model_dir, self._config.model_dir))

    # Precedence: explicit argument, then the config, then a generated one.
    self._model_dir = (
        model_dir or self._config.model_dir or generate_model_dir())
    if self._config.model_dir is None:
        self._config = self._config.replace(model_dir=self._model_dir)
    logging.info("Using config: {}".format(vars(self._config)))

    session_config = self._config.session_config
    if session_config is None:
        session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    self._session_config = session_config

    # Set device function depending if there are replicas or not.
    # pylint: disable=protected-access
    self._device_fn = tf_estimator._get_replica_device_setter(self._config)
    tf_estimator._verify_model_fn_args(model_fn, params)
    # pylint: disable=protected-access

    self._model_fn = model_fn
    self._params = params or {}
def __init__(self,
             model_fn=None,
             model_dir=None,
             config=None,
             params=None,
             use_tpu=True,
             train_batch_size=None):
    """Constructs a `TPUEstimator` instance.

    Args:
        model_fn: Model function as required by `Estimator`. For training,
            the returned `EstimatorSpec` cannot have hooks as it is not
            supported in `TPUEstimator`.
        model_dir: Directory to save model parameters, graph and etc. This
            can also be used to load checkpoints from the directory into a
            estimator to continue training a previously saved model. If
            `None`, the model_dir in `config` will be used if set. If both
            are set, they must be same. If both are `None`, a temporary
            directory will be used.
        config: A `tpu_config.RunConfig` configuration object. Cannot be
            `None`.
        params: An optional `dict` of hyper parameters that will be passed
            into `input_fn` and `model_fn`. Keys are names of parameters,
            values are basic python types. There are reserved keys for
            `TPUEstimator`, including 'batch_size'.
        use_tpu: A bool indicating whether TPU support is enabled.
            Currently, only applied to training. Evaluate and predict still
            happen on CPU.
        train_batch_size: An int representing the global training batch
            size. TPUEstimator transforms this global batch size to a
            per-shard batch size, as params['batch_size'], when calling
            `input_fn` and `model_fn`. Cannot be `None` if `use_tpu` is
            `True`. Must be divisible by `config.tpu_config.num_shards`.

    Raises:
        ValueError: `config` is missing or of the wrong type, `params` has
            reserved keys already, or (with `use_tpu`) `train_batch_size`
            is `None`, not an int, not positive, or not divisible by the
            number of shards.
    """
    if config is None or not isinstance(config, tpu_config.RunConfig):
        raise ValueError(
            '`config` must be provided with type `tpu_config.RunConfig`')

    if params is not None:
        for reserved_key in _RESERVED_PARAMS_KEYS:
            if reserved_key in params:
                raise ValueError(
                    '{} are reserved keys but existed in params {}.'.format(
                        _RESERVED_PARAMS_KEYS, params))

    if use_tpu:
        if train_batch_size is None:
            raise ValueError('`train_batch_size` cannot be `None`')
        if not isinstance(train_batch_size, int):
            raise ValueError('`train_batch_size` must be an int')
        if train_batch_size < 1:
            raise ValueError('`train_batch_size` must be positive')

        # The specified batch size is the batch size for the entire
        # computation. The input_fn and model_fn are called per-shard, so
        # the global size has to split evenly across shards.
        num_shards = config.tpu_config.num_shards
        if train_batch_size % num_shards != 0:
            raise ValueError(
                'batch size {} must be divisible by number of shards {}'
                .format(train_batch_size, num_shards))

        # Verifies the model_fn signature according to Estimator framework.
        estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
        # We cannot store config and params in this constructor as parent
        # constructor might change them, such as assigning a temp dir for
        # config.model_dir.
        model_function = augment_model_fn_with_tpu_support(
            model_fn, train_batch_size)
    else:
        model_function = model_fn

    super(TPUEstimator, self).__init__(
        model_fn=model_function,
        model_dir=model_dir,
        config=config,
        params=params)
    self._use_tpu = use_tpu
    self._train_batch_size = train_batch_size
def __init__(self,
             model_fn=None,
             model_dir=None,
             config=None,
             params=None,
             job_start_file='',
             warm_start_from=None):
    """Constructs an `NPUEstimator` instance.

    Args:
        model_fn: Model function as required by `Estimator` which returns
            EstimatorSpec. `training_hooks`, `evaluation_hooks`, and
            `prediction_hooks` must not capture any NPU Tensor inside the
            model_fn.
        model_dir: Model directory; reconciled with `config` by the
            model-dir check before being passed to the parent constructor.
        config: An `NPURunConfig` configuration object. Cannot be `None`.
        params: An optional `dict` of hyper parameters that will be passed
            into `input_fn` and `model_fn`. Keys are names of parameters,
            values are basic python types. A falsy value is replaced by
            `{}` before the parent constructor is called.
        job_start_file: The path of the job start file, handed to the
            job-info loader.
        warm_start_from: Optional string filepath to a checkpoint or
            SavedModel to warm-start from, or a
            `tf.estimator.WarmStartSettings` object to fully configure
            warm-starting. If the string filepath is provided instead of a
            `tf.estimator.WarmStartSettings`, then all variables are
            warm-started, and it is assumed that vocabularies and
            `tf.Tensor` names are unchanged. When falsy, the job info's
            local checkpoint directory is used, falling back to the
            `RESTORE_FROM` environment variable.

    Raises:
        ValueError: If `config` is `None` or not an `NPURunConfig`, or if
            loading the job info fails.
    """
    logging.info("NPUEstimator init...")

    if config is None or not isinstance(config, NPURunConfig):
        # The message names the type that is actually checked
        # (previously it said `NPUConfigs`).
        raise ValueError(
            '`config` must be provided with type `NPURunConfig`')

    # Verifies the model_fn signature according to Estimator framework.
    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access

    # Load the graph optimizers.
    config = self.__load_graph_optimizers(config)

    # Init npu system: get task and device info from configuration file.
    if not self.__load_job_info(job_start_file):
        raise ValueError(
            'Load job info failed, '
            'please check whether `JOB_ID` is set in environment variable')

    # Check model dir in NPUEstimator and NPURunConfig.
    model_dir = self.__check_model_dir(model_dir, config)

    # Wrap model_fn to add NPU session hooks.
    model_function = self.__augment_model_fn(model_fn, model_dir, config)

    # Get the checkpoint file.
    if not warm_start_from:
        restore_from = self.__job_info._local_checkpoint_dir
        # tf use restore_from variable, no need to check safety.
        if restore_from is None or restore_from == "":
            restore_from = os.getenv('RESTORE_FROM')
    else:
        restore_from = warm_start_from

    # Passing non-None params as wrapped model_fn use it.
    params = params or {}
    with no_check_override():
        super(NPUEstimator, self).__init__(
            model_fn=model_function,
            model_dir=model_dir,
            config=config,
            params=params,
            warm_start_from=restore_from)
def __init__(self,
             model_fn=None,
             model_dir=None,
             config=None,
             params=None,
             use_tpu=True,
             train_batch_size=None):
    """Constructs a `TPUEstimator` instance.

    Args:
        model_fn: Model function as required by `Estimator`. For training,
            the returned `EstimatorSpec` cannot have hooks as it is not
            supported in `TPUEstimator`.
        model_dir: Directory to save model parameters, graph and etc. This
            can also be used to load checkpoints from the directory into a
            estimator to continue training a previously saved model. If
            `None`, the model_dir in `config` will be used if set. If both
            are set, they must be same. If both are `None`, a temporary
            directory will be used.
        config: A `tpu_config.RunConfig` configuration object. Cannot be
            `None`.
        params: An optional `dict` of hyper parameters that will be passed
            into `input_fn` and `model_fn`. Keys are names of parameters,
            values are basic python types. There are reserved keys for
            `TPUEstimator`, including 'batch_size'.
        use_tpu: A bool indicating whether TPU support is enabled.
            Currently, only applied to training. Evaluate and predict still
            happen on CPU.
        train_batch_size: An int representing the global training batch
            size. TPUEstimator transforms this global batch size to a
            per-shard batch size, as params['batch_size'], when calling
            `input_fn` and `model_fn`. Cannot be `None` if `use_tpu` is
            `True`. Must be divisible by `config.tpu_config.num_shards`.

    Raises:
        ValueError: `config` is absent or not a `tpu_config.RunConfig`,
            `params` has reserved keys already, or `use_tpu` is set and
            `train_batch_size` is `None`, not an int, below 1, or not
            divisible by the number of shards.
    """
    if config is None or not isinstance(config, tpu_config.RunConfig):
        raise ValueError(
            '`config` must be provided with type `tpu_config.RunConfig`')

    has_reserved_key = params is not None and any(
        k in params for k in _RESERVED_PARAMS_KEYS)
    if has_reserved_key:
        raise ValueError(
            '{} are reserved keys but existed in params {}.'.format(
                _RESERVED_PARAMS_KEYS, params))

    if not use_tpu:
        # Without TPU support the user's model_fn is used as-is and the
        # batch size is not validated.
        model_function = model_fn
    else:
        if train_batch_size is None:
            raise ValueError('`train_batch_size` cannot be `None`')
        if not isinstance(train_batch_size, int):
            raise ValueError('`train_batch_size` must be an int')
        if train_batch_size < 1:
            raise ValueError('`train_batch_size` must be positive')

        # The specified batch size is the batch size for the entire
        # computation. The input_fn and model_fn are called per-shard, so
        # it must divide evenly by the shard count.
        shard_count = config.tpu_config.num_shards
        if train_batch_size % shard_count != 0:
            raise ValueError(
                'batch size {} must be divisible by number of shards {}'
                .format(train_batch_size, shard_count))

        # Verifies the model_fn signature according to Estimator framework.
        estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
        # We cannot store config and params in this constructor as parent
        # constructor might change them, such as assigning a temp dir for
        # config.model_dir.
        model_function = augment_model_fn_with_tpu_support(
            model_fn, train_batch_size)

    super(TPUEstimator, self).__init__(
        model_fn=model_function,
        model_dir=model_dir,
        config=config,
        params=params)
    self._use_tpu = use_tpu
    self._train_batch_size = train_batch_size