Exemple #1
0
def _refresh_model_dir_and_session_config(config, model_dir):
    """Overwrite estimator config by `model_dir` and `session_config` if needed.

  Args:
    config: Original estimator config.
    model_dir: Estimator model checkpoint directory.

  Returns:
    Overwritten estimator config.

  Raises:
    ValueError: Model directory inconsistent between `model_dir` and `config`.
  """

    if config is None or not isinstance(config, NPURunConfig):
        raise ValueError(
            'config must be an instance of `NPURunConfig`, but provided %s.' %
            config)

    if config.session_config is None:
        session_config = run_config.get_default_session_config()
        config = NPURunConfig.replace(config, session_config=session_config)

    model_dir = compat_internal.path_to_str(model_dir)
    if model_dir is not None:
        if (getattr(config, 'model_dir', None) is not None
                and config.model_dir != model_dir):
            raise ValueError(
                "`model_dir` are set both in constructor and `NPURunConfig`, but with "
                "different values. In constructor: '{}', in `NPURunConfig`: "
                "'{}' ".format(model_dir, config.model_dir))
    if model_dir:
        config = NPURunConfig.replace(config, model_dir=model_dir)
    elif getattr(config, 'model_dir', None) is None:
        model_dir = tempfile.mkdtemp()
        logging.warning('Using temporary folder as model directory: %s',
                        model_dir)
        config = NPURunConfig.replace(config, model_dir=model_dir)

    return config
Exemple #2
0
    def __init__(self,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=100,
                 save_checkpoints_steps=_USE_DEFAULT,
                 save_checkpoints_secs=_USE_DEFAULT,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 distribute=None):
        """Constructs a RunConfig.

    All distributed training related properties `cluster_spec`, `is_chief`,
    `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and
    `task_type` are set based on the `TF_CONFIG` environment variable, if the
    pertinent information is present. The `TF_CONFIG` environment variable is a
    JSON object with attributes: `cluster` and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to
    a list of task addresses.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. ` When `TF_CONFIG` contains said information,
    the following properties are set on this class:

    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
      present, must have one and only one node in the `chief` attribute of
      `cluster_spec`.
    * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if
      `cluster_spec` is present; must be `worker` (the default value) if
      `cluster_spec` is not set.
    * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if
      `cluster_spec` is present; must be 0 (the default value) if
      `cluster_spec` is not set.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is set by counting the number of nodes listed
      in the `ps` attribute of `cluster_spec`. Defaults to 0.
    * `num_worker_replicas` is set by counting the number of nodes listed
      in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1.
    * `is_chief` is determined based on `task_type` and `cluster`.

    There is a special node with `task_type` as `evaluator`, which is not part
    of the (training) `cluster_spec`. It handles the distributed evaluation job.

    Example of non-chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = RunConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Example of chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'chief', 'index': 0}})
      config = RunConfig()
      assert config.master == 'host0:2222'
      assert config.task_id == 0
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'chief'
      assert config.is_chief
    ```

    Example of evaluator node (evaluator is not part of training cluster):
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'evaluator', 'index': 0}})
      config = RunConfig()
      assert config.master == ''
      assert config.evaluator_master == ''
      assert config.task_id == 0
      assert config.num_ps_replicas == 0
      assert config.num_worker_replicas == 0
      assert config.cluster_spec == {}
      assert config.task_type == 'evaluator'
      assert not config.is_chief
    ```

    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
    distributed training. For example, setting `save_checkpoints_secs` as 60
    without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation
    that checkpoint would be garbage collected after 5 minutes. In distributed
    training, the evaluation job starts asynchronously and might fail to load or
    find the checkpoint due to race condition.

    Args:
      model_dir: directory where model parameters, graph, etc are saved. If
        `PathLike` object, the path will be resolved. If `None`, will use a
        default value set by the Estimator.
      tf_random_seed: Random seed for TensorFlow initializers.
        Setting this value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
          specified with `save_checkpoints_secs`.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
          be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
          both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
          in constructor.  If both `save_checkpoints_steps` and
          `save_checkpoints_secs` are None, then checkpoints are disabled.
      session_config: a ConfigProto used to set session parameters, or None.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If None or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step/sec and the loss will be logged during training.
      distribute: an optional instance of
        `tf.contrib.distribute.DistributionStrategy`. If specified,
        then Estimator will distribute the user's model according to the policy
        specified by that strategy.

    Raises:
      ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
      are set.
    """
        if (save_checkpoints_steps == _USE_DEFAULT
                and save_checkpoints_secs == _USE_DEFAULT):
            save_checkpoints_steps = None
            save_checkpoints_secs = 600
        elif save_checkpoints_secs == _USE_DEFAULT:
            save_checkpoints_secs = None
        elif save_checkpoints_steps == _USE_DEFAULT:
            save_checkpoints_steps = None
        elif (save_checkpoints_steps is not None
              and save_checkpoints_secs is not None):
            raise ValueError(_SAVE_CKPT_ERR)

        tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
        if tf_config:
            logging.info('TF_CONFIG environment variable: %s', tf_config)

        model_dir = _get_model_dir(tf_config,
                                   compat_internal.path_to_str(model_dir))

        RunConfig._replace(
            self,
            allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            distribute=distribute)

        self._init_distributed_setting_from_environment_var(tf_config)
Exemple #3
0
  def __init__(self,
               model_dir=None,
               tf_random_seed=None,
               save_summary_steps=100,
               save_checkpoints_steps=_USE_DEFAULT,
               save_checkpoints_secs=_USE_DEFAULT,
               session_config=None,
               keep_checkpoint_max=5,
               keep_checkpoint_every_n_hours=10000,
               log_step_count_steps=100,
               train_distribute=None,
               device_fn=None):
    """Constructs a RunConfig.

    All distributed training related properties `cluster_spec`, `is_chief`,
    `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and
    `task_type` are set based on the `TF_CONFIG` environment variable, if the
    pertinent information is present. The `TF_CONFIG` environment variable is a
    JSON object with attributes: `cluster` and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to
    a list of task addresses.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. When `TF_CONFIG` contains said information,
    the following properties are set on this class:

    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
      present, must have one and only one node in the `chief` attribute of
      `cluster_spec`.
    * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if
      `cluster_spec` is present; must be `worker` (the default value) if
      `cluster_spec` is not set.
    * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if
      `cluster_spec` is present; must be 0 (the default value) if
      `cluster_spec` is not set.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is set by counting the number of nodes listed
      in the `ps` attribute of `cluster_spec`. Defaults to 0.
    * `num_worker_replicas` is set by counting the number of nodes listed
      in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1.
    * `is_chief` is determined based on `task_type` and `cluster`.

    There is a special node with `task_type` as `evaluator`, which is not part
    of the (training) `cluster_spec`. It handles the distributed evaluation job.

    Example of non-chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = RunConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Example of chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'chief', 'index': 0}})
      config = RunConfig()
      assert config.master == 'host0:2222'
      assert config.task_id == 0
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'chief'
      assert config.is_chief
    ```

    Example of evaluator node (evaluator is not part of training cluster):
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'evaluator', 'index': 0}})
      config = RunConfig()
      assert config.master == ''
      assert config.evaluator_master == ''
      assert config.task_id == 0
      assert config.num_ps_replicas == 0
      assert config.num_worker_replicas == 0
      assert config.cluster_spec == {}
      assert config.task_type == 'evaluator'
      assert not config.is_chief
    ```

    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
    distributed training. For example, setting `save_checkpoints_secs` as 60
    without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation
    that checkpoint would be garbage collected after 5 minutes. In distributed
    training, the evaluation job starts asynchronously and might fail to load or
    find the checkpoint due to race condition.

    Args:
      model_dir: directory where model parameters, graph, etc are saved. If
        `PathLike` object, the path will be resolved. If `None`, will use a
        default value set by the Estimator.
      tf_random_seed: Random seed for TensorFlow initializers.
        Setting this value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
          specified with `save_checkpoints_secs`.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
          be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
          both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
          in constructor.  If both `save_checkpoints_steps` and
          `save_checkpoints_secs` are None, then checkpoints are disabled.
      session_config: a ConfigProto used to set session parameters, or None.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If None or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step/sec and the loss will be logged during training.
      train_distribute: an optional instance of
        `tf.contrib.distribute.DistributionStrategy`. If specified,
        then Estimator will distribute the user's model during training,
        according to the policy specified by that strategy.
      device_fn: A callable invoked for every `Operation` that takes the
        `Operation` and returns the device string. If `None`, defaults to
        the device function returned by `tf.train.replica_device_setter`
        with round-robin strategy.

    Raises:
      ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
      are set.
    """
    if (save_checkpoints_steps == _USE_DEFAULT and
        save_checkpoints_secs == _USE_DEFAULT):
      save_checkpoints_steps = None
      save_checkpoints_secs = 600
    elif save_checkpoints_secs == _USE_DEFAULT:
      save_checkpoints_secs = None
    elif save_checkpoints_steps == _USE_DEFAULT:
      save_checkpoints_steps = None
    elif (save_checkpoints_steps is not None and
          save_checkpoints_secs is not None):
      raise ValueError(_SAVE_CKPT_ERR)

    tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
    if tf_config:
      logging.info('TF_CONFIG environment variable: %s', tf_config)

    model_dir = _get_model_dir(tf_config,
                               compat_internal.path_to_str(model_dir))

    RunConfig._replace(
        self,
        allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
        model_dir=model_dir,
        tf_random_seed=tf_random_seed,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=save_checkpoints_secs,
        session_config=session_config,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        log_step_count_steps=log_step_count_steps,
        train_distribute=train_distribute,
        device_fn=device_fn)

    self._init_distributed_setting_from_environment_var(tf_config)
  def __init__(self, model_fn, model_dir=None, config=None, params=None,
               warm_start_from=None):
    """Constructs an `Estimator` instance.

    See @{$estimators} for more information. To warm-start an `Estimator`:

    ```python
    estimator = tf.estimator.DNNClassifier(
        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
        hidden_units=[1024, 512, 256],
        warm_start_from="/path/to/checkpoint/dir")
    ```

    For more details on warm-start configuration, see
    @{tf.estimator.WarmStartSettings$WarmStartSettings}.

    Args:
      model_fn: Model function. Follows the signature:

        * Args:

          * `features`: This is the first item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same.
          * `labels`: This is the second item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same (for multi-head models). If
                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
                 the `model_fn`'s signature does not accept `mode`, the
                 `model_fn` must still be able to handle `labels=None`.
          * `mode`: Optional. Specifies if this training, evaluation or
                 prediction. See `ModeKeys`.
          * `params`: Optional `dict` of hyperparameters.  Will receive what
                 is passed to Estimator in `params` parameter. This allows
                 to configure Estimators from hyper parameter tuning.
          * `config`: Optional configuration object. Will receive what is passed
                 to Estimator in `config` parameter, or the default `config`.
                 Allows updating things in your model_fn based on configuration
                 such as `num_ps_replicas`, or `model_dir`.

        * Returns:
          `EstimatorSpec`

      model_dir: Directory to save model parameters, graph and etc. This can
        also be used to load checkpoints from the directory into a estimator to
        continue training a previously saved model. If `PathLike` object, the
        path will be resolved. If `None`, the model_dir in `config` will be used
        if set. If both are set, they must be same. If both are `None`, a
        temporary directory will be used.
      config: Configuration object.
      params: `dict` of hyper parameters that will be passed into `model_fn`.
              Keys are names of parameters, values are basic python types.
      warm_start_from: Optional string filepath to a checkpoint to warm-start
                       from, or a `tf.estimator.WarmStartSettings` object to
                       fully configure warm-starting.  If the string filepath is
                       provided instead of a `WarmStartSettings`, then all
                       variables are warm-started, and it is assumed that
                       vocabularies and Tensor names are unchanged.

    Raises:
      RuntimeError: If eager execution is enabled.
      ValueError: parameters of `model_fn` don't match `params`.
      ValueError: if this is called via a subclass and if that class overrides
        a member of `Estimator`.
    """
    if context.executing_eagerly():
      raise RuntimeError(
          'Estimators are not supported when eager execution is enabled.')

    Estimator._assert_members_are_not_overridden(self)

    if config is None:
      self._config = run_config.RunConfig()
      logging.info('Using default config.')
    else:
      if not isinstance(config, run_config.RunConfig):
        raise ValueError(
            'config must be an instance of RunConfig, but provided %s.' %
            config)
      self._config = config

    # Model directory.
    model_dir = compat_internal.path_to_str(model_dir)
    if (model_dir is not None) and (self._config.model_dir is not None):
      if model_dir != self._config.model_dir:
        # TODO(alanyee): remove this suppression after it is no longer needed
        # pylint: disable=g-doc-exception
        raise ValueError(
            "model_dir are set both in constructor and RunConfig, but with "
            "different values. In constructor: '{}', in RunConfig: "
            "'{}' ".format(model_dir, self._config.model_dir))
        # pylint: enable=g-doc-exception

    self._model_dir = model_dir or self._config.model_dir
    if self._model_dir is None:
      self._model_dir = tempfile.mkdtemp()
      logging.warning('Using temporary folder as model directory: %s',
                      self._model_dir)
    if self._config.model_dir is None:
      self._config = self._config.replace(model_dir=self._model_dir)
    logging.info('Using config: %s', str(vars(self._config)))

    if self._config.session_config is None:
      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    else:
      self._session_config = self._config.session_config

    self._device_fn = _get_replica_device_setter(self._config)

    if model_fn is None:
      raise ValueError('model_fn must be provided to Estimator.')
    _verify_model_fn_args(model_fn, params)
    self._model_fn = model_fn
    self._params = copy.deepcopy(params or {})

    # pylint: disable=protected-access
    self._warm_start_settings = (
        warm_starting_util._get_default_warm_start_settings(warm_start_from))
    def __init__(self,
                 model_dir=None,
                 tf_random_seed=None,
                 save_summary_steps=100,
                 save_checkpoints_steps=_USE_DEFAULT,
                 save_checkpoints_secs=_USE_DEFAULT,
                 session_config=None,
                 keep_checkpoint_max=5,
                 keep_checkpoint_every_n_hours=10000,
                 log_step_count_steps=100,
                 train_distribute=None,
                 device_fn=None,
                 protocol=None,
                 eval_distribute=None,
                 experimental_distribute=None,
                 experimental_max_worker_delay_secs=None,
                 session_creation_timeout_secs=7200):
        """Constructs a RunConfig.

    All distributed training related properties `cluster_spec`, `is_chief`,
    `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and
    `task_type` are set based on the `TF_CONFIG` environment variable, if the
    pertinent information is present. The `TF_CONFIG` environment variable is a
    JSON object with attributes: `cluster` and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to
    a list of task addresses.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. When `TF_CONFIG` contains said information,
    the following properties are set on this class:

    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
      present, must have one and only one node in the `chief` attribute of
      `cluster_spec`.
    * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if
      `cluster_spec` is present; must be `worker` (the default value) if
      `cluster_spec` is not set.
    * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if
      `cluster_spec` is present; must be 0 (the default value) if
      `cluster_spec` is not set.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is set by counting the number of nodes listed
      in the `ps` attribute of `cluster_spec`. Defaults to 0.
    * `num_worker_replicas` is set by counting the number of nodes listed
      in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1.
    * `is_chief` is determined based on `task_type` and `cluster`.

    There is a special node with `task_type` as `evaluator`, which is not part
    of the (training) `cluster_spec`. It handles the distributed evaluation job.

    Example of non-chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = RunConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Example of chief node:
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'chief', 'index': 0}})
      config = RunConfig()
      assert config.master == 'host0:2222'
      assert config.task_id == 0
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 4
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'chief'
      assert config.is_chief
    ```

    Example of evaluator node (evaluator is not part of training cluster):
    ```
      cluster = {'chief': ['host0:2222'],
                 'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'evaluator', 'index': 0}})
      config = RunConfig()
      assert config.master == ''
      assert config.evaluator_master == ''
      assert config.task_id == 0
      assert config.num_ps_replicas == 0
      assert config.num_worker_replicas == 0
      assert config.cluster_spec == {}
      assert config.task_type == 'evaluator'
      assert not config.is_chief
    ```

    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
    distributed training. For example, setting `save_checkpoints_secs` as 60
    without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation
    that checkpoint would be garbage collected after 5 minutes. In distributed
    training, the evaluation job starts asynchronously and might fail to load or
    find the checkpoint due to race condition.

    Args:
      model_dir: directory where model parameters, graph, etc are saved. If
        `PathLike` object, the path will be resolved. If `None`, will use a
        default value set by the Estimator.
      tf_random_seed: Random seed for TensorFlow initializers. Setting this
        value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
        specified with `save_checkpoints_secs`.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
        be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
        both `save_checkpoints_steps` and `save_checkpoints_secs` are not set in
        constructor.  If both `save_checkpoints_steps` and
        `save_checkpoints_secs` are `None`, then checkpoints are disabled.
      session_config: a ConfigProto used to set session parameters, or `None`.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If `None` or 0,
        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
        checkpoint files are kept). If a saver is passed to the estimator, this
        argument will be ignored.
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint to
        be saved. The default value of 10,000 hours effectively disables the
        feature.
      log_step_count_steps: The frequency, in number of global steps, that the
        global step and the loss will be logged during training.  Also controls
        the frequency that the global steps / s will be logged (and written to
        summary) during training.
      train_distribute: An optional instance of `tf.distribute.Strategy`. If
        specified, then Estimator will distribute the user's model during
        training, according to the policy specified by that strategy. Setting
        `experimental_distribute.train_distribute` is preferred.
      device_fn: A callable invoked for every `Operation` that takes the
        `Operation` and returns the device string. If `None`, defaults to the
        device function returned by `tf.train.replica_device_setter` with
        round-robin strategy.
      protocol: An optional argument which specifies the protocol used when
        starting server. `None` means default to grpc.
      eval_distribute: An optional instance of `tf.distribute.Strategy`. If
        specified, then Estimator will distribute the user's model during
        evaluation, according to the policy specified by that strategy. Setting
        `experimental_distribute.eval_distribute` is preferred.
      experimental_distribute: An optional
        `tf.contrib.distribute.DistributeConfig` object specifying
        DistributionStrategy-related configuration. The `train_distribute` and
        `eval_distribute` can be passed as parameters to `RunConfig` or set in
        `experimental_distribute` but not both.
      experimental_max_worker_delay_secs: An optional integer specifying the
        maximum time a worker should wait before starting. By default, workers
        are started at staggered times, with each worker being delayed by up to
        60 seconds. This is intended to reduce the risk of divergence, which can
        occur when many workers simultaneously update the weights of a randomly
        initialized model. Users who warm-start their models and train them for
        short durations (a few minutes or less) should consider reducing this
        default to improve training times.
      session_creation_timeout_secs: Max time workers should wait for a session
        to become available (on initialization or when recovering a session)
        with MonitoredTrainingSession. Defaults to 7200 seconds, but users may
        want to set a lower value to detect problems with variable / session
        (re)-initialization more quickly.

    Raises:
      ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
      are set.
    """
        if (save_checkpoints_steps == _USE_DEFAULT
                and save_checkpoints_secs == _USE_DEFAULT):
            save_checkpoints_steps = None
            save_checkpoints_secs = 600
        elif save_checkpoints_secs == _USE_DEFAULT:
            save_checkpoints_secs = None
        elif save_checkpoints_steps == _USE_DEFAULT:
            save_checkpoints_steps = None
        elif (save_checkpoints_steps is not None
              and save_checkpoints_secs is not None):
            raise ValueError(_SAVE_CKPT_ERR)

        tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
        if tf_config:
            tf.compat.v1.logging.info('TF_CONFIG environment variable: %s',
                                      tf_config)

        model_dir = _get_model_dir(tf_config,
                                   compat_internal.path_to_str(model_dir))

        RunConfig._replace(
            self,
            allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
            model_dir=model_dir,
            tf_random_seed=tf_random_seed,
            save_summary_steps=save_summary_steps,
            save_checkpoints_steps=save_checkpoints_steps,
            save_checkpoints_secs=save_checkpoints_secs,
            session_config=session_config,
            keep_checkpoint_max=keep_checkpoint_max,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            log_step_count_steps=log_step_count_steps,
            train_distribute=train_distribute,
            device_fn=device_fn,
            protocol=protocol,
            eval_distribute=eval_distribute,
            experimental_distribute=experimental_distribute,
            experimental_max_worker_delay_secs=
            experimental_max_worker_delay_secs,
            session_creation_timeout_secs=session_creation_timeout_secs)

        # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs.
        if ((train_distribute and
             not train_distribute.__class__.__name__.startswith('TPUStrategy'))
                or
            (eval_distribute and
             not eval_distribute.__class__.__name__.startswith('TPUStrategy'))
                or experimental_distribute):
            tf.compat.v1.logging.info(
                'Initializing RunConfig with distribution strategies.')
            distribute_coordinator_training.init_run_config(self, tf_config)
        else:
            self._init_distributed_setting_from_environment_var(tf_config)
            self._maybe_overwrite_session_config_for_distributed_training()
Exemple #6
0
  def __init__(self, model_fn, model_dir=None, config=None, params=None,
               warm_start_from=None):
    """Constructs an `Estimator` instance.

    See @{$estimators} for more information. To warm-start an `Estimator`:

    ```python
    estimator = tf.estimator.DNNClassifier(
        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
        hidden_units=[1024, 512, 256],
        warm_start_from="/path/to/checkpoint/dir")
    ```

    For more details on warm-start configuration, see
    @{tf.estimator.WarmStartSettings$WarmStartSettings}.

    Args:
      model_fn: Model function. Follows the signature:

        * Args:

          * `features`: This is the first item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same.
          * `labels`: This is the second item returned from the `input_fn`
                 passed to `train`, `evaluate`, and `predict`. This should be a
                 single `Tensor` or `dict` of same (for multi-head models). If
                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
                 the `model_fn`'s signature does not accept `mode`, the
                 `model_fn` must still be able to handle `labels=None`.
          * `mode`: Optional. Specifies if this training, evaluation or
                 prediction. See `ModeKeys`.
          * `params`: Optional `dict` of hyperparameters.  Will receive what
                 is passed to Estimator in `params` parameter. This allows
                 to configure Estimators from hyper parameter tuning.
          * `config`: Optional configuration object. Will receive what is passed
                 to Estimator in `config` parameter, or the default `config`.
                 Allows updating things in your model_fn based on configuration
                 such as `num_ps_replicas`, or `model_dir`.

        * Returns:
          `EstimatorSpec`

      model_dir: Directory to save model parameters, graph and etc. This can
        also be used to load checkpoints from the directory into a estimator to
        continue training a previously saved model. If `PathLike` object, the
        path will be resolved. If `None`, the model_dir in `config` will be used
        if set. If both are set, they must be same. If both are `None`, a
        temporary directory will be used.
      config: Configuration object.
      params: `dict` of hyper parameters that will be passed into `model_fn`.
              Keys are names of parameters, values are basic python types.
      warm_start_from: Optional string filepath to a checkpoint to warm-start
                       from, or a `tf.estimator.WarmStartSettings` object to
                       fully configure warm-starting.  If the string filepath is
                       provided instead of a `WarmStartSettings`, then all
                       variables are warm-started, and it is assumed that
                       vocabularies and Tensor names are unchanged.

    Raises:
      RuntimeError: If eager execution is enabled.
      ValueError: parameters of `model_fn` don't match `params`.
      ValueError: if this is called via a subclass and if that class overrides
        a member of `Estimator`.
    """
    if context.in_eager_mode():
      raise RuntimeError(
          'Estimators are not supported when eager execution is enabled.')

    Estimator._assert_members_are_not_overridden(self)

    if config is None:
      self._config = run_config.RunConfig()
      logging.info('Using default config.')
    else:
      if not isinstance(config, run_config.RunConfig):
        raise ValueError(
            'config must be an instance of RunConfig, but provided %s.' %
            config)
      self._config = config

    # Model directory.
    model_dir = compat_internal.path_to_str(model_dir)
    if (model_dir is not None) and (self._config.model_dir is not None):
      if model_dir != self._config.model_dir:
        # TODO(alanyee): remove this suppression after it is no longer needed
        # pylint: disable=g-doc-exception
        raise ValueError(
            "model_dir are set both in constructor and RunConfig, but with "
            "different values. In constructor: '{}', in RunConfig: "
            "'{}' ".format(model_dir, self._config.model_dir))
        # pylint: enable=g-doc-exception

    self._model_dir = model_dir or self._config.model_dir
    if self._model_dir is None:
      self._model_dir = tempfile.mkdtemp()
      logging.warning('Using temporary folder as model directory: %s',
                      self._model_dir)
    if self._config.model_dir is None:
      self._config = self._config.replace(model_dir=self._model_dir)
    logging.info('Using config: %s', str(vars(self._config)))

    if self._config.session_config is None:
      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
    else:
      self._session_config = self._config.session_config

    self._device_fn = _get_replica_device_setter(self._config)

    if model_fn is None:
      raise ValueError('model_fn must be provided to Estimator.')
    _verify_model_fn_args(model_fn, params)
    self._model_fn = model_fn
    self._params = copy.deepcopy(params or {})

    # pylint: disable=protected-access
    self._warm_start_settings = (
        warm_starting_util._get_default_warm_start_settings(warm_start_from))