Example #1
def evaluate(
        summary_dir: Text,
        environment_name: Text,
        policy: py_tf_eager_policy.PyTFEagerPolicyBase,
        variable_container: reverb_variable_container.ReverbVariableContainer,
        suite_load_fn: Callable[
            [Text], py_environment.PyEnvironment] = suite_mujoco.load,
        additional_metrics: Optional[Iterable[py_metric.PyStepMetric]] = None,
        is_running: Optional[Callable[[], bool]] = None) -> None:
    """Evaluates a policy iteratively fetching weights from variable container.

  Args:
    summary_dir: Directory which is used to store the summaries.
    environment_name: Name of the environment used to evaluate the policy.
    policy: The policy being evaluated. The weights of this policy are fetched
      from the variable container periodically.
    variable_container: Provides weights for the policy.
    suite_load_fn: Function that loads the environment (by calling it with the
      name of the environment) from a particular suite.
    additional_metrics: Optional collection of metrics that are computed as well
      during the evaluation. By default (`None`) it is empty.
    is_running: Optional callable which controls the running of the main
      evaluation loop (including fetching weights from the variable container
      and running the eval actor periodically). By default (`None`) this is a
      callable always returning `True` resulting in an infinite evaluation loop.
  """
    additional_metrics = additional_metrics or []
    is_running = is_running or (lambda: True)
    environment = suite_load_fn(environment_name)

    # Create the train step and fetch the initial values from the variable container.
    train_step = train_utils.create_train_step()
    variables = {
        reverb_variable_container.POLICY_KEY: policy.variables(),
        reverb_variable_container.TRAIN_STEP_KEY: train_step
    }
    variable_container.update(variables)
    prev_train_step_value = train_step.numpy()

    # Create the evaluator actor.
    eval_actor = actor.Actor(environment,
                             policy,
                             train_step,
                             episodes_per_run=1,
                             summary_dir=summary_dir,
                             metrics=actor.collect_metrics(buffer_size=1) +
                             additional_metrics,
                             name='eval_actor')

    # Run the experience evaluation loop.
    while is_running():
        eval_actor.run()
        logging.info('Evaluating using greedy policy at step: %d',
                     train_step.numpy())

        def is_train_step_the_same_or_behind():
            # Checks if the `train_step` received from the variable container is
            # the same as (or behind) the latest evaluated train step
            # (`prev_train_step_value`).
            variable_container.update(variables)
            return train_step.numpy() <= prev_train_step_value

        train_utils.wait_for_predicate(
            wait_predicate_fn=is_train_step_the_same_or_behind)
        prev_train_step_value = train_step.numpy()
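
For orientation, the snippet below is a minimal, hypothetical sketch of how this `evaluate` loop could be wired up against a running Reverb variable server. The server address, SavedModel path, and environment name are placeholders, and the import paths and constructor arguments reflect typical tf_agents locations; verify both against your installed version.

# Hypothetical wiring for the `evaluate` loop above. The Reverb server
# address, the SavedModel path, and the environment name are illustrative
# placeholders only.
from tf_agents.experimental.distributed import reverb_variable_container
from tf_agents.policies import py_tf_eager_policy

# Container that pulls policy weights and the train step from the (assumed)
# variable server at localhost:8008.
variable_container = reverb_variable_container.ReverbVariableContainer(
    'localhost:8008')

# Greedy policy loaded from an (assumed) SavedModel exported by the trainer.
greedy_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
    '/tmp/train/policies/greedy_policy', load_specs_from_pbtxt=True)

evaluate(
    summary_dir='/tmp/eval',
    environment_name='HalfCheetah-v2',
    policy=greedy_policy,
    variable_container=variable_container)
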
Example #2
    def test_wait_for_predicate_second_false(self):
        """Tests the predicate returning False on its second call."""
        predicate_mock = mock.MagicMock(side_effect=[True, False])
        # Use a 10-retry limit to avoid a near-infinite loop on error.
        train_utils.wait_for_predicate(predicate_mock, num_retries=10)
        self.assertEqual(predicate_mock.call_count, 2)
Example #3
    def test_wait_for_predicate_timeout(self):
        """Tests the predicate returning True forever and then timing out."""
        predicate_mock = mock.MagicMock(side_effect=[True, True, True])
        with self.assertRaises(TimeoutError):
            train_utils.wait_for_predicate(predicate_mock, num_retries=3)
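
The tests above only pin down the call signature and the timeout behaviour of `train_utils.wait_for_predicate`. A minimal sketch consistent with them is shown below; it is not the actual tf_agents implementation, and the default retry count and sleep duration are assumptions.

import time


# Sketch only: a retry helper consistent with the tests above, not the real
# tf_agents implementation. Defaults for `num_retries` and `sleep_time_secs`
# are assumed.
def wait_for_predicate(wait_predicate_fn, num_retries=60, sleep_time_secs=1.0):
    """Polls `wait_predicate_fn` until it returns False, sleeping in between.

    Raises:
      TimeoutError: If the predicate is still True after `num_retries` calls.
    """
    for _ in range(num_retries):
        if not wait_predicate_fn():
            return
        time.sleep(sleep_time_secs)
    raise TimeoutError(
        'Predicate still True after {} retries.'.format(num_retries))
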
Example #4
def evaluate(
    summary_dir: Text,
    environment_name: Text,
    policy: py_tf_eager_policy.PyTFEagerPolicyBase,
    variable_container: reverb_variable_container.ReverbVariableContainer,
    suite_load_fn: Callable[[Text],
                            py_environment.PyEnvironment] = suite_mujoco.load,
    additional_metrics: Optional[Iterable[py_metric.PyStepMetric]] = None,
    is_running: Optional[Callable[[], bool]] = None,
    eval_interval: int = 1000,
    eval_episodes: int = 1,
    # TODO(b/178225158): Deprecate in favor of the reporting library when ready.
    return_reporting_fn: Optional[Callable[[int, float], None]] = None
) -> None:
    """Evaluates a policy iteratively fetching weights from variable container.

  Args:
    summary_dir: Directory which is used to store the summaries.
    environment_name: Name of the environment used to evaluate the policy.
    policy: The policy being evaluated. The weights of this policy are fetched
      from the variable container periodically.
    variable_container: Provides weights for the policy.
    suite_load_fn: Function that loads the environment (by calling it with the
      name of the environment) from a particular suite.
    additional_metrics: Optional collection of metrics that are computed as well
      during the evaluation. By default (`None`) it is empty.
    is_running: Optional callable which controls the running of the main
      evaluation loop (including fetching weights from the variable container
      and running the eval actor periodically). By default (`None`) this is a
      callable always returning `True` resulting in an infinite evaluation loop.
    eval_interval: If set, eval is done at the given step interval or as close
      as possible based on polling.
    eval_episodes: Number of episodes to eval.
    return_reporting_fn: Optional callback function of the form `fn(train_step,
      average_return)` which reports the average return to a custom destination.
  """
    additional_metrics = additional_metrics or []
    is_running = is_running or (lambda: True)
    environment = suite_load_fn(environment_name)

    # Create the train step and fetch the initial values from the variable container.
    train_step = train_utils.create_train_step()
    variables = {
        reverb_variable_container.POLICY_KEY: policy.variables(),
        reverb_variable_container.TRAIN_STEP_KEY: train_step
    }
    variable_container.update(variables)
    prev_train_step_value = train_step.numpy()

    # Create the evaluator actor.
    metrics = actor.collect_metrics(buffer_size=eval_episodes)

    # Locate the average-return metric among the collected metrics so its
    # result can be reported via the callback after each evaluation.
    if return_reporting_fn:
        for m in metrics:
            if isinstance(m, py_metrics.AverageReturnMetric):
                average_return_metric = m
                break

    eval_actor = actor.Actor(environment,
                             policy,
                             train_step,
                             episodes_per_run=eval_episodes,
                             summary_dir=summary_dir,
                             summary_interval=eval_interval,
                             metrics=metrics + additional_metrics,
                             name='eval_actor')

    # Run the experience evaluation loop.
    last_eval_step = 0
    while is_running():

        # Eval every step if no `eval_interval` is set (i.e. it is 0), on the
        # first step, or once the step is greater than or equal to
        # `last_eval_step` + `eval_interval`. Because the step count is polled,
        # the evaluated steps may not be exact, e.g. 1001 and then 2003 instead
        # of 1000 and then 2000.
        if (train_step.numpy() == 0
                or train_step.numpy() >= eval_interval + last_eval_step):
            logging.info('Evaluating using greedy policy at step: %d',
                         train_step.numpy())
            eval_actor.run()
            last_eval_step = train_step.numpy()

        def is_train_step_the_same_or_behind():
            # Checks if the `train_step` received from the variable container is
            # the same as (or behind) the latest evaluated train step
            # (`prev_train_step_value`).
            variable_container.update(variables)
            return train_step.numpy() <= prev_train_step_value

        train_utils.wait_for_predicate(
            wait_predicate_fn=is_train_step_the_same_or_behind)
        prev_train_step_value = train_step.numpy()

        # Optionally report the average return metric via a callback.
        if return_reporting_fn:
            return_reporting_fn(train_step.numpy(),
                                average_return_metric.result())
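
As a usage note, `return_reporting_fn` only needs to match the `fn(train_step, average_return)` shape described in the docstring. The callback below, which appends each evaluation result to a CSV file, is a hypothetical example of such a destination.

import csv


# Hypothetical reporting callback; the CSV destination is illustrative only.
def report_return_to_csv(train_step, average_return,
                         csv_path='/tmp/eval_returns.csv'):
    # Appends one (train_step, average_return) row per evaluation.
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f).writerow([train_step, average_return])


# Passed to `evaluate` as: return_reporting_fn=report_return_to_csv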