Example #1
    def test_policy(self, *, n_episodes=10):
        """Test current imitation policy on environment & give some rollout
    stats.

    Args:
      n_episodes (int): number of rolled-out episodes.

    Returns:
      dict: rollout statistics collected by
        `imitation.utils.rollout.rollout_stats()`.
    """
        reward_stats = rollout.rollout_stats(self.policy,
                                             self.env,
                                             n_episodes=n_episodes)
        return reward_stats
Example #2
    def test_policy(self, *, min_episodes: int = 10) -> dict:
        """Test current imitation policy on environment & give some rollout stats.

        Args:
          min_episodes: Minimum number of rolled-out episodes.

        Returns:
          rollout statistics collected by `imitation.util.rollout.rollout_stats()`.
        """
        trajs = rollout.generate_trajectories(
            self.policy,
            self.env,
            sample_until=rollout.min_episodes(min_episodes))
        reward_stats = rollout.rollout_stats(trajs)
        return reward_stats
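The two-step pattern above (collect trajectories until a stopping condition is met, then summarize them) also works outside a trainer class. The following is a minimal sketch, assuming `policy` and `venv` are an already-constructed policy and vectorized environment; the statistic key names in the final line are assumptions, not taken from the examples.

from imitation.util import rollout

# Roll out until at least 10 episodes have finished, then aggregate them.
trajs = rollout.generate_trajectories(
    policy, venv, sample_until=rollout.min_episodes(10))
stats = rollout.rollout_stats(trajs)

# The exact keys in the stats dict ("return_mean", "len_mean") are assumed.
print(stats.get("return_mean"), stats.get("len_mean"))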
Example #3
  def test_policy(self, *, n_trajectories=10, true_reward=True):
    """Test current imitation policy on environment & give some rollout
    stats.

    Args:
      n_trajectories (int): number of rolled-out trajectories.
      true_reward (bool): should this use ground truth reward from underlying
        environment (True), or imitation reward (False)?

    Returns:
      dict: rollout statistics collected by
        `imitation.util.rollout.rollout_stats()`.
    """
    self.imitation_trainer.set_env(self.env)
    reward_stats = rollout.rollout_stats(
        self.imitation_trainer,
        self.env if true_reward else self.wrapped_env,
        n_episodes=n_trajectories)
    return reward_stats
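A hypothetical use of the method above, comparing evaluation under the ground-truth environment reward and under the imitation reward. `trainer` is assumed to be an instance of the class defining `test_policy`, and the "return_mean" key is an assumption about the dict returned by `rollout_stats()`.

# Evaluate the same policy under both reward signals (hypothetical usage).
true_stats = trainer.test_policy(n_trajectories=20, true_reward=True)
imit_stats = trainer.test_policy(n_trajectories=20, true_reward=False)
print(true_stats["return_mean"], imit_stats["return_mean"])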
Example #4
    def train_gen(self,
                  total_timesteps: Optional[int] = None,
                  learn_kwargs: Optional[dict] = None):
        """Trains the generator to maximize the discriminator loss.

        After the end of training, populates the generator replay buffer (used
        in discriminator training) with `self.disc_batch_size` transitions.

        Args:
          total_timesteps: The number of transitions to sample from
            `self.venv_train_norm` during training. Defaults to
            `self.gen_batch_size`.
          learn_kwargs: kwargs for the Stable Baselines `RLModel.learn()`
            method.
        """
        if total_timesteps is None:
            total_timesteps = self.gen_batch_size
        if learn_kwargs is None:
            learn_kwargs = {}

        with logger.accumulate_means("gen"):
            self.gen_policy.learn(total_timesteps=total_timesteps,
                                  reset_num_timesteps=False,
                                  **learn_kwargs)

        with logger.accumulate_means("gen_buffer"):
            # Log stats for finished trajectories stored in the BufferingWrapper. This
            # will bias toward shorter trajectories because trajectories that
            # are partially finished at the time of this log are popped from
            # the buffer a few lines down.
            #
            # This is useful for getting some statistics for unnormalized rewards.
            # (The rewards logged during the call to `.learn()` are the ground truth
            # rewards, retrieved from Monitor.).
            trajs = self.venv_train_norm_buffering._trajectories
            if len(trajs) > 0:
                stats = rollout.rollout_stats(trajs)
                for k, v in stats.items():
                    util.logger.logkv(k, v)

        gen_samples = self.venv_train_norm_buffering.pop_transitions()
        self._gen_replay_buffer.store(gen_samples)
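`train_gen` is typically called from an outer adversarial training loop that alternates generator and discriminator updates. The loop below is only a sketch: `trainer`, `train_disc`, and `n_rounds` are hypothetical names, not taken from the example above.

# Hypothetical alternating loop; each round samples self.gen_batch_size
# generator transitions (per the docstring) and then updates the discriminator.
n_rounds = 100
for _ in range(n_rounds):
    trainer.train_gen()
    trainer.train_disc()  # hypothetical discriminator update step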
Example #5
def policy_eval(_seed: int, env_name: str, timesteps: int, num_vec: int,
                parallel: bool, render: bool, policy_type: str,
                policy_path: str, log_dir: str):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      timesteps: Minimum number of timesteps to evaluate for.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
      render: If True, renders interactively to the screen.
      policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)

    Returns:
      Statistics returned by `imitation.util.rollout.rollout_stats`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)

    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir)
    if render:
        venv = InteractiveRender(venv)
    # TODO(adam): add support for videos using VideoRecorder?

    policy = serialize.load_policy(policy_type, policy_path, venv)
    stats = rollout.rollout_stats(policy, venv, n_timesteps=timesteps)

    return stats
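Although `policy_eval` is normally driven by Sacred (which injects `_seed`), it can also be exercised directly by supplying every argument yourself. The call below is purely illustrative; the environment id, policy type, and paths are assumptions.

stats = policy_eval(
    _seed=0,                    # normally generated by Sacred
    env_name="CartPole-v1",     # hypothetical Gym environment id
    timesteps=2000,
    num_vec=4,
    parallel=False,
    render=False,
    policy_type="ppo2",         # assumed key in POLICY_CLASSES
    policy_path="output/policies/cartpole",  # hypothetical path
    log_dir="output/eval",
)
print(stats)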
Example #6
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _run: The Sacred run object (injected by Sacred).
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
        exactly one of `eval_n_timesteps` and `eval_n_episodes`.
      eval_n_episodes: Minimum number of episodes to evaluate for. Set
        exactly one of `eval_n_timesteps` and `eval_n_episodes`.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
      render: If True, renders interactively to the screen.
      render_fps: Target frames per second for interactive rendering.
      log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)
      policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      reward_type: If specified, overrides the environment reward with
        a reward of this type.
      reward_path: If `reward_type` is specified, the path to a serialized
        reward of `reward_type` to override the environment reward with.
      max_episode_steps: If not None, environments are wrapped by `TimeLimit`
        so that they have at most `max_episode_steps` steps per episode.

    Returns:
      Return value of `imitation.util.rollout.rollout_stats()`.
    """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize(venv, training=False, norm_reward=False)
    venv = venv.load(policy_path + "/vec_normalize.pkl", venv)

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
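The docstring's "set exactly one" contract for `eval_n_timesteps` and `eval_n_episodes` maps directly onto `rollout.make_sample_until`, which takes the timestep budget first and the episode budget second. A minimal sketch, assuming `policy` and `venv` already exist:

# Stop once at least 50 episodes have finished (no timestep budget).
sample_until = rollout.make_sample_until(None, 50)
# Alternatively, stop once at least 10_000 timesteps have been collected:
# sample_until = rollout.make_sample_until(10_000, None)
trajs = rollout.generate_trajectories(policy, venv, sample_until)
stats = rollout.rollout_stats(trajs)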