def test_policy(self, *, n_episodes=10):
    """Test current imitation policy on environment & give some rollout stats.

    Args:
        n_episodes (int): number of rolled-out episodes.

    Returns:
        dict: rollout statistics collected by
            `imitation.utils.rollout.rollout_stats()`.
    """
    reward_stats = rollout.rollout_stats(self.policy, self.env,
                                         n_episodes=n_episodes)
    return reward_stats

def test_policy(self, *, min_episodes: int = 10) -> dict:
    """Test current imitation policy on environment & give some rollout stats.

    Args:
        min_episodes: Minimum number of rolled-out episodes.

    Returns:
        Rollout statistics collected by
        `imitation.utils.rollout.rollout_stats()`.
    """
    trajs = rollout.generate_trajectories(
        self.policy, self.env,
        sample_until=rollout.min_episodes(min_episodes))
    reward_stats = rollout.rollout_stats(trajs)
    return reward_stats

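# Direct-use sketch of the rollout API that `test_policy` wraps, for callers
# outside the trainer. `policy` and `venv` are assumptions here: a trained
# stable-baselines policy and a vectorized environment built elsewhere. The
# `return_mean`/`return_std` keys are assumed from typical `rollout_stats`
# output:
#
#   trajs = rollout.generate_trajectories(
#       policy, venv, sample_until=rollout.min_episodes(10))
#   stats = rollout.rollout_stats(trajs)
#   print(stats["return_mean"], stats["return_std"])
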
def test_policy(self, *, n_trajectories=10, true_reward=True):
    """Test current imitation policy on environment & give some rollout stats.

    Args:
        n_trajectories (int): number of rolled-out trajectories.
        true_reward (bool): should this use ground truth reward from underlying
            environment (True), or imitation reward (False)?

    Returns:
        dict: rollout statistics collected by
            `imitation.utils.rollout.rollout_stats()`.
    """
    self.imitation_trainer.set_env(self.env)
    reward_stats = rollout.rollout_stats(
        self.imitation_trainer,
        self.env if true_reward else self.wrapped_env,
        n_episodes=n_trajectories)
    return reward_stats

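# Usage sketch (`trainer` is a hypothetical instance of the enclosing class):
# toggling `true_reward` contrasts performance under the environment's
# ground-truth reward with performance under the learned imitation reward.
#
#   env_stats = trainer.test_policy(n_trajectories=20, true_reward=True)
#   imit_stats = trainer.test_policy(n_trajectories=20, true_reward=False)
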
def train_gen(self,
              total_timesteps: Optional[int] = None,
              learn_kwargs: Optional[dict] = None):
    """Trains the generator to maximize the discriminator loss.

    At the end of training, populates the generator replay buffer (used in
    discriminator training) with `self.disc_batch_size` transitions.

    Args:
        total_timesteps: The number of transitions to sample from
            `self.venv_train_norm` during training. By default,
            `self.gen_batch_size`.
        learn_kwargs: kwargs for the Stable Baselines `RLModel.learn()`
            method.
    """
    if total_timesteps is None:
        total_timesteps = self.gen_batch_size
    if learn_kwargs is None:
        learn_kwargs = {}

    with logger.accumulate_means("gen"):
        self.gen_policy.learn(total_timesteps=total_timesteps,
                              reset_num_timesteps=False,
                              **learn_kwargs)

    with logger.accumulate_means("gen_buffer"):
        # Log stats for finished trajectories stored in the BufferingWrapper.
        # This will bias toward shorter trajectories because trajectories
        # that are partially finished at the time of this log are popped
        # from the buffer a few lines down.
        #
        # This is useful for getting some statistics for unnormalized
        # rewards. (The rewards logged during the call to `.learn()` are the
        # ground truth rewards, retrieved from Monitor.)
        trajs = self.venv_train_norm_buffering._trajectories
        if len(trajs) > 0:
            stats = rollout.rollout_stats(trajs)
            for k, v in stats.items():
                util.logger.logkv(k, v)

    gen_samples = self.venv_train_norm_buffering.pop_transitions()
    self._gen_replay_buffer.store(gen_samples)

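# Sketch of how `train_gen` typically slots into the adversarial training
# loop: generator rollouts fill the replay buffer, which the discriminator
# update then consumes. `trainer`, `train_disc`, and `n_iterations` are
# assumptions about the enclosing trainer class, not confirmed API:
#
#   for _ in range(n_iterations):
#       trainer.train_gen()    # rollouts -> generator replay buffer
#       trainer.train_disc()   # samples the buffer, updates discriminator
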
def policy_eval(_seed: int, env_name: str, timesteps: int, num_vec: int,
                parallel: bool, render: bool, policy_type: str,
                policy_path: str, log_dir: str):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _seed: generated by Sacred.
        env_name: Gym environment identifier.
        timesteps: Minimum number of timesteps to evaluate for.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        render: If True, renders interactively to the screen.
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        log_dir: The directory to log intermediate output to. (As of
            2019-07-19 this is just episode-by-episode reward from
            bench.Monitor.)

    Returns:
        Statistics returned by `imitation.util.rollout.rollout_stats`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir)

    if render:
        venv = InteractiveRender(venv)
    # TODO(adam): add support for videos using VideoRecorder?

    policy = serialize.load_policy(policy_type, policy_path, venv)
    stats = rollout.rollout_stats(policy, venv, n_timesteps=timesteps)
    return stats

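# If `policy_eval` is registered as a Sacred experiment main (the injected
# `_seed` capture suggests it is), it would be invoked from the CLI roughly
# as below; the module path and config values are assumptions:
#
#   python -m imitation.scripts.policy_eval with \
#       env_name=CartPole-v1 timesteps=10000 \
#       policy_type=ppo2 policy_path=expert_models/cartpole_0/
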
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
        _seed: generated by Sacred.
        env_name: Gym environment identifier.
        eval_n_timesteps: Minimum number of timesteps to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        eval_n_episodes: Minimum number of episodes to evaluate for. Set
            exactly one of `eval_n_episodes` and `eval_n_timesteps`.
        num_vec: Number of environments to run simultaneously.
        parallel: If True, use `SubprocVecEnv` for true parallelism;
            otherwise, uses `DummyVecEnv`.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        render: If True, renders interactively to the screen.
        render_fps: The target number of frames per second when rendering.
        log_dir: The directory to log intermediate output to. (As of
            2019-07-19 this is just episode-by-episode reward from
            bench.Monitor.)
        policy_type: A unique identifier for the saved policy, defined in
            POLICY_CLASSES.
        policy_path: A path to the serialized policy.
        reward_type: If specified, overrides the environment reward with a
            serialized reward of this type.
        reward_path: If reward_type is specified, the path to a serialized
            reward of `reward_type` to override the environment reward with.

    Returns:
        Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps,
                                             eval_n_episodes)
    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    # Restore the normalization statistics saved alongside the policy, then
    # freeze them for evaluation. (Loading into an already-wrapped
    # VecNormalize would normalize observations twice.)
    venv = VecNormalize.load(os.path.join(policy_path, "vec_normalize.pkl"),
                             venv)
    venv.training = False
    venv.norm_reward = False

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
            return rollout.rollout_stats(trajs)

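# Hypothetical CLI sketch for the Sacred experiment wrapping `eval_policy`;
# the module path, policy type, and reward type values are assumptions, not
# confirmed identifiers. This evaluates a saved policy under a learned reward
# instead of the environment reward:
#
#   python -m imitation.scripts.eval_policy with \
#       env_name=CartPole-v1 eval_n_episodes=50 \
#       policy_type=ppo2 policy_path=output/policies/final/ \
#       reward_type=DiscrimNet reward_path=output/checkpoints/final/discrim/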