def save(trainer, save_path):
    """Save discriminator and generator."""
    # We implement this here and not in Trainer since we do not want to actually
    # serialize the whole Trainer (including e.g. expert demonstrations).
    trainer.discrim.save(os.path.join(save_path, "discrim"))
    # TODO(gleave): unify this with the saving logic in data_collect?
    # (Needs #43 to be merged before attempting.)
    serialize.save_stable_model(os.path.join(save_path, "gen_policy"),
                                trainer.gen_policy,
                                trainer.venv_train_norm)
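# Hedged usage sketch (not part of the original source): `save` is typically
# invoked from a periodic checkpoint hook during adversarial training. The
# `trainer`, `log_dir`, and interval names below are hypothetical and only
# illustrate where the call would sit.
def checkpoint_periodically(trainer, log_dir, total_epochs, checkpoint_interval=10):
    """Illustrative loop that checkpoints discriminator and generator."""
    for epoch in range(total_epochs):
        # ... run one epoch of discriminator/generator updates here ...
        if (epoch + 1) % checkpoint_interval == 0:
            save(trainer, os.path.join(log_dir, "checkpoints", f"{epoch + 1:05d}"))
    # Always keep a final checkpoint after training completes.
    save(trainer, os.path.join(log_dir, "checkpoints", "final"))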
def callback(locals_: dict, _) -> bool:
    nonlocal step
    step += 1
    policy = locals_['self']

    if rollout_ok and step % rollout_save_interval == 0:
        util.rollout.save(
            rollout_dir, policy, venv, step,
            n_timesteps=rollout_save_n_timesteps,
            n_episodes=rollout_save_n_episodes)
    if policy_ok and step % policy_save_interval == 0:
        # Zero-pad the step number so checkpoint directories sort correctly.
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
    return True
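# Hedged, self-contained sketch of the legacy stable-baselines (v2) callback
# interface the closures above rely on: `model.learn` calls the callable once
# per update with the algorithm's local variables, and returning False stops
# training early. PPO2 on CartPole-v1 and the /tmp paths are illustrative
# choices, not from the original source.
from stable_baselines import PPO2

step = 0

def simple_callback(locals_: dict, _globals: dict) -> bool:
    global step
    step += 1
    model = locals_["self"]  # the learning algorithm, as in `locals_['self']` above
    if step % 10 == 0:
        model.save(f"/tmp/ppo2_checkpoint_{step:05d}")  # SB2's built-in save
    return True  # keep training

model = PPO2("MlpPolicy", "CartPole-v1")
model.learn(total_timesteps=10_000, callback=simple_callback)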
def callback(locals_: dict, _) -> bool:
    nonlocal step
    step += 1
    policy = locals_['self']

    # TODO(adam): make logging frequency configurable
    for callback in log_callbacks:
        callback(sb_logger)

    if rollout_save_interval > 0 and step % rollout_save_interval == 0:
        save_path = osp.join(rollout_dir, f"{step}.pkl")
        util.rollout.save(save_path, policy, venv, sample_until)
    if policy_save_interval > 0 and step % policy_save_interval == 0:
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip(
            "Couldn't load stable baselines class. "
            "(Probably because mpi4py not installed.)"
        )

    model = model_cls("MlpPolicy", venv)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
        orig_venv.env_method("seed", 0)
        orig_venv.reset()
        new_rollout = rollout.generate_transitions(
            loaded,
            orig_venv,
            n_timesteps=1000,
            deterministic_policy=True,
            rng=np.random.RandomState(0),
        )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    model_cls = registry.load_attr(model_cls_name)
    # FIXME(sam): verbose=1 is a hack to stop it from setting up SB logger
    model = model_cls("MlpPolicy", venv, verbose=1)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    loaded = serialize.load_policy(model_name, tmpdir, orig_venv)
    orig_venv.env_method("seed", 0)
    orig_venv.reset()
    new_rollout = rollout.generate_transitions(
        loaded,
        orig_venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
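# Hedged sketch (not from the test suite) of the round trip both tests above
# exercise: train, `save_stable_model` to a directory, then `load_policy`
# against a fresh env. PPO2 and the "ppo2" loader name are illustrative; the
# first test treats `load_policy` as a context manager while the second calls
# it directly, so which form applies depends on the library version in use.
from stable_baselines import PPO2

venv = util.make_vec_env("CartPole-v1", n_envs=1, parallel=False)
model = PPO2("MlpPolicy", venv)
model.learn(1000)

serialize.save_stable_model("/tmp/policy_dir", model)

with serialize.load_policy("ppo2", "/tmp/policy_dir", venv) as loaded:
    actions, _ = loaded.predict(venv.reset(), deterministic=True)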
def callback(locals_: dict, _) -> bool:
    nonlocal step
    step += 1
    policy = locals_['self']

    # TODO(adam): make logging frequency configurable
    for callback in log_callbacks:
        callback(sb_logger)

    if rollout_save_interval > 0 and step % rollout_save_interval == 0:
        util.rollout.save(rollout_dir, policy, venv, step,
                          n_timesteps=rollout_save_n_timesteps,
                          n_episodes=rollout_save_n_episodes)
    if policy_save_interval > 0 and step % policy_save_interval == 0:
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
    return True  # Continue training.
def test_batch_reward_heatmaps(trajs, tmpdir, rand_policy):
    """Check that `batch_reward_heatmaps` builds a figure for each checkpoint."""
    tmpdir = pathlib.Path(tmpdir)

    # Save dummy mountain car expert and rollouts.
    expert_policy = rand_policy
    expert_policy_path = tmpdir / "expert_policy"
    serialize.save_stable_model(str(expert_policy_path), expert_policy)
    rollout_path = tmpdir / "rollout.pkl"
    with open(rollout_path, "wb") as f:
        pickle.dump(trajs, f)

    # Generate reward function and generator policy checkpoints.
    log_dir = tmpdir / "train_adversarial"
    run = train_adversarial.train_ex.run(
        named_configs=["mountain_car"],
        config_updates=dict(
            rollout_path=rollout_path,
            checkpoint_interval=1,
            log_dir=(tmpdir / "train_adversarial"),
            total_timesteps=5000,
        ),
    )
    assert run.status == "COMPLETED"
    checkpoints_dir = log_dir / "checkpoints"
    assert checkpoints_dir.is_dir()

    # Finally generate batched figures from checkpoints.
    fig_dict = mountain_car_plots.batch_reward_heatmaps(
        checkpoints_dir, exp_trajs=trajs)
    n_checkpoints = len(list(checkpoints_dir.iterdir()))
    n_expected_figs = mountain_car_plots.MC_NUM_ACTS * n_checkpoints
    assert len(fig_dict) == n_expected_figs
    for fig in fig_dict.values():
        assert isinstance(fig, plt.Figure)
        plt.close(fig)
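# Hedged follow-up sketch: persist the heatmaps that `batch_reward_heatmaps`
# returns. The test above only checks that the dict values are matplotlib
# Figures; the exact key format is an assumption here, so keys are simply
# stringified and sanitized for use as file names.
import os
import matplotlib.pyplot as plt

def save_heatmaps(fig_dict, output_dir):
    """Write each figure in `fig_dict` to `output_dir` as a PNG."""
    os.makedirs(output_dir, exist_ok=True)
    for key, fig in fig_dict.items():
        fname = str(key).replace(os.sep, "_") + ".png"
        fig.savefig(os.path.join(output_dir, fname), dpi=150)
        plt.close(fig)  # free figure memory once written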
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
        At applicable training steps `step` (where step is either an integer or
        "final"):

            - Policies are saved to `{log_dir}/policies/{step}/`.
            - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.
        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.
        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
        init_tensorboard: If True, then write tensorboard logs to
            {log_dir}/sb_tb and "output/summary/...".

    Returns:
        The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)
    eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

            policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    save_path = osp.join(rollout_dir, f"{step}.pkl")
                    util.rollout.save(save_path, policy, venv, sample_until)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy, vec_normalize)

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                save_path = osp.join(rollout_dir, "final.pkl")
                util.rollout.save(save_path, policy, venv, sample_until)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)

            # Final evaluation of expert policy.
            trajs = util.rollout.generate_trajectories(
                policy, venv, eval_sample_until)
            stats = util.rollout.rollout_stats(trajs)
            return stats
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    At applicable training steps `step` (where step is either an integer or
    "final"):

        - Policies are saved to `{log_dir}/policies/{step}.pkl`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
    """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    util.rollout.save(rollout_dir, policy, venv, step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy, vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir, policy, venv, "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str = None,
    num_vec: int = 8,
    parallel: bool = False,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    At applicable training steps `step` (where step is either an integer or
    "final"):

        - Policies are saved to `{log_dir}/policies/{step}.pkl`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        normalize: If True, then rescale observations and reward.
        make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training
            is finished.
    """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                                 parallel=parallel, log_dir=log_dir)
        vec_normalize = None
        if normalize:
            venv = vec_normalize = VecNormalize(venv)

        policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

        # Make callback to save intermediate artifacts during training.
        step = 0
        rollout_ok = rollout_save_interval > 0
        policy_ok = policy_save_interval > 0

        def callback(locals_: dict, _) -> bool:
            nonlocal step
            step += 1
            policy = locals_['self']

            if rollout_ok and step % rollout_save_interval == 0:
                util.rollout.save(
                    rollout_dir, policy, venv, step,
                    n_timesteps=rollout_save_n_timesteps,
                    n_episodes=rollout_save_n_episodes)
            if policy_ok and step % policy_save_interval == 0:
                # Zero-pad the step number so checkpoint directories sort correctly.
                output_dir = os.path.join(policy_dir, f'{step:05d}')
                serialize.save_stable_model(output_dir, policy, vec_normalize)
            return True

        policy.learn(total_timesteps, callback=callback)

        # Save final artifacts after training is complete.
        if rollout_save_final:
            util.rollout.save(
                rollout_dir, policy, venv, "final",
                n_timesteps=rollout_save_n_timesteps,
                n_episodes=rollout_save_n_episodes)
        if policy_save_final:
            output_dir = os.path.join(policy_dir, "final")
            serialize.save_stable_model(output_dir, policy, vec_normalize)
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
        At applicable training steps `step` (where step is either an integer or
        "final"):

            - Policies are saved to `{log_dir}/policies/{step}/`.
            - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.
        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.
        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition. Must
            set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every file.
            Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.
        policy_save_interval: The number of timesteps between intermediate
            policy saves (applied via `EveryNTimesteps`). If the argument is
            nonpositive, then don't save intermediate policies.
        policy_save_final: If True, then save the policy right after training
            is finished.
        init_tensorboard: If True, then write tensorboard logs to
            {log_dir}/sb_tb and "output/summary/...".

    Returns:
        The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(rollout_save_n_timesteps,
                                             rollout_save_n_episodes)
    eval_sample_until = rollout.min_episodes(n_episodes_eval)

    logging.basicConfig(level=logging.INFO)
    logger.configure(folder=osp.join(log_dir, "rl"),
                     format_strs=["tensorboard", "stdout"])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
        # sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
        # Convert sacred's ReadOnlyDict to dict so we can modify on next line.
        init_rl_kwargs = dict(init_rl_kwargs)
        # init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir
        # FIXME(sam): this is another hack to prevent SB3 from configuring the
        # logger on the first .learn() call. Remove it once SB3 issue #109 is
        # fixed.
        init_rl_kwargs["tensorboard_log"] = None

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    callback_objs = []
    if reward_type is not None:
        reward_fn = load_reward(reward_type, reward_path, venv)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        callback_objs.append(venv.make_log_callback())
        logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

    if policy_save_interval > 0:
        save_policy_callback = serialize.SavePolicyCallback(
            policy_dir, vec_normalize)
        save_policy_callback = callbacks.EveryNTimesteps(
            policy_save_interval, save_policy_callback)
        callback_objs.append(save_policy_callback)
    callback = callbacks.CallbackList(callback_objs)

    policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)
    policy.learn(total_timesteps, callback=callback)

    # Save final artifacts after training is complete.
    if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        rollout.rollout_and_save(save_path, policy, venv, sample_until)
    if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

    # Final evaluation of expert policy.
    trajs = rollout.generate_trajectories(policy, venv, eval_sample_until)
    stats = rollout.rollout_stats(trajs)

    return stats