Example #1
def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
  """
  Briefly train with GAIL or AIRL, then use the learned reward to wrap
  a duplicate environment. Finally, use that learned reward to train
  a policy.
  """
  trainer = init_test_trainer(env, use_gail)
  trainer.train(n_epochs=1)

  learned_reward_env = trainer.wrap_env_test_reward(env)
  policy = util.init_rl(env)
  policy.set_env(learned_reward_env)
  policy.learn(10)
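Example #1 is presumably driven by a pytest parametrization that supplies `use_gail`. A minimal sketch of how that could look (an assumption, not shown in the source; it requires pytest and the `init_test_trainer` helper referenced above):

import pytest

@pytest.mark.parametrize("use_gail", [False, True])
def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
  # Body identical to Example #1 above.
  trainer = init_test_trainer(env, use_gail)
  trainer.train(n_epochs=1)
  learned_reward_env = trainer.wrap_env_test_reward(env)
  policy = util.init_rl(env)
  policy.set_env(learned_reward_env)
  policy.learn(10)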
Example #2
    def add_data_ep_reward(self, env, name):
        """Calculate and record average episode returns."""
        sample_until = util.rollout.min_episodes(
            self.n_episodes_per_reward_data)

        gen_policy = self.trainer.gen_policy
        gen_ret = util.rollout.mean_return(gen_policy, env, sample_until)
        self.gen_ep_reward[name].append(gen_ret)
        tf.logging.info("generator return: {}".format(gen_ret))

        rand_policy = util.init_rl(self.trainer.venv)
        rand_ret = util.rollout.mean_return(rand_policy, env, sample_until)
        self.rand_ep_reward[name].append(rand_ret)
        tf.logging.info("random return: {}".format(rand_ret))
Example #3
    def ep_reward_plot_add_data(env, name):
      """Calculate and record average episode returns."""
      gen_policy = trainer.gen_policy
      gen_ret = util.rollout.mean_return(
          gen_policy, env, n_episodes=n_episodes_per_reward_data)
      gen_ep_reward[name].append(gen_ret)
      tf.logging.info("generator return: {}".format(gen_ret))

      rand_policy = util.init_rl(trainer.env)
      rand_ret = util.rollout.mean_return(
          rand_policy, env, n_episodes=n_episodes_per_reward_data)
      rand_ep_reward[name].append(rand_ret)
      tf.logging.info("random return: {}".format(rand_ret))

      if expert_policy is not None:
          exp_ret = util.rollout.mean_return(
              expert_policy, env, n_episodes=n_episodes_per_reward_data)
          exp_ep_reward[name].append(exp_ret)
          tf.logging.info("exp return: {}".format(exp_ret))
Example #4
def test_density_trainer(density_type, is_stationary):
    env_id = 'Pendulum-v0'
    rollouts = rollout.load_trajectories(f"tests/data/rollouts/{env_id}_*.pkl")
    env = util.make_vec_env(env_id, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(env,
                                     rollouts=rollouts,
                                     imitation_trainer=imitation_trainer,
                                     density_type=density_type,
                                     is_stationary=is_stationary,
                                     kernel='gaussian')
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50% worse.
    # It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
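The final assertion uses `>` with a 1.5 multiplier because Pendulum returns are negative, so scaling the novice return by 1.5 moves the bound further down rather than up. A small numeric illustration (the values are made up):

novice_return = -1000.0                  # hypothetical novice mean return
worst_acceptable = 1.5 * novice_return   # -1500.0, i.e. 50% worse than the novice
assert -1200.0 > worst_acceptable        # a trained policy at -1200 still passes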
Example #5
    def add_data_ep_reward(self, epoch):
        """Calculate and record average episode returns."""
        if epoch in self.ep_reward_X:
            # Don't calculate ep reward twice.
            return
        self.ep_reward_X.append(epoch)

        gen_policy = self.trainer.gen_policy
        rand_policy = util.init_rl(self.trainer.venv)
        sample_until = util.rollout.min_episodes(
            self.n_episodes_per_reward_data)
        trajs_rand = util.rollout.generate_trajectories(
            rand_policy, self.venv_norm_obs, sample_until)
        trajs_gen = util.rollout.generate_trajectories(gen_policy,
                                                       self.venv_norm_obs,
                                                       sample_until)

        for reward_fn, reward_name in [
            (None, "Ground Truth Reward"),
            (self.trainer.reward_train, "Train Reward"),
            (self.trainer.reward_test, "Test Reward")
        ]:
            if reward_fn is None:
                trajs_rand_rets = [np.sum(traj.rews) for traj in trajs_rand]
                trajs_gen_rets = [np.sum(traj.rews) for traj in trajs_gen]
            else:
                trajs_rand_rets = [
                    np.sum(util.rollout.recalc_rewards_traj(traj, reward_fn))
                    for traj in trajs_rand
                ]
                trajs_gen_rets = [
                    np.sum(util.rollout.recalc_rewards_traj(traj, reward_fn))
                    for traj in trajs_gen
                ]

            gen_ret = np.mean(trajs_gen_rets)
            rand_ret = np.mean(trajs_rand_rets)
            self.gen_ep_reward[reward_name].append(gen_ret)
            self.rand_ep_reward[reward_name].append(rand_ret)
            tf.logging.info(f"{reward_name} generator return: {gen_ret}")
            tf.logging.info(f"{reward_name} random return: {rand_ret}")
Example #6
def test_density_trainer(density_type, is_stationary):
  env_name = 'Pendulum-v0'
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    rollouts = pickle.load(f)
  env = util.make_vec_env(env_name, 2)
  imitation_trainer = util.init_rl(env)
  density_trainer = DensityTrainer(env,
                                   rollouts=rollouts,
                                   imitation_trainer=imitation_trainer,
                                   density_type=density_type,
                                   is_stationary=is_stationary,
                                   kernel='gaussian')
  novice_stats = density_trainer.test_policy()
  density_trainer.train_policy(2000)
  good_stats = density_trainer.test_policy()
  # Novice is bad
  assert novice_stats["return_mean"] < -500
  # Density is also pretty bad, but shouldn't make things more than 50% worse.
  # It would be nice to have a less flaky/more meaningful test here.
  assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
Example #7
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[rollout.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on a vectorized
    environment and expert demonstrations.

  Args:
    env_name: The string id of a gym environment.
    expert_trajectories: Demonstrations from an expert policy.
    seed: Random seed.
    log_dir: Directory for logging output. Will generate a unique sub-directory
        within this directory for all output.
    use_gail: If True, then train using GAIL. If False, then train
        using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
        this episode length before returning.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
        argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    init_rl_kwargs: Keyword arguments passed to `init_rl`,
        used to initialize the RL algorithm.
  """
    util.logger.configure(folder=log_dir,
                          format_strs=['tensorboard', 'stdout'])
    env = util.make_vec_env(env_name,
                            num_vec,
                            seed=seed,
                            parallel=parallel,
                            log_dir=log_dir,
                            max_episode_steps=max_episode_steps)
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env,
                                 gen_policy,
                                 discrim,
                                 expert_demos,
                                 log_dir=log_dir,
                                 **trainer_kwargs)
    return trainer
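A hedged sketch of how this `init_trainer` might be called. The trajectory-loading helper is the one used in Example #8 and may not exist under that name in this version; the environment name, glob, and log directory are illustrative, and `trainer.train(n_epochs=...)` is assumed to exist as in Example #1:

expert_trajs = util.rollout.load_trajectories("expert_rollouts/*.pkl")  # illustrative glob
trainer = init_trainer(
    "CartPole-v1",
    expert_trajs,
    log_dir="output/airl_cartpole",   # illustrative output directory
    use_gail=False,                   # False selects the AIRL branch above
    num_vec=8,
)
trainer.train(n_epochs=1)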
Example #8
def init_trainer(
    env_id: str,
    rollout_glob: str,
    *,
    n_expert_demos: Optional[int] = None,
    seed: int = 0,
    log_dir: Optional[str] = None,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_n_files: int = 1,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    make_blank_policy_kwargs: dict = {},
):
    """Builds a Trainer, ready to be trained on a vectorized environment
  and expert demonstrations.

  Args:
    env_id: The string id of a gym environment.
    rollout_glob: Argument for `imitation.util.rollout.load_trajectories`.
    n_expert_demos: The number of expert trajectories to actually use
        after loading them via `load_trajectories`.
        If None, then use all available trajectories.
        If `n_expert_demos` is an `int`, then use
        exactly `n_expert_demos` trajectories, erroring if there aren't
        enough trajectories. If there are surplus trajectories, then use the
        first `n_expert_demos` trajectories and drop the rest.
    seed: Random seed.
    log_dir: Directory for logging output.
    use_gail: If True, then train using GAIL. If False, then train
        using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_n_files: If provided, then only load the most recent `max_n_files`
        files, as sorted by modification times.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
        argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    make_blank_policy_kwargs: Keyword arguments passed to `make_blank_policy`,
        used to initialize the generator policy.
  """
    env = util.make_vec_env(env_id,
                            num_vec,
                            seed=seed,
                            parallel=parallel,
                            log_dir=log_dir)
    gen_policy = util.init_rl(env, verbose=1, **make_blank_policy_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = util.rollout.load_trajectories(rollout_glob,
                                                  max_n_files=max_n_files)
    if n_expert_demos is not None:
        assert len(expert_demos) >= n_expert_demos
        expert_demos = expert_demos[:n_expert_demos]

    expert_rollouts = util.rollout.flatten_trajectories(expert_demos)[:3]
    trainer = Trainer(env, gen_policy, discrim, expert_rollouts,
                      **trainer_kwargs)
    return trainer
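Both `init_trainer` variants default their `*_kwargs` parameters to `{}`, a mutable object that Python creates once and shares across all calls. A common alternative, shown here as a standalone sketch rather than the library's code, is to default to None and build a fresh dict inside the function:

from typing import Optional

def _fresh_kwargs(kwargs: Optional[dict] = None) -> dict:
    # Avoid sharing a single mutable default dict between calls.
    return dict(kwargs) if kwargs is not None else {}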
Example #9
def rollouts_and_policy(
  _run,
  _seed: int,
  env_name: str,
  total_timesteps: int,
  *,
  log_dir: str,
  num_vec: int,
  parallel: bool,
  max_episode_steps: Optional[int],
  normalize: bool,
  normalize_kwargs: dict,
  init_rl_kwargs: dict,

  n_episodes_eval: int,

  reward_type: Optional[str],
  reward_path: Optional[str],

  rollout_save_interval: int,
  rollout_save_final: bool,
  rollout_save_n_timesteps: Optional[int],
  rollout_save_n_episodes: Optional[int],

  policy_save_interval: int,
  policy_save_final: bool,

  init_tensorboard: bool,
) -> dict:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  Checkpoints:
    At applicable training steps `step` (where step is either an integer or
    "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      normalize_kwargs: kwargs for `VecNormalize`.
      init_rl_kwargs: kwargs for `init_rl`.

      n_episodes_eval: The number of episodes to average over when calculating
          the average ground truth reward return of the final policy.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.

      init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
          and "output/summary/...".

  Returns:
    The return value of `rollout_stats()` using the final policy.
  """
  os.makedirs(log_dir, exist_ok=True)
  sacred_util.build_sacred_symlink(log_dir, _run)

  sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                rollout_save_n_episodes)
  eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
      sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
      init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    log_callbacks = []
    with contextlib.ExitStack() as stack:
      if reward_type is not None:
        reward_fn_ctx = load_reward(reward_type, reward_path, venv)
        reward_fn = stack.enter_context(reward_fn_ctx)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        log_callbacks.append(venv.log_callback)
        tf.logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

      vec_normalize = None
      if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

      policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

      # Make callback to save intermediate artifacts during training.
      step = 0

      def callback(locals_: dict, _) -> bool:
        nonlocal step
        step += 1
        policy = locals_['self']

        # TODO(adam): make logging frequency configurable
        for callback in log_callbacks:
          callback(sb_logger)

        if rollout_save_interval > 0 and step % rollout_save_interval == 0:
          save_path = osp.join(rollout_dir, f"{step}.pkl")
          util.rollout.save(save_path, policy, venv, sample_until)
        if policy_save_interval > 0 and step % policy_save_interval == 0:
          output_dir = os.path.join(policy_dir, f'{step:05d}')
          serialize.save_stable_model(output_dir, policy, vec_normalize)

      policy.learn(total_timesteps, callback=callback)

      # Save final artifacts after training is complete.
      if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        util.rollout.save(save_path, policy, venv, sample_until)
      if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

      # Final evaluation of expert policy.
      trajs = util.rollout.generate_trajectories(
          policy, venv, eval_sample_until)
      stats = util.rollout.rollout_stats(trajs)

  return stats
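The `sample_until` objects returned by `util.rollout.make_sample_until` and `util.rollout.min_episodes` act as stopping predicates over the trajectories collected so far. A simplified, illustrative re-implementation (not the library's code; it assumes each trajectory exposes a `rews` array, as in Example #5):

def make_sample_until_sketch(min_timesteps=None, min_episodes=None):
  """Return a predicate that is True once either budget is met."""
  def sample_until(trajectories) -> bool:
    if min_timesteps is not None:
      total_steps = sum(len(traj.rews) for traj in trajectories)
      if total_steps >= min_timesteps:
        return True
    if min_episodes is not None and len(trajectories) >= min_episodes:
      return True
    return False
  return sample_until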
Example #10
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: Optional[str] = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    util.rollout.save(rollout_dir,
                                      policy,
                                      venv,
                                      step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir,
                                  policy,
                                  venv,
                                  "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)
Example #11
def rollouts_and_policy(
  _seed: int,
  env_name: str,
  total_timesteps: int,
  *,
  log_dir: Optional[str] = None,
  num_vec: int = 8,
  parallel: bool = False,
  normalize: bool = True,
  make_blank_policy_kwargs: dict = {},

  rollout_save_interval: int = 0,
  rollout_save_final: bool = False,
  rollout_save_n_timesteps: Optional[int] = None,
  rollout_save_n_episodes: Optional[int] = None,

  policy_save_interval: int = -1,
  policy_save_final: bool = True,
) -> None:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.
      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
  _validate_traj_generate_params(rollout_save_n_timesteps,
                                 rollout_save_n_episodes)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir)
    vec_normalize = None
    if normalize:
      venv = vec_normalize = VecNormalize(venv)

    policy = util.init_rl(venv, verbose=1,
                          **make_blank_policy_kwargs)

    # Make callback to save intermediate artifacts during training.
    step = 0
    rollout_ok = rollout_save_interval > 0
    policy_ok = policy_save_interval > 0

    def callback(locals_: dict, _) -> bool:
      nonlocal step
      step += 1
      policy = locals_['self']

      if rollout_ok and step % rollout_save_interval == 0:
        util.rollout.save(
          rollout_dir, policy, venv, step,
          n_timesteps=rollout_save_n_timesteps,
          n_episodes=rollout_save_n_episodes)
      if policy_ok and step % policy_save_interval == 0:
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
      return True

    policy.learn(total_timesteps, callback=callback)

    # Save final artifacts after training is complete.
    if rollout_save_final:
      util.rollout.save(
        rollout_dir, policy, venv, "final",
        n_timesteps=rollout_save_n_timesteps,
        n_episodes=rollout_save_n_episodes)
    if policy_save_final:
      output_dir = os.path.join(policy_dir, "final")
      serialize.save_stable_model(output_dir, policy, vec_normalize)
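The training callbacks above differ slightly: Example #9's callback has no return statement, while Examples #10 and #11 end with `return True`. In Stable Baselines 2, which these examples appear to target, training stops only when the callback returns an explicit `False`, so both styles keep training. A simplified sketch of that calling convention (illustrative only, not the library's actual loop):

def run_updates(n_updates, callback=None):
  # Consult a (locals, globals)-style callback each update; stop only on an
  # explicit False, continue on True or None.
  for update in range(n_updates):
    if callback is not None and callback({"update": update}, None) is False:
      return update  # updates completed before the callback asked to stop
  return n_updates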