Example no. 1
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)
    rewards = {
        "train": [],
        "test": [],
    }
    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    for net in [original, loaded]:
        rewards["train"].append(net.predict_reward_train(*args))
        rewards["test"].append(net.predict_reward_test(*args))

    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
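
A short usage sketch building on Example 1: once a reward network has been saved with `th.save`, `serialize.load_reward` can turn it into a reward function for wrapping an environment, as the training scripts further below do. The helper name and the `reward_wrapper` import path here are assumptions, not taken from the examples.

from imitation.rewards import serialize  # module path confirmed by the docstrings below
from imitation.rewards import reward_wrapper  # package path assumed


def wrap_env_with_saved_reward(venv, reward_path):
    """Replace `venv`'s reward with the serialized reward net at `reward_path` (sketch)."""
    # "RewardNet_unshaped" matches the loader key exercised in Example 1.
    reward_fn = serialize.load_reward("RewardNet_unshaped", reward_path, venv)
    return reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
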
Example no. 2
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)

                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example no. 3
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)

            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
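
Examples 2 and 3 both rely on a `_make_feed_dict` helper that is not shown. A minimal sketch, assuming the TensorFlow reward networks expose input placeholders under attribute names like `obs_ph`, `act_ph`, `next_obs_ph`, and `done_ph` (all hypothetical); in Example 2 the transitions object is a plain tuple, so the helper there would unpack positionally instead.

def _make_feed_dict(net, transitions):
    """Map a reward network's input placeholders to transition arrays (sketch)."""
    # Placeholder attribute names are assumptions; adjust to the actual network.
    return {
        net.obs_ph: transitions.obs,
        net.act_ph: transitions.acts,
        net.next_obs_ph: transitions.next_obs,
        net.done_ph: transitions.dones,
    }
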
Example no. 4
def test_reward_valid(env_name, reward_type):
    """Test output of reward function is appropriate shape and type."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    TRAJECTORY_LEN = 10
    obs = _sample(venv.observation_space, TRAJECTORY_LEN)
    actions = _sample(venv.action_space, TRAJECTORY_LEN)
    next_obs = _sample(venv.observation_space, TRAJECTORY_LEN)
    steps = np.arange(0, TRAJECTORY_LEN)

    reward_fn = serialize.load_reward(reward_type, "foobar", venv)
    pred_reward = reward_fn(obs, actions, next_obs, steps)

    assert pred_reward.shape == (TRAJECTORY_LEN, )
    assert isinstance(pred_reward[0], numbers.Number)
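
Example 4 also relies on a `_sample` helper that is not shown; a plausible sketch, with the behaviour inferred from how it is called:

import numpy as np


def _sample(space, n):
    """Draw `n` independent samples from a gym space and stack them (sketch)."""
    return np.array([space.sample() for _ in range(n)])
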
Example no. 5
def rollouts_and_policy(
  _run,
  _seed: int,
  env_name: str,
  total_timesteps: int,
  *,
  log_dir: str,
  num_vec: int,
  parallel: bool,
  max_episode_steps: Optional[int],
  normalize: bool,
  normalize_kwargs: dict,
  init_rl_kwargs: dict,

  n_episodes_eval: int,

  reward_type: Optional[str],
  reward_path: Optional[str],

  rollout_save_interval: int,
  rollout_save_final: bool,
  rollout_save_n_timesteps: Optional[int],
  rollout_save_n_episodes: Optional[int],

  policy_save_interval: int,
  policy_save_final: bool,

  init_tensorboard: bool,
) -> dict:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  Checkpoints:
    At applicable training steps `step` (where step is either an integer or
    "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      normalize_kwargs: kwargs for `VecNormalize`.
      init_rl_kwargs: kwargs for `init_rl`.

      n_episodes_eval: The number of episodes to average over when calculating
          the average ground truth reward return of the final policy.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.

      init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
          and "output/summary/...".

  Returns:
    The return value of `rollout_stats()` using the final policy.
  """
  os.makedirs(log_dir, exist_ok=True)
  sacred_util.build_sacred_symlink(log_dir, _run)

  sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                rollout_save_n_episodes)
  eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
      sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
      init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    log_callbacks = []
    with contextlib.ExitStack() as stack:
      if reward_type is not None:
        reward_fn_ctx = load_reward(reward_type, reward_path, venv)
        reward_fn = stack.enter_context(reward_fn_ctx)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        log_callbacks.append(venv.log_callback)
        tf.logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

      vec_normalize = None
      if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

      policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

      # Make callback to save intermediate artifacts during training.
      step = 0

      def callback(locals_: dict, _) -> bool:
        nonlocal step
        step += 1
        policy = locals_['self']

        # TODO(adam): make logging frequency configurable
        for callback in log_callbacks:
          callback(sb_logger)

        if rollout_save_interval > 0 and step % rollout_save_interval == 0:
          save_path = osp.join(rollout_dir, f"{step}.pkl")
          util.rollout.save(save_path, policy, venv, sample_until)
        if policy_save_interval > 0 and step % policy_save_interval == 0:
          output_dir = os.path.join(policy_dir, f'{step:05d}')
          serialize.save_stable_model(output_dir, policy, vec_normalize)
        return True  # Continue training.

      policy.learn(total_timesteps, callback=callback)

      # Save final artifacts after training is complete.
      if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        util.rollout.save(save_path, policy, venv, sample_until)
      if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

      # Final evaluation of expert policy.
      trajs = util.rollout.generate_trajectories(
          policy, venv, eval_sample_until)
      stats = util.rollout.rollout_stats(trajs)

  return stats
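
As a consumer-side illustration of the checkpoint layout described in Example 5's docstring, a sketch of loading the final rollout file; the pickle layout is an assumption (the file is written by `util.rollout.save`).

import os.path as osp
import pickle


def load_final_rollouts(log_dir):
    """Load the trajectories written to `{log_dir}/rollouts/final.pkl` (sketch)."""
    with open(osp.join(log_dir, "rollouts", "final.pkl"), "rb") as f:
        return pickle.load(f)
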
Example no. 6
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: generated by Sacred.
    env_name: Gym environment identifier.
    eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
    max_episode_steps: If not None, then environments are wrapped by
        TimeLimit so that they have at most `max_episode_steps` steps per
        episode.
    render: If True, renders interactively to the screen.
    render_fps: The target number of frames per second to render on screen.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)
    policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    reward_type: If specified, overrides the environment reward with
        a reward of this type.
    reward_path: If reward_type is specified, the path to a serialized reward
        of `reward_type` to override the environment reward with.

  Returns:
    Return value of `imitation.util.rollout.rollout_stats()`.
  """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize.load(os.path.join(policy_path, "vec_normalize.pkl"), venv)
    venv.training = False
    venv.norm_reward = False

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
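
The `InteractiveRender` wrapper used in Examples 6 and 8 is not shown. A hedged sketch of what such a wrapper could look like, assuming the Stable Baselines `VecEnvWrapper` interface (for the SB3-based Example 8 the import would come from `stable_baselines3` instead); the frame-rate throttling detail is a guess.

import time

from stable_baselines.common.vec_env import VecEnvWrapper


class InteractiveRender(VecEnvWrapper):
    """Render the wrapped VecEnv to screen on every reset and step (sketch)."""

    def __init__(self, venv, fps):
        super().__init__(venv)
        self.render_fps = fps

    def reset(self):
        obs = self.venv.reset()
        self.venv.render()
        return obs

    def step_wait(self):
        if self.render_fps > 0:
            time.sleep(1 / self.render_fps)
        obs, rew, done, info = self.venv.step_wait()
        self.venv.render()
        return obs, rew, done, info
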
Example no. 7
def batch_reward_heatmaps(
    checkpoints_dir: Union[str, pathlib.Path],
    n_gen_trajs: int = 50,
    exp_trajs: Optional[List[types.Trajectory]] = None,
) -> Dict[pathlib.Path, plt.Figure]:
    """Build multiple mountain car reward heatmaps from a checkpoint directory.

    One plot is generated for every combination of action and checkpoint timestep.

    Args:
        checkpoints_dir: Path to `checkpoint/` directory from AIRL or GAIL output
            directory.
        n_gen_trajs: The number of trajectories to roll out from each generator
            checkpoint. The transitions in these trajectories are scatterplotted on
            top of the heatmap from the same checkpoint timestep. Nonpositive
            indicates that no trajectories should be plotted.
        exp_trajs: Expert trajectories for scatterplotting. Generator trajectories
            are dynamically generated from generator checkpoints.

    Returns:
        A dictionary mapping relative paths to `plt.Figure`. Every key is of the
        form "{action_name}/{checkpoint_step}" where action_name is "left",
        "neutral", or "right".
    """
    result = {}
    venv = vec_env.DummyVecEnv([lambda: gym.make("MountainCar-v0")])
    checkpoints_dir = pathlib.Path(checkpoints_dir)
    for checkpoint_dir in sorted(checkpoints_dir.iterdir()):
        vec_normalize_path = checkpoint_dir / "gen_policy" / "vec_normalize.pkl"
        discrim_path = checkpoint_dir / "discrim.pt"
        policy_path = checkpoint_dir / "gen_policy"

        if n_gen_trajs > 0:
            # `load_policy` automatically loads VecNormalize for policy evaluation.
            gen_policy = policies_serialize.load_policy(
                "ppo", str(policy_path), venv)
            gen_trajs = rollout.generate_trajectories(
                gen_policy,
                venv,
                sample_until=rollout.min_episodes(n_gen_trajs))
        else:
            gen_trajs = None

        # `gen_trajs` contains unnormalized observations.
        # Load VecNormalize for use in RewardFn, which doesn't automatically
        # normalize input observations.
        with open(vec_normalize_path, "rb") as f:
            vec_normalize = pickle.load(f)  # type: vec_env.VecNormalize
        vec_normalize.training = False

        reward_fn = rewards_serialize.load_reward("DiscrimNet", discrim_path,
                                                  venv)
        norm_rew_fn = common.build_norm_reward_fn(reward_fn=reward_fn,
                                                  vec_normalize=vec_normalize)
        for act in range(MC_NUM_ACTS):
            fig = make_heatmap(
                act=act,
                reward_fn=norm_rew_fn,
                gen_trajs=gen_trajs,
                exp_trajs=exp_trajs,
            )
            path = pathlib.Path(ACT_NAMES[act], checkpoint_dir.name)
            result[path] = fig
    return result
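
Example 7 leans on `common.build_norm_reward_fn`, which is not shown. Given the comment that the reward function does not normalize observations automatically, a hedged sketch is to apply the loaded VecNormalize statistics before calling the reward function; the keyword-only signature is an assumption.

def build_norm_reward_fn(*, reward_fn, vec_normalize):
    """Wrap `reward_fn` so that raw observations are normalized first (sketch)."""

    def norm_reward_fn(obs, acts, next_obs, steps):
        # `VecNormalize.normalize_obs` applies the running observation statistics.
        return reward_fn(
            vec_normalize.normalize_obs(obs),
            acts,
            vec_normalize.normalize_obs(next_obs),
            steps,
        )

    return norm_reward_fn
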
Example no. 8
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    videos: bool,
    video_kwargs: Mapping[str, Any],
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
          uses `DummyVecEnv`.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      render: If True, renders interactively to the screen.
      render_fps: The target number of frames per second to render on screen.
      videos: If True, saves videos to `log_dir`.
      video_kwargs: Keyword arguments passed through to `video_wrapper.VideoWrapper`.
      log_dir: The directory to log intermediate output to, such as episode reward.
      policy_type: A unique identifier for the saved policy,
          defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      reward_type: If specified, overrides the environment reward with
          a reward of this type.
      reward_path: If reward_type is specified, the path to a serialized reward
          of `reward_type` to override the environment reward with.

    Returns:
      Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    logging.basicConfig(level=logging.INFO)
    logging.info("Logging to %s", log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    post_wrappers = (
        [video_wrapper_factory(log_dir, **video_kwargs)] if videos else None
    )
    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
        post_wrappers=post_wrappers,
    )

    try:
        if render:
            # As of July 31, 2020, DummyVecEnv rendering only works with num_vec=1
            # due to a bug in Stable Baselines 3.
            venv = InteractiveRender(venv, render_fps)

        if reward_type is not None:
            reward_fn = load_reward(reward_type, reward_path, venv)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        policy = serialize.load_policy(policy_type, policy_path, venv)
        trajs = rollout.generate_trajectories(policy, venv, sample_until)
        return rollout.rollout_stats(trajs)
    finally:
        venv.close()
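
Example 8's `video_wrapper_factory` helper is likewise not shown. A sketch of the shape it plausibly has, given `make_vec_env`'s `post_wrappers` argument; the per-env callable signature, the import path, and the `VideoWrapper` constructor arguments are all assumptions.

import os

from imitation.util import video_wrapper  # import path assumed


def video_wrapper_factory(log_dir, **video_kwargs):
    """Return a post-wrapper that records videos for each environment (sketch)."""

    def f(env, env_idx):
        # Hypothetical layout: one video directory per vectorized-environment index.
        directory = os.path.join(log_dir, "videos", str(env_idx))
        return video_wrapper.VideoWrapper(env, directory, **video_kwargs)

    return f
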
Example no. 9
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: Optional[str] = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
    """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):

      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
      env_name: The gym.Env name. Loaded as VecEnv.
      total_timesteps: Number of training timesteps in `model.learn()`.
      log_dir: The root directory to save metrics and checkpoints to.
      num_vec: Number of environments in VecEnv.
      parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      normalize: If True, then rescale observations and reward.
      make_blank_policy_kwargs: Kwargs for `make_blank_policy`.

      reward_type: If provided, then load the serialized reward of this type,
          wrapping the environment in this reward. This is useful to test
          whether a reward model transfers. For more information, see
          `imitation.rewards.serialize.load_reward`.
      reward_path: A specifier, such as a path to a file on disk, used by
          reward_type to load the reward model. For more information, see
          `imitation.rewards.serialize.load_reward`.

      rollout_save_interval: The number of training updates in between
          intermediate rollout saves. If the argument is nonpositive, then
          don't save intermediate updates.
      rollout_save_final: If True, then save rollouts right after training is
          finished.
      rollout_save_n_timesteps: The minimum number of timesteps saved in every
          file. Could be more than `rollout_save_n_timesteps` because
          trajectories are saved by episode rather than by transition.
          Must set exactly one of `rollout_save_n_timesteps`
          and `rollout_save_n_episodes`.
      rollout_save_n_episodes: The number of episodes saved in every
          file. Must set exactly one of `rollout_save_n_timesteps` and
          `rollout_save_n_episodes`.

      policy_save_interval: The number of training updates between saves. Has
          the same semantics as `rollout_save_interval`.
      policy_save_final: If True, then save the policy right after training is
          finished.
  """
    _validate_traj_generate_params(rollout_save_n_timesteps,
                                   rollout_save_n_episodes)

    with util.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                            format_strs=['tensorboard', 'stdout'])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps)

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv)

            policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_['self']

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    util.rollout.save(rollout_dir,
                                      policy,
                                      venv,
                                      step,
                                      n_timesteps=rollout_save_n_timesteps,
                                      n_episodes=rollout_save_n_episodes)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f'{step:05d}')
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)
                return True  # Continue training.

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                util.rollout.save(rollout_dir,
                                  policy,
                                  venv,
                                  "final",
                                  n_timesteps=rollout_save_n_timesteps,
                                  n_episodes=rollout_save_n_episodes)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)
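
Example 9 calls `_validate_traj_generate_params`, which is not shown. Based on the docstring's "must set exactly one of" requirement, a minimal sketch:

def _validate_traj_generate_params(n_timesteps, n_episodes):
    """Check that exactly one of the two rollout-saving criteria is set (sketch)."""
    if (n_timesteps is None) == (n_episodes is None):
        raise ValueError(
            "Must set exactly one of n_timesteps and n_episodes.")
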
Example no. 10
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
      At applicable training steps `step` (where step is either an integer or
      "final"):

        - Policies are saved to `{log_dir}/policies/{step}/`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.

        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.

        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.

        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition.
            Must set exactly one of `rollout_save_n_timesteps`
            and `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every
            file. Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.

        policy_save_interval: The number of timesteps between intermediate policy
            saves. If the argument is nonpositive, then don't save intermediate
            policies.
        policy_save_final: If True, then save the policy right after training is
            finished.

        init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
            and "output/summary/...".

    Returns:
      The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(rollout_save_n_timesteps,
                                             rollout_save_n_episodes)
    eval_sample_until = rollout.min_episodes(n_episodes_eval)

    logging.basicConfig(level=logging.INFO)
    logger.configure(folder=osp.join(log_dir, "rl"),
                     format_strs=["tensorboard", "stdout"])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
        # sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
        # Convert sacred's ReadOnlyDict to dict so we can modify on next line.
        init_rl_kwargs = dict(init_rl_kwargs)
        # init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir
        # FIXME(sam): this is another hack to prevent SB3 from configuring the
        # logger on the first .learn() call. Remove it once SB3 issue #109 is
        # fixed.
        init_rl_kwargs["tensorboard_log"] = None

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    callback_objs = []
    if reward_type is not None:
        reward_fn = load_reward(reward_type, reward_path, venv)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        callback_objs.append(venv.make_log_callback())
        logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

    if policy_save_interval > 0:
        save_policy_callback = serialize.SavePolicyCallback(
            policy_dir, vec_normalize)
        save_policy_callback = callbacks.EveryNTimesteps(
            policy_save_interval, save_policy_callback)
        callback_objs.append(save_policy_callback)
    callback = callbacks.CallbackList(callback_objs)

    policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)
    policy.learn(total_timesteps, callback=callback)

    # Save final artifacts after training is complete.
    if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        rollout.rollout_and_save(save_path, policy, venv, sample_until)
    if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

    # Final evaluation of expert policy.
    trajs = rollout.generate_trajectories(policy, venv, eval_sample_until)
    stats = rollout.rollout_stats(trajs)

    return stats
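
Example 10's `serialize.SavePolicyCallback` is referenced but not defined here. Assuming the Stable Baselines 3 callback interface that `callbacks.EveryNTimesteps` expects, a hedged sketch; the directory naming scheme and the `serialize` import path are assumptions.

import os

from stable_baselines3.common.callbacks import BaseCallback

from imitation.policies import serialize  # import path assumed


class SavePolicyCallback(BaseCallback):
    """Save the current policy each time the wrapping EveryNTimesteps fires (sketch)."""

    def __init__(self, policy_dir, vec_normalize=None, verbose=0):
        super().__init__(verbose)
        self.policy_dir = policy_dir
        self.vec_normalize = vec_normalize

    def _on_step(self) -> bool:
        # `self.num_timesteps` and `self.model` are provided by BaseCallback.
        output_dir = os.path.join(self.policy_dir, f"{self.num_timesteps:012d}")
        serialize.save_stable_model(output_dir, self.model, self.vec_normalize)
        return True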