Example 1
def eval_policy(
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: generated by Sacred.
    env_name: Gym environment identifier.
    eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
    max_episode_steps: If not None, then environments are wrapped by
        TimeLimit so that they have at most `max_episode_steps` steps per
        episode.
    render: If True, renders interactively to the screen.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)
    policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    reward_type: If specified, overrides the environment reward with
        a reward of this.
    reward_path: If reward_type is specified, the path to a serialized reward
        of `reward_type` to override the environment reward with.

  Returns:
    Return value of `imitation.util.rollout.rollout_stats()`.
  """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
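
For orientation, a direct call to this function might look like the sketch below. The environment ID, paths, and config values are placeholders rather than values from the example; in the actual script the function is registered with Sacred and these arguments come from the experiment config.

stats = eval_policy(
    _seed=0,
    env_name="CartPole-v1",          # placeholder environment ID
    eval_n_timesteps=None,           # set exactly one of the two eval_n_* options
    eval_n_episodes=10,
    num_vec=4,
    parallel=False,
    render=False,
    render_fps=60,
    log_dir="output/eval_policy",    # placeholder directory
    policy_type="ppo2",
    policy_path="path/to/policy",    # placeholder path
)
# `rollout_stats()` aggregates per-episode statistics such as mean return.
print(stats)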
Example 2
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    videos: bool,
    video_kwargs: Mapping[str, Any],
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
          uses `DummyVecEnv`.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      render: If True, renders interactively to the screen.
      render_fps: The target number of frames per second to render on screen.
      videos: If True, saves videos to `log_dir`.
      video_kwargs: Keyword arguments passed through to `video_wrapper.VideoWrapper`.
      log_dir: The directory to log intermediate output to, such as episode reward.
      policy_type: A unique identifier for the saved policy,
          defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      reward_type: If specified, overrides the environment reward with
          a reward of this type.
      reward_path: If reward_type is specified, the path to a serialized reward
          of `reward_type` to override the environment reward with.

    Returns:
      Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    logging.basicConfig(level=logging.INFO)
    logging.info("Logging to %s", log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    post_wrappers = ([video_wrapper_factory(log_dir, **video_kwargs)]
                     if videos else None)
    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
        post_wrappers=post_wrappers,
    )

    try:
        if render:
            # As of July 31, 2020, DummyVecEnv rendering only works with num_vec=1
            # due to a bug in Stable Baselines 3.
            venv = InteractiveRender(venv, render_fps)

        if reward_type is not None:
            reward_fn = load_reward(reward_type, reward_path, venv)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        policy = serialize.load_policy(policy_type, policy_path, venv)
        trajs = rollout.generate_trajectories(policy, venv, sample_until)
        return rollout.rollout_stats(trajs)
    finally:
        venv.close()
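
`video_wrapper_factory` is a helper defined elsewhere in the same script and not shown above. A minimal sketch of such a factory is given below; it assumes that `make_vec_env` calls each post-wrapper as `wrapper(env, i)` with the sub-environment index `i`, and that `video_wrapper.VideoWrapper` (imported in the script) accepts a `directory` keyword. Both assumptions should be checked against the imitation version in use.

import os

def video_wrapper_factory(log_dir: str, **kwargs):
    """Returns a post-wrapper that records videos under `{log_dir}/videos/`."""

    def f(env, i):
        # Give each vectorized sub-environment its own video directory.
        directory = os.path.join(log_dir, "videos", str(i))
        return video_wrapper.VideoWrapper(env, directory=directory, **kwargs)

    return f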
Example 3
def train_preferences(
    _seed: int,  # pylint:disable=invalid-name
    # Dataset
    env_name: str,
    discount: float,
    num_vec: int,
    policy_type: str,
    policy_path: str,
    # Target specification
    target_reward_type: str,
    target_reward_path: str,
    # Model parameters
    model_reward_type: regress_utils.EnvRewardFactory,
    trajectory_length: int,
    total_timesteps: int,
    batch_timesteps: int,
    learning_rate: float,
    weight_l2_reg: float,
    reward_l2_reg: float,
    accuracy_threshold: float,
    # Logging
    log_dir: str,
) -> Mapping[str, Any]:
    """Entry-point into script for synthetic preference comparisons."""
    venv = util.make_vec_env(env_name, n_envs=num_vec, seed=_seed)

    make_source = functools.partial(regress_utils.make_model,
                                    model_reward_type)

    def make_trainer(model, model_scope, target):
        del target
        model_params = model_scope.global_variables()
        batch_size = batch_timesteps // trajectory_length
        kwargs = {"learning_rate": learning_rate}
        return preferences.PreferenceComparisonTrainer(
            model,
            model_params,
            batch_size=batch_size,
            optimizer_kwargs=kwargs,
            weight_l2_reg=weight_l2_reg,
            reward_l2_reg=reward_l2_reg,
            accuracy_threshold=accuracy_threshold,
        )

    with policies_serialize.load_policy(policy_type, policy_path,
                                        venv) as policy:

        def do_training(target, trainer):
            # Specify in terms of total_timesteps so that a longer
            # trajectory_length does not give the model more data.
            total_comparisons = total_timesteps // trajectory_length
            return trainer.fit_synthetic(
                venv,
                policy=policy,
                target=target,
                trajectory_length=trajectory_length,
                total_comparisons=total_comparisons,
            )

        return regress_utils.regress(
            seed=_seed,
            env_name=env_name,
            discount=discount,
            make_source=make_source,
            source_init=True,
            make_trainer=make_trainer,
            do_training=do_training,
            target_reward_type=target_reward_type,
            target_reward_path=target_reward_path,
            log_dir=log_dir,
        )
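
The integer divisions in `make_trainer` and `do_training` fix the data budget in environment timesteps rather than in number of comparisons. A small worked example with hypothetical budget values:

# Hypothetical values, purely to illustrate the budget arithmetic above.
trajectory_length = 5
batch_timesteps = 10_000
total_timesteps = 1_000_000

batch_size = batch_timesteps // trajectory_length         # 2_000 comparisons per batch
total_comparisons = total_timesteps // trajectory_length  # 200_000 comparisons in total

# Doubling trajectory_length halves both quantities, so the total amount of
# trajectory data seen by the model (comparisons * trajectory_length) is unchanged.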
Example 4
def train(_run, _seed: int, env_name: str, rollout_path: str, normalize: bool,
          normalize_kwargs: dict, n_expert_demos: Optional[int], log_dir: str,
          init_trainer_kwargs: dict, total_timesteps: int,
          n_episodes_eval: int, init_tensorboard: bool,
          checkpoint_interval: int, dac: bool, rollout_save_n_timesteps: int,
          rollout_save_n_episodes: int, num_vec: int, parallel: bool,
          max_episode_steps: Optional[int]) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

  Plots (turn on using `plot_interval > 0`):
    - Plot discriminator loss during discriminator training steps in blue and
      discriminator loss during generator training steps in red.
    - Plot the performance of the generator policy versus the performance of
      a random policy. Also plot the performance of an expert policy if that is
      provided in the arguments.

  Checkpoints:
    - DiscrimNets are saved to f"{log_dir}/checkpoints/{step}/discrim/",
      where step is either the training epoch or "final".
    - Generator policies are saved to
      f"{log_dir}/checkpoints/{step}/gen_policy/".

  Args:
    _seed: Random seed.
    env_name: The environment to train in.
    rollout_path: Path to pickle containing list of Trajectories. Used as
      expert demonstrations.
    n_expert_demos: The number of expert trajectories to actually use
      after loading them from `rollout_path`.
      If None, then use all available trajectories.
      If `n_expert_demos` is an `int`, then use exactly `n_expert_demos`
      trajectories, erroring if there aren't enough trajectories. If there are
      surplus trajectories, then use the
      first `n_expert_demos` trajectories and drop the rest.
    log_dir: Directory to save models and other logging to.

    init_trainer_kwargs: Keyword arguments passed to `init_trainer`,
      used to initialize the trainer.
    total_timesteps: The number of transitions to sample from the environment
      during training.
    n_episodes_eval: The number of episodes to average over when calculating
      the average episode reward of the imitation policy for return.

    plot_interval: The number of epochs between each plot. If negative,
      then plots are disabled. If zero, then only plot at the end of training.
    n_plot_episodes: The number of episodes averaged over when
      calculating the average episode reward of a policy for the performance
      plots.
    extra_episode_data_interval: Usually mean episode rewards are calculated
      immediately before every plot. Set this parameter to a nonnegative number
      to also add episode reward data points every
      `extra_episodes_data_interval` epochs.
    show_plots: Figures are always saved to `f"{log_dir}/plots/*.png"`. If
      `show_plots` is True, then also show plots as they are created.
    init_tensorboard: If True, then write tensorboard logs to `{log_dir}/sb_tb`.

    checkpoint_interval: Save the discriminator and generator models every
      `checkpoint_interval` epochs and after training is complete. If 0,
      then only save weights after training is complete. If <0, then don't
      save weights at all.

  Returns:
    A dictionary with two keys. "imit_stats" gives the return value of
      `rollout_stats()` on rollouts test-reward-wrapped
      environment, using the final policy (remember that the ground-truth reward
      can be recovered from the "monitor_return" key). "expert_stats" gives the
      return value of `rollout_stats()` on the expert demonstrations loaded from
      `rollout_path`.
  """
    total_timesteps = int(total_timesteps)

    tf.logging.info("Logging to %s", log_dir)
    os.makedirs(log_dir, exist_ok=True)
    # NOTE: Sacred symlink creation and expert rollout statistics (loaded from
    # `rollout_path`) are disabled in this variant of the script.
    sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                  rollout_save_n_episodes)

    with util.make_session():
        venv = util.make_vec_env(env_name,
                                 num_vec,
                                 seed=_seed,
                                 parallel=parallel,
                                 log_dir=log_dir,
                                 max_episode_steps=max_episode_steps,
                                 dac=dac)

        print("type of venv is: ", type(venv))
        vec_normalize = None
        venv = vec_normalize = VecNormalize(venv)
        print("type of venv is: ", type(venv))
        # time.sleep(10)
        gen_policy_path = os.path.join(log_dir, "checkpoints", "final",
                                       "gen_policy")
        print("gen policy path is: ", gen_policy_path)
        time.sleep(10)
        with serialize.load_policy('ppo2', gen_policy_path, venv) as policy:
            print(policy)
            print('right before: ', type(venv))
            time.sleep(10)
            util.rollout.save(gen_policy_path, policy, venv, sample_until)
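
Example 4 only saves rollouts to disk rather than returning the statistics its docstring describes. Assuming the saved file follows the same format as `rollout_path` above (a pickle containing a list of Trajectories), those statistics could be recovered afterwards with a sketch like this; the path is a placeholder:

import pickle

from imitation.util import rollout

with open("path/to/saved_rollouts.pkl", "rb") as f:   # placeholder path
    trajs = pickle.load(f)

# Same aggregation used for the return values of Examples 1 and 2.
stats = rollout.rollout_stats(trajs)
print(stats)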