Example #1
def test_reward_overwrite():
    """Test that reward wrapper actually overwrites base rewards."""
    env_name = "Pendulum-v0"
    num_envs = 3
    env = util.make_vec_env(env_name, num_envs)
    reward_fn = FunkyReward()
    wrapped_env = reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
    policy = RandomPolicy(env.observation_space, env.action_space)
    sample_until = rollout.min_episodes(10)
    default_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, env, sample_until))
    wrapped_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, wrapped_env, sample_until))
    # Pendulum-v0 always has negative rewards
    assert default_stats["return_max"] < 0
    # ours gives between 1 * traj_len and num_envs * traj_len reward
    # (trajectories are all constant length of 200 in Pendulum)
    steps = wrapped_stats["len_mean"]
    assert wrapped_stats["return_min"] == 1 * steps
    assert wrapped_stats["return_max"] == num_envs * steps

    # Check that the overridden rewards from the wrapper are non-negative, and
    # that the original env reward (stored in infos as "wrapped_env_rew") is
    # negative, since all Pendulum rewards are negative.
    rand_act, _, _, _ = policy.step(wrapped_env.reset())
    _, rew, _, infos = wrapped_env.step(rand_act)
    assert np.all(rew >= 0)
    assert np.all([info_dict["wrapped_env_rew"] < 0 for info_dict in infos])
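FunkyReward is referenced but not defined in this example. Below is a minimal stand-in, assuming (per the assertions above) that the i-th vectorized env always receives reward i + 1 and that the wrapper calls the reward function with batched per-env arrays; the exact argument names expected by RewardVecEnvWrapper vary between imitation versions, so treat them as assumptions.

import numpy as np


class FunkyRewardSketch:
    """Hypothetical reward fn: the i-th vectorized env always gets reward i + 1."""

    def __call__(self, obs, acts, next_obs, dones):
        # One reward per environment in the batch: 1, 2, ..., num_envs.
        return np.arange(1, len(obs) + 1, dtype=float)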
Example #2
    def __init__(
        self,
        venv: VecEnv,
        rollouts: Sequence[types.Trajectory],
        imitation_trainer: BaseRLModel,
        *,
        standardise_inputs: bool = True,
        kernel: str = "gaussian",
        kernel_bandwidth: float = 0.5,
        density_type: str = STATE_ACTION_DENSITY,
        is_stationary: bool = False,
    ):
        r"""Family of simple imitation learning baseline algorithms that apply RL to
         maximise a rough density estimate of the demonstration trajectories.
         Specifically, it constructs a non-parametric estimate of `p(s)`, `p(s,s')`,
         `p_t(s,a)`, etc. (depending on options), then rewards the imitation learner
         with `r_t(s,a,s')=\log p_t(s,a,s')` (or `\log p(s,s')`, or whatever the
         user wants the model to condition on).

         Args:
             venv: environment to train on.
             rollouts: list of expert trajectories to imitate.
             imitation_trainer: RL algorithm & initial policy that will
                 be used to train the imitation learner.
             standardise_inputs, kernel, kernel_bandwidth, density_type,
                 is_stationary: these are passed directly to `DensityReward`;
                 refer to documentation for that class."""
        self.venv = venv
        self.imitation_trainer = imitation_trainer
        self.reward_fn = DensityReward(
            trajectories=rollouts,
            density_type=density_type,
            obs_space=self.venv.observation_space,
            act_space=self.venv.action_space,
            is_stationary=is_stationary,
            kernel=kernel,
            kernel_bandwidth=kernel_bandwidth,
            standardise_inputs=standardise_inputs,
        )
        self.wrapped_env = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.reward_fn)
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())
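The reward described in the docstring, r(s, a) = log p(s, a), is the log-density of demonstration state-action pairs under a kernel density estimate. The sketch below illustrates that idea on toy data, with scikit-learn's KernelDensity standing in for DensityReward's internal estimator (an assumption; the real implementation is not shown here).

import numpy as np
from sklearn.neighbors import KernelDensity

# Toy flattened (state, action) pairs from expert demonstrations.
demo_sa = np.random.randn(500, 4)

# Gaussian KDE with the same kernel/bandwidth options the constructor exposes.
kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(demo_sa)

# score_samples returns log p(s, a), i.e. the imitation reward of each query point.
query_sa = np.random.randn(8, 4)
rewards = kde.score_samples(query_sa)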
Example #3
    def wrap_env_train_reward(self, env):
        """Returns the given Env wrapped with a reward function that returns
    the AIRL training reward (discriminator confusion).

    The wrapped `Env`'s reward is directly evaluated from the reward network,
    and therefore changes whenever `self.train()` is called.

    Args:
        env (str, Env, or VecEnv): The Env that we want to wrap. If a
            string environment name or an Env is given, then we first
            convert it to a VecEnv before continuing.

    Returns:
        wrapped_env (VecEnv): The wrapped environment with a new reward.
    """
        env = util.maybe_load_env(env, vectorize=True)
        if self.debug_use_ground_truth:
            return env
        else:
            return reward_wrapper.RewardVecEnvWrapper(
                env, self._policy_train_reward_fn)
Example #4
    def wrap_env_test_reward(self, env):
        """Returns the given Env wrapped with a reward function that returns
    the reward learned by this Trainer.

    The wrapped `Env`'s reward is directly evaluated from the reward network,
    and therefore changes whenever `self.train()` is called.

    Args:
        env (str, Env, or VecEnv): The Env that should be wrapped. If a
            string environment name or an Env is given, then we first
            convert it to a VecEnv before continuing.

    Returns:
        wrapped_env (VecEnv): The wrapped environment with a new reward.
    """
        env = util.maybe_load_env(env, vectorize=True)
        if self.debug_use_ground_truth:
            return env
        else:
            return reward_wrapper.RewardVecEnvWrapper(env,
                                                      self._test_reward_fn)
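A hedged usage sketch for the two wrappers above. It reuses only helpers that appear elsewhere on this page (rollout.generate_trajectories, rollout.min_episodes, rollout.rollout_stats); the import path and the trainer.gen_policy attribute are assumptions.

from imitation.util import rollout  # assumed import path for the rollout helpers


def eval_learned_reward(trainer, env_name="CartPole-v1", n_episodes=5):
    """Sketch: roll the generator policy out under the learned test-time reward."""
    venv_test = trainer.wrap_env_test_reward(env_name)
    trajs = rollout.generate_trajectories(
        trainer.gen_policy, venv_test, rollout.min_episodes(n_episodes))
    return rollout.rollout_stats(trajs)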
Example #5
def adversarial_learning(
    venv,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    state_only=False,
    policy_fn=get_ppo,
    total_timesteps=20000,
    gen_batch_size=200,
    disc_batch_size=100,
    updates_per_batch=2,
    policy_lr=1e-3,
    reward_lr=1e-3,
    is_airl=True,
    **kwargs,
):
    # Set up generator
    gen_policy = policy_fn(venv, learning_rate=policy_lr)
    policy = gen_policy

    # Set up discriminator
    if is_airl:
        rn = BasicShapedRewardNet(
            venv.observation_space,
            venv.action_space,
            theta_units=[32, 32],
            phi_units=[32, 32],
            scale=True,
            state_only=state_only,
        )
        discrim = DiscrimNetAIRL(rn, entropy_weight=1.0)
    else:
        rn = None
        discrim = DiscrimNetGAIL(venv.observation_space, venv.action_space)

    # Set up optimizer
    train_op = tf.train.AdamOptimizer(learning_rate=reward_lr).minimize(
        tf.reduce_mean(discrim.disc_loss))

    # Set up environment reward
    reward_train = functools.partial(
        discrim.reward_train, gen_log_prob_fn=gen_policy.action_probability)
    venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_train)
    venv_train_buffering = BufferingWrapper(venv_train)
    gen_policy.set_env(venv_train_buffering)  # possibly redundant

    # Set up replay buffers
    gen_replay_buffer_capacity = 20 * gen_batch_size
    gen_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity, venv)

    if expert_trajectories is not None:
        expert_transitions = flatten_trajectories(expert_trajectories)
        exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_transitions)
    else:
        exp_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity,
                                                venv)

    # Start training
    sess = tf.get_default_session()
    sess.run(tf.global_variables_initializer())

    num_epochs = int(np.ceil(total_timesteps / gen_batch_size))

    for epoch in range(num_epochs):
        # Train gen
        gen_policy.learn(total_timesteps=gen_batch_size,
                         reset_num_timesteps=True)
        gen_replay_buffer.store(venv_train_buffering.pop_transitions())

        if expert_trajectories is None:
            exp_replay_buffer.store(
                flatten_trajectories(
                    sample_trajectories(expert_venv,
                                        expert,
                                        n_timesteps=gen_batch_size)))

        # Train disc
        for _ in range(updates_per_batch):
            disc_minibatch_size = disc_batch_size // updates_per_batch
            half_minibatch = disc_minibatch_size // 2

            gen_samples = gen_replay_buffer.sample(half_minibatch)
            expert_samples = exp_replay_buffer.sample(half_minibatch)

            obs = np.concatenate([gen_samples.obs, expert_samples.obs])
            acts = np.concatenate([gen_samples.acts, expert_samples.acts])
            next_obs = np.concatenate(
                [gen_samples.next_obs, expert_samples.next_obs])
            labels = np.concatenate(
                [np.ones(half_minibatch),
                 np.zeros(half_minibatch)])

            log_act_prob = gen_policy.action_probability(obs,
                                                         actions=acts,
                                                         logp=True)
            log_act_prob = log_act_prob.reshape((disc_minibatch_size, ))

            _, logits_v, loss_v = sess.run(
                [
                    train_op,
                    discrim._disc_logits_gen_is_high,
                    discrim._disc_loss,
                ],
                feed_dict={
                    discrim.obs_ph: obs,
                    discrim.act_ph: acts,
                    discrim.next_obs_ph: next_obs,
                    discrim.labels_gen_is_one_ph: labels,
                    discrim.log_policy_act_prob_ph: log_act_prob,
                },
            )

    results = {}
    results["reward_model"] = rn
    results["discrim"] = discrim
    results["policy"] = gen_policy

    return results
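A hedged sketch of calling adversarial_learning. util.make_vec_env appears in Example #1 (its import path is assumed here), the expert trajectory format is assumed to be whatever flatten_trajectories accepts, and a default TensorFlow session must be active, since adversarial_learning calls tf.get_default_session().

from imitation import util  # assumed import path for make_vec_env


def run_airl(expert_trajs, total_timesteps=20000):
    """Sketch: train AIRL against pre-collected expert trajectories."""
    venv = util.make_vec_env("CartPole-v1", 4)
    results = adversarial_learning(
        venv,
        expert_trajectories=expert_trajs,
        state_only=False,
        total_timesteps=total_timesteps,
        is_airl=True,
    )
    return results["policy"], results["reward_model"]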
Example #6
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_algo: on_policy_algorithm.OnPolicyAlgorithm,
        discrim: discrim_nets.DiscrimNet,
        expert_data: Union[Iterable[Mapping], types.Transitions],
        expert_batch_size: int,
        n_disc_updates_per_round: int = 2,
        *,
        log_dir: str = "output/",
        normalize_obs: bool = True,
        normalize_reward: bool = True,
        disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_algo: The generator RL algorithm that is trained to maximize
                discriminator confusion. The generator batch size
                `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
            discrim: The discriminator network. This will be moved to the same
                device as `gen_algo`.
            expert_data: Either a `torch.utils.data.DataLoader`-like object or an
                instance of `Transitions` which is automatically converted into a
                shuffled version of the former type.

                If the argument passed is a `DataLoader`, then it must yield batches of
                expert data via its `__iter__` method. Each batch is a dictionary whose
                keys "obs", "acts", "next_obs", and "dones", correspond to Tensor or
                NumPy array values each with batch dimension equal to
                `expert_batch_size`. If any batch dimension doesn't equal
                `expert_batch_size` then a `ValueError` is raised.

                If the argument is a `Transitions` instance, then `len(expert_data)`
                must be at least `expert_batch_size`.
            expert_batch_size: The number of samples in each batch yielded from
                the expert data loader. The discriminator batch size is twice this
                number because each discriminator batch contains a generator sample for
                every expert sample.
            n_disc_updates_per_round: The number of discriminator updates after each
                round of generator updates in AdversarialTrainer.learn().
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            normalize_obs: Whether to normalize observations with `VecNormalize`.
            normalize_reward: Whether to normalize rewards with `VecNormalize`.
            disc_opt_cls: The optimizer for discriminator training.
            disc_opt_kwargs: Parameters for discriminator training.
            gen_replay_buffer_capacity: The capacity of the
                generator replay buffer (the number of obs-action-obs samples from
                the generator that can be stored).

                By default this is equal to `self.gen_batch_size`, meaning that we
                sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
                TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
                then write a Tensorboard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
                `self.train_env`.
                This disables the reward wrapping that would normally replace
                the environment reward with the learned reward. This is useful for
                sanity checking that the policy training is functional.
        """

        assert (
            logger.is_configured()
        ), "Requires call to imitation.util.logger.configure"
        self._global_step = 0
        self._disc_step = 0
        self.n_disc_updates_per_round = n_disc_updates_per_round

        if expert_batch_size <= 0:
            raise ValueError(f"expert_batch_size={expert_batch_size} must be positive.")

        self.expert_batch_size = expert_batch_size
        if isinstance(expert_data, types.Transitions):
            if len(expert_data) < expert_batch_size:
                raise ValueError(
                    "Provided Transitions instance as `expert_data` argument but "
                    "len(expert_data) < expert_batch_size. "
                    f"({len(expert_data)} < {expert_batch_size})."
                )

            self.expert_data_loader = th_data.DataLoader(
                expert_data,
                batch_size=expert_batch_size,
                collate_fn=types.transitions_collate_fn,
                shuffle=True,
                drop_last=True,
            )
        else:
            self.expert_data_loader = expert_data
        self._endless_expert_iterator = util.endless_iter(self.expert_data_loader)

        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self.gen_algo = gen_algo
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self.discrim = discrim.to(self.gen_algo.device)
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._disc_opt = self._disc_opt_cls(
            self.discrim.parameters(), **self._disc_opt_kwargs
        )

        if self._init_tensorboard:
            logging.info("building summary directory at " + self._log_dir)
            summary_dir = os.path.join(self._log_dir, "summary")
            os.makedirs(summary_dir, exist_ok=True)
            self._summary_writer = thboard.SummaryWriter(summary_dir)

        self.venv_buffering = wrappers.BufferingWrapper(self.venv)
        self.venv_norm_obs = vec_env.VecNormalize(
            self.venv_buffering,
            norm_reward=False,
            norm_obs=normalize_obs,
        )

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.venv_wrapped = self.venv_norm_obs
            self.gen_callback = None
        else:
            self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
                self.venv_norm_obs, self.discrim.predict_reward_train
            )
            self.gen_callback = self.venv_wrapped.make_log_callback()
        self.venv_train = vec_env.VecNormalize(
            self.venv_wrapped, norm_obs=False, norm_reward=normalize_reward
        )

        self.gen_algo.set_env(self.venv_train)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv
        )
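The expert_data contract in the docstring (batches are dicts with "obs", "acts", "next_obs" and "dones" values, each with batch dimension expert_batch_size) can be met by any iterable, not just a torch DataLoader. Below is a minimal, hypothetical loader over NumPy arrays; in practice, passing a Transitions instance and letting the trainer build the DataLoader is simpler.

import numpy as np


class MinimalExpertLoader:
    """Endlessly yields fixed-size expert batches in the documented dict format."""

    def __init__(self, obs, acts, next_obs, dones, batch_size):
        self.data = dict(obs=obs, acts=acts, next_obs=next_obs, dones=dones)
        self.batch_size = batch_size
        self.n = len(obs)

    def __iter__(self):
        start = 0
        while True:  # the trainer wraps loaders in an endless iterator anyway
            idx = np.arange(start, start + self.batch_size) % self.n
            start = (start + self.batch_size) % self.n
            yield {k: v[idx] for k, v in self.data.items()}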
Example #7
    def __init__(
        self,
        venv: VecEnv,
        gen_policy: BaseRLModel,
        discrim: discrim_net.DiscrimNet,
        expert_demos: rollout.Transitions,
        *,
        log_dir: str = 'output/',
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: tf.train.Optimizer = tf.train.AdamOptimizer,
        disc_opt_kwargs: dict = {},
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds Trainer.

    Args:
        venv: The vectorized environment to train in.
        gen_policy: The generator policy that is trained to maximize
          discriminator confusion. The generator batch size
          `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
        discrim: The discriminator network.
          For GAIL, use a DiscrimNetGAIL. For AIRL, use a DiscrimNetAIRL.
        expert_demos: Transitions from an expert dataset.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        disc_batch_size: The default number of expert and generator transitions
          samples to feed to the discriminator in each call to
          `self.train_disc()`. (Half of the samples are expert and half of the
          samples are generator).
        disc_minibatch_size: The discriminator minibatch size. Each
          discriminator batch is split into minibatches and an Adam update is
          applied on the gradient resulting from each minibatch. Must evenly
          divide `disc_batch_size`. Must be an even number.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training.
        gen_replay_buffer_capacity: The capacity of the
          generator replay buffer (the number of obs-action-obs samples from
          the generator that can be stored).

          By default this is equal to `20 * self.gen_batch_size`.
        init_tensorboard: If True, makes various discriminator
          TensorBoard summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
          then write a Tensorboard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
          `self.train_env`.
          This disables the reward wrapping that would normally replace
          the environment reward with the learned reward. This is useful for
          sanity checking that the policy training is functional.
    """
        assert util.logger.is_configured(), ("Requires call to "
                                             "imitation.util.logger.configure")
        self._sess = tf.get_default_session()
        self._global_step = tf.train.create_global_step()

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size

        self.debug_use_ground_truth = debug_use_ground_truth

        self.venv = venv
        self._expert_demos = expert_demos
        self._gen_policy = gen_policy

        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self._discrim = discrim
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._build_graph()
        self._sess.run(tf.global_variables_initializer())

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.reward_train = self.reward_test = None
            self.venv_train = self.venv_test = self.venv
        else:
            self.reward_train = partial(
                self.discrim.reward_train,
                gen_log_prob_fn=self._gen_policy.action_probability)
            self.reward_test = self.discrim.reward_test
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_test)

        self.venv_train_norm = VecNormalize(self.venv_train)
        self.venv_train_norm_buffering = BufferingWrapper(self.venv_train_norm)
        self.gen_policy.set_env(self.venv_train_norm_buffering)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = 20 * self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)
        self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos)
        if self.disc_batch_size // 2 > len(self._exp_replay_buffer):
            warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing samples every "
                "discrim batch.")
Example #8
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: generated by Sacred.
    env_name: Gym environment identifier.
    eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
    max_episode_steps: If not None, then environments are wrapped by
        TimeLimit so that they have at most `max_episode_steps` steps per
        episode.
    render: If True, renders interactively to the screen.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)
    policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    reward_type: If specified, overrides the environment reward with
        a reward of this type.
    reward_path: If reward_type is specified, the path to a serialized reward
        of `reward_type` to override the environment reward with.

  Returns:
    Return value of `imitation.util.rollout.rollout_stats()`.
  """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize(venv, training=False, norm_reward=False)
    venv = venv.load(policy_path + "/vec_normalize.pkl", venv)

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)
Example #9
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_algo: base_class.BaseAlgorithm,
        discrim: discrim_nets.DiscrimNet,
        expert_data: Union[datasets.Dataset[types.Transitions],
                           types.Transitions],
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
        device: Union[str, th.device] = "auto",
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_algo: The generator RL algorithm that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
            discrim: The discriminator network. This will be moved to the same
              device as `gen_algo`.
            expert_data: Either a `Dataset` of expert `Transitions`, or an instance of
              `Transitions` to be automatically converted into a
              `Dataset[Transitions]`.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transitions
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half of the
              samples are generator).
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches and an Adam update is
              applied on the gradient resulting from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer for discriminator training.
            disc_opt_kwargs: Parameters for discriminator training.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a Tensorboard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.train_env`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._global_step = 0
        self._disc_step = 0

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size
        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self.gen_algo = gen_algo
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self.discrim = discrim.to(self.gen_algo.device)
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._disc_opt = self._disc_opt_cls(self.discrim.parameters(),
                                            **self._disc_opt_kwargs)

        if self._init_tensorboard:
            logging.info("building summary directory at " + self._log_dir)
            summary_dir = os.path.join(self._log_dir, "summary")
            os.makedirs(summary_dir, exist_ok=True)
            self._summary_writer = thboard.SummaryWriter(summary_dir)

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.venv_train = self.venv_test = self.venv
        else:
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_test)

        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
        self.gen_algo.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)

        if isinstance(expert_data, types.Transitions):
            # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
            expert_data = datasets.TransitionsDictDatasetAdaptor(
                expert_data,  # pytype: disable=wrong-arg-types
            )
        self._expert_dataset = expert_data

        expert_ds_size = self._expert_dataset.size()
        if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
            warnings.warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "every discrim batch.",
                category=RuntimeWarning,
            )
Example #10
def preferences(
    venv,
    expert=None,
    evaluate_trajectories_fn=None,
    n_pairs_per_batch=50,
    n_timesteps_per_query=None,
    reward_lr=1e-3,
    policy_lr=1e-3,
    policy_epoch_timesteps=200,
    total_timesteps=10000,
    state_only=False,
    use_rnd_bonus=False,
    rnd_lr=1e-3,
    rnd_coeff=0.5,
    normalize_extrinsic=False,
    egreedy_sampling=False,
    **kwargs,
):
    if n_pairs_per_batch is None:
        horizon = get_horizon(venv)
        n_pairs_per_batch = (n_timesteps_per_query / (2 * horizon))

    if evaluate_trajectories_fn is None:
        reward_eval_fn = reward_eval_path_fn(venv)
        evaluate_trajectories_fn = get_eval_trajectories_fn(reward_eval_fn)

    # Create reward model
    rn = BasicShapedRewardNet(
        venv.observation_space,
        venv.action_space,
        theta_units=[32, 32],
        phi_units=[32, 32],
        scale=True,
        state_only=state_only,
    )

    # Compute trajectory probabilities
    preferences_ph = tf.placeholder(
        shape=(None, 2),
        dtype=tf.float32,
        name="preferences",
    )
    num_segments = 2 * tf.shape(preferences_ph)[0]
    rewards_out = tf.reshape(rn.reward_output_train, [num_segments, -1])
    returns_out = tf.reduce_sum(rewards_out, axis=1)
    returns = tf.reshape(returns_out, shape=[-1, 2])
    log_probs = tf.nn.log_softmax(returns, axis=1)

    # Write loss and optimizer op
    loss = (-1) * tf.reduce_sum(log_probs * preferences_ph)
    optimizer = tf.train.AdamOptimizer(learning_rate=reward_lr)
    reward_train_op = optimizer.minimize(loss)

    base_extrinsic_reward_fn = get_reward_fn_from_model(rn)

    if not use_rnd_bonus:
        reward_fn = base_extrinsic_reward_fn
    else:
        # Random network distillation bonus
        rnd_size = 50

        inputs = [rn.obs_inp, rn.act_inp]
        inputs = [tf.layers.flatten(x) for x in inputs]
        inputs = tf.concat(inputs, axis=1)

        rnd_target_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_target = sequential(inputs, rnd_target_net)

        rnd_pred_net = build_mlp([32, 32, 32], output_size=rnd_size)
        rnd_pred = sequential(inputs, rnd_pred_net)

        rnd_loss = tf.reduce_mean((tf.stop_gradient(rnd_target) - rnd_pred)**2)
        rnd_optimizer = tf.train.AdamOptimizer(learning_rate=rnd_lr)
        rnd_train_op = rnd_optimizer.minimize(rnd_loss)

        runn_rnd_rews = RunningMeanVar(alpha=0.01)

        def rnd_reward_fn(obs, acts=None, *args, **kwargs):
            if acts is None:
                acts = [venv.action_space.sample()]
            int_rew = sess.run(rnd_loss,
                               feed_dict={
                                   rn.obs_ph: obs,
                                   rn.act_ph: acts
                               })
            int_rew_old = int_rew
            int_rew = runn_rnd_rews.exp_update(int_rew)

            return int_rew

        if normalize_extrinsic:
            runn_ext_rews = RunningMeanVar(alpha=0.01)

        def extrinsic_reward_fn(*args, **kwargs):
            ext_rew = base_extrinsic_reward_fn(*args, **kwargs)
            if normalize_extrinsic:
                ext_rew = runn_ext_rews.exp_update(ext_rew)
            return ext_rew

        def reward_fn(*args, **kwargs):
            return extrinsic_reward_fn(
                *args, **kwargs) + rnd_coeff * rnd_reward_fn(*args, **kwargs)

    # Create learner from reward model
    venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
    policy = PPO2(MlpPolicy, venv_train, learning_rate=policy_lr)

    # Start training
    sess = tf.get_default_session()
    sess.run(tf.global_variables_initializer())

    sampling_policy = make_egreedy(policy,
                                   venv) if egreedy_sampling else policy

    num_epochs = int(np.ceil(total_timesteps / policy_epoch_timesteps))

    for epoch in range(num_epochs):
        trajectories = sample_trajectories(venv, sampling_policy,
                                           2 * n_pairs_per_batch)

        segments = get_segments(trajectories)

        seg_returns = evaluate_trajectories_fn(segments)
        seg_returns = seg_returns.reshape(-1, 2)
        preferences = np.stack(
            [
                seg_returns[:, 0] > seg_returns[:, 1],
                seg_returns[:, 1] > seg_returns[:, 0],
            ],
            axis=1,
        )

        obs = np.concatenate([seg.obs for seg in segments])
        acts = np.concatenate([seg.acts for seg in segments])
        next_obs = np.concatenate([seg.next_obs for seg in segments])

        ops = [reward_train_op]
        if use_rnd_bonus:
            ops.append(rnd_train_op)

        sess.run(
            ops,
            feed_dict={
                rn.obs_ph: obs,
                rn.act_ph: acts,
                rn.next_obs_ph: next_obs,
                preferences_ph: preferences,
            },
        )

        policy.learn(total_timesteps=policy_epoch_timesteps)

    results = {}
    results["reward_model"] = rn
    results["policy"] = policy

    return results
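The preference loss above is the standard Bradley-Terry cross-entropy over segment pairs: a softmax over the two segment returns gives the modelled preference probability, and the loss is the summed negative log-probability of the observed preferences. A small NumPy illustration of the same computation:

import numpy as np


def preference_loss(returns_pairs, prefs):
    """returns_pairs: (n, 2) predicted segment returns; prefs: (n, 2) one-hot labels."""
    # Numerically stable log-softmax over each pair, mirroring tf.nn.log_softmax.
    shifted = returns_pairs - returns_pairs.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -(log_probs * prefs).sum()


returns_pairs = np.array([[5.0, 2.0], [1.0, 4.0]])
prefs = np.array([[1.0, 0.0], [0.0, 1.0]])  # left segment preferred, then right
loss = preference_loss(returns_pairs, prefs)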
Example #11
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    videos: bool,
    video_kwargs: Mapping[str, Any],
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

    Args:
      _seed: generated by Sacred.
      env_name: Gym environment identifier.
      eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
          one of `eval_n_episodes` and `eval_n_timesteps`.
      num_vec: Number of environments to run simultaneously.
      parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
          uses `DummyVecEnv`.
      max_episode_steps: If not None, then environments are wrapped by
          TimeLimit so that they have at most `max_episode_steps` steps per
          episode.
      render: If True, renders interactively to the screen.
      render_fps: The target number of frames per second to render on screen.
      videos: If True, saves videos to `log_dir`.
      video_kwargs: Keyword arguments passed through to `video_wrapper.VideoWrapper`.
      log_dir: The directory to log intermediate output to, such as episode reward.
      policy_type: A unique identifier for the saved policy,
          defined in POLICY_CLASSES.
      policy_path: A path to the serialized policy.
      reward_type: If specified, overrides the environment reward with
          a reward of this type.
      reward_path: If reward_type is specified, the path to a serialized reward
          of `reward_type` to override the environment reward with.

    Returns:
      Return value of `imitation.util.rollout.rollout_stats()`.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    logging.basicConfig(level=logging.INFO)
    logging.info("Logging to %s", log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    post_wrappers = [video_wrapper_factory(log_dir, **video_kwargs)
                     ] if videos else None
    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
        post_wrappers=post_wrappers,
    )

    try:
        if render:
            # As of July 31, 2020, DummyVecEnv rendering only works with num_vec=1
            # due to a bug in Stable Baselines 3.
            venv = InteractiveRender(venv, render_fps)

        if reward_type is not None:
            reward_fn = load_reward(reward_type, reward_path, venv)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        policy = serialize.load_policy(policy_type, policy_path, venv)
        trajs = rollout.generate_trajectories(policy, venv, sample_until)
        return rollout.rollout_stats(trajs)
    finally:
        venv.close()
Example #12
    def __init__(self,
                 venv: VecEnv,
                 gen_policy: BaseRLModel,
                 discrim: discrim_net.DiscrimNet,
                 expert_demos: rollout.Transitions,
                 *,
                 disc_opt_cls: tf.train.Optimizer = tf.train.AdamOptimizer,
                 disc_opt_kwargs: dict = {},
                 n_disc_samples_per_buffer: int = 200,
                 gen_replay_buffer_capacity: Optional[int] = None,
                 init_tensorboard: bool = False,
                 init_tensorboard_graph: bool = False,
                 debug_use_ground_truth: bool = False):
        """Builds Trainer.

    Args:
        venv: The vectorized environment to train in.
        gen_policy: The generator policy that is trained to maximize
                    discriminator confusion.
        discrim: The discriminator network.
            For GAIL, use a DiscrimNetGAIL. For AIRL, use a DiscrimNetAIRL.
        expert_demos: Transitions from an expert dataset.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training.
        n_disc_samples_per_buffer: The number of obs-act-obs triples
            sampled from each replay buffer (expert and generator) during each
            step of discriminator training. This is also the number of triples
            stored in the replay buffer after each epoch of generator training.
        gen_replay_buffer_capacity: The capacity of the
            generator replay buffer (the number of obs-action-obs samples from
            the generator that can be stored).

            By default this is equal to `20 * n_disc_samples_per_buffer`.
        init_tensorboard: If True, makes various discriminator
            TensorBoard summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a Tensorboard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.train_env`.
            This disables the reward wrapping that would normally replace
            the environment reward with the learned reward. This is useful for
            sanity checking that the policy training is functional.
    """
        self._sess = tf.get_default_session()
        self._global_step = tf.train.create_global_step()

        self._n_disc_samples_per_buffer = n_disc_samples_per_buffer
        self.debug_use_ground_truth = debug_use_ground_truth

        self.venv = venv
        self._expert_demos = expert_demos
        self._gen_policy = gen_policy

        # Discriminator and reward output
        self._discrim = discrim
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs
        with tf.variable_scope("trainer"):
            with tf.variable_scope("discriminator"):
                self._build_disc_train()
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        if init_tensorboard:
            with tf.name_scope("summaries"):
                self._build_summarize()
        self._sess.run(tf.global_variables_initializer())

        if debug_use_ground_truth:
            self.venv_train = self.venv_test = self.venv
        else:
            reward_train = partial(
                self.discrim.reward_train,
                gen_log_prob_fn=self._gen_policy.action_probability)
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.reward_test)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = 20 * self._n_disc_samples_per_buffer
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)
        self._populate_gen_replay_buffer()
        self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos)
        if n_disc_samples_per_buffer > len(self._exp_replay_buffer):
            warn("The discriminator batch size is larger than the number of "
                 "expert samples.")
Example #13
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_policy: base_class.BaseRLModel,
        discrim: discrim_net.DiscrimNet,
        expert_data: Union[datasets.Dataset[types.Transitions],
                           types.Transitions],
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_policy: The generator policy that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
            discrim: The discriminator network.
            expert_data: Either a `Dataset` of expert `Transitions`, or an instance of
                `Transitions` to be automatically converted into a
                `Dataset[Transitions]`.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transitions
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half of the
              samples are generator).
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches and an Adam update is
              applied on the gradient resulting from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer for discriminator training.
            disc_opt_kwargs: Parameters for discriminator training.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a Tensorboard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.train_env`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._sess = tf.get_default_session()
        self._global_step = tf.train.create_global_step()

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size
        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self._gen_policy = gen_policy
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self._discrim = discrim
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._build_graph()
        self._sess.run(tf.global_variables_initializer())

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.reward_train = self.reward_test = None
            self.venv_train = self.venv_test = self.venv
        else:
            self.reward_train = partial(
                self.discrim.reward_train,
                # The generator policy uses normalized observations
                # but the reward function (self.reward_train) and discriminator use
                # and receive unnormalized observations. Therefore to get the right
                # log action probs for AIRL's ent bonus, we need to normalize obs.
                gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized,
            )
            self.reward_test = self.discrim.reward_test
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_test)

        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
        self.gen_policy.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)

        if isinstance(expert_data, types.Transitions):
            # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
            expert_data = datasets.TransitionsDictDatasetAdaptor(
                expert_data,  # pytype: disable=wrong-arg-types
            )
        self._expert_dataset = expert_data

        expert_ds_size = self._expert_dataset.size()
        if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
            warnings.warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "every discrim batch.",
                category=RuntimeWarning,
            )
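The inline comment in reward_train above explains why _gen_log_action_prob_from_unnormalized exists: the generator policy is trained on VecNormalize-normalized observations, while the discriminator and reward function receive raw ones. The method body is not shown, so the following is only a sketch of what it presumably does; VecNormalize.normalize_obs is the stable-baselines API for applying the stored normalization statistics, and the signature here is an assumption.

    def _gen_log_action_prob_from_unnormalized(self, observation, *, actions):
        """Sketch: generator log-probs for raw (unnormalized) observations."""
        # Normalize with the same running statistics the generator was trained on.
        obs_norm = self.venv_train_norm.normalize_obs(observation)
        return self._gen_policy.action_probability(
            obs_norm, actions=actions, logp=True)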