Example 1
    def __init__(
        self,
        venv: VecEnv,
        gen_policy: BaseRLModel,
        discrim: discrim_net.DiscrimNet,
        expert_demos: types.Transitions,
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: tf.train.Optimizer = tf.train.AdamOptimizer,
        disc_opt_kwargs: dict = {},
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds Trainer.

        Args:
            venv: The vectorized environment to train in.
            gen_policy: The generator policy that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
            discrim: The discriminator network.
              For GAIL, use a DiscrimNetGAIL. For AIRL, use a DiscrimNetAIRL.
            expert_demos: Transitions from an expert dataset.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transition
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half are
              generator samples.)
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches, and an optimizer update
              is applied using the gradient computed from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer class used for discriminator training.
            disc_opt_kwargs: Keyword arguments passed to the discriminator optimizer
              constructor.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a TensorBoard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.venv_train`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._sess = tf.get_default_session()
        self._global_step = tf.train.create_global_step()

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size

        self.debug_use_ground_truth = debug_use_ground_truth

        self.venv = venv
        self._expert_demos = expert_demos
        self._gen_policy = gen_policy

        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self._discrim = discrim
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._build_graph()
        self._sess.run(tf.global_variables_initializer())

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.reward_train = self.reward_test = None
            self.venv_train = self.venv_test = self.venv
        else:
            self.reward_train = partial(
                self.discrim.reward_train,
                # The generator policy uses normalized observations,
                # but the reward function (self.reward_train) and the discriminator
                # receive unnormalized observations. Therefore, to get the right
                # log action probabilities for AIRL's entropy bonus, we must
                # normalize the observations first.
                gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized,
            )
            self.reward_test = self.discrim.reward_test
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_test)

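        # Note: BufferingWrapper sits inside VecNormalize below, so the rollouts
        # it records keep unnormalized observations (cf. the gen_log_prob_fn
        # comment above).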
        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = VecNormalize(self.venv_train_buffering)
        self.gen_policy.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)
        self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos)
        if self.disc_batch_size // 2 > len(self._exp_replay_buffer):
            warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "in every discriminator batch.")
Example 2
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_algo: on_policy_algorithm.OnPolicyAlgorithm,
        discrim: discrim_nets.DiscrimNet,
        expert_data: Union[Iterable[Mapping], types.Transitions],
        expert_batch_size: int,
        n_disc_updates_per_round: int = 2,
        *,
        log_dir: str = "output/",
        normalize_obs: bool = True,
        normalize_reward: bool = True,
        disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_algo: The generator RL algorithm that is trained to maximize
                discriminator confusion. The generator batch size
                `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
            discrim: The discriminator network. This will be moved to the same
                device as `gen_algo`.
            expert_data: Either a `torch.utils.data.DataLoader`-like object or an
                instance of `Transitions`, which is automatically converted into a
                shuffled version of the former type.

                If the argument passed is a `DataLoader`, then it must yield batches of
                expert data via its `__iter__` method. Each batch is a dictionary whose
                keys "obs", "acts", "next_obs", and "dones" correspond to Tensor or
                NumPy array values, each with batch dimension equal to
                `expert_batch_size`. If any batch dimension doesn't equal
                `expert_batch_size`, then a `ValueError` is raised.

                If the argument is a `Transitions` instance, then `len(expert_data)`
                must be at least `expert_batch_size`.
            expert_batch_size: The number of samples in each batch yielded from
                the expert data loader. The discriminator batch size is twice this
                number because each discriminator batch contains a generator sample for
                every expert sample.
            n_disc_updates_per_round: The number of discriminator updates after each
                round of generator updates in AdversarialTrainer.learn().
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            normalize_obs: Whether to normalize observations with `VecNormalize`.
            normalize_reward: Whether to normalize rewards with `VecNormalize`.
            disc_opt_cls: The optimizer class used for discriminator training.
            disc_opt_kwargs: Keyword arguments passed to the discriminator optimizer
                constructor.
            gen_replay_buffer_capacity: The capacity of the
                generator replay buffer (the number of obs-action-obs samples from
                the generator that can be stored).

                By default this is equal to `self.gen_batch_size`, meaning that we
                sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
                TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
                then write a TensorBoard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
                `self.venv_train`.
                This disables the reward wrapping that would normally replace
                the environment reward with the learned reward. This is useful for
                sanity checking that the policy training is functional.
        """

        assert (
            logger.is_configured()
        ), "Requires call to imitation.util.logger.configure"
        self._global_step = 0
        self._disc_step = 0
        self.n_disc_updates_per_round = n_disc_updates_per_round

        if expert_batch_size <= 0:
            raise ValueError(f"expert_batch_size={expert_batch_size} must be positive.")

        self.expert_batch_size = expert_batch_size
        if isinstance(expert_data, types.Transitions):
            if len(expert_data) < expert_batch_size:
                raise ValueError(
                    "Provided Transitions instance as `expert_data` argument but "
                    "len(expert_data) < expert_batch_size. "
                    f"({len(expert_data)} < {expert_batch_size})."
                )

            self.expert_data_loader = th_data.DataLoader(
                expert_data,
                batch_size=expert_batch_size,
                collate_fn=types.transitions_collate_fn,
                shuffle=True,
                drop_last=True,
            )
        else:
            self.expert_data_loader = expert_data
        self._endless_expert_iterator = util.endless_iter(self.expert_data_loader)

        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self.gen_algo = gen_algo
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self.discrim = discrim.to(self.gen_algo.device)
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._disc_opt = self._disc_opt_cls(
            self.discrim.parameters(), **self._disc_opt_kwargs
        )

        if self._init_tensorboard:
            logging.info("building summary directory at " + self._log_dir)
            summary_dir = os.path.join(self._log_dir, "summary")
            os.makedirs(summary_dir, exist_ok=True)
            self._summary_writer = thboard.SummaryWriter(summary_dir)

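        # Wrapper order: buffer raw rollouts first, then normalize observations
        # only. Reward normalization is applied after the (possibly learned)
        # reward wrapper below.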
        self.venv_buffering = wrappers.BufferingWrapper(self.venv)
        self.venv_norm_obs = vec_env.VecNormalize(
            self.venv_buffering,
            norm_reward=False,
            norm_obs=normalize_obs,
        )

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.venv_wrapped = self.venv_norm_obs
            self.gen_callback = None
        else:
            self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper(
                self.venv_norm_obs, self.discrim.predict_reward_train
            )
            self.gen_callback = self.venv_wrapped.make_log_callback()
        self.venv_train = vec_env.VecNormalize(
            self.venv_wrapped, norm_obs=False, norm_reward=normalize_reward
        )

        self.gen_algo.set_env(self.venv_train)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv
        )
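The `expert_data` / `expert_batch_size` contract described in the docstring above can be sketched with a toy `torch.utils.data.DataLoader`. `ToyTransitions` is a hypothetical stand-in for an expert dataset, not the `imitation` `Transitions` class, and the default collate function is used instead of `types.transitions_collate_fn`:

import numpy as np
from torch.utils.data import DataLoader, Dataset


class ToyTransitions(Dataset):
    """Dict-valued dataset mimicking the expected batch keys."""

    def __init__(self, n: int = 64, obs_dim: int = 4):
        self.obs = np.random.randn(n, obs_dim).astype(np.float32)
        self.acts = np.random.randint(0, 2, size=(n,))
        self.next_obs = np.random.randn(n, obs_dim).astype(np.float32)
        self.dones = np.zeros(n, dtype=bool)

    def __len__(self):
        return len(self.obs)

    def __getitem__(self, i):
        return {
            "obs": self.obs[i],
            "acts": self.acts[i],
            "next_obs": self.next_obs[i],
            "dones": self.dones[i],
        }


expert_batch_size = 32
loader = DataLoader(
    ToyTransitions(), batch_size=expert_batch_size, shuffle=True, drop_last=True
)
# Each yielded batch is a dict whose values have batch dimension expert_batch_size.
batch = next(iter(loader))
assert set(batch) == {"obs", "acts", "next_obs", "dones"}
assert all(v.shape[0] == expert_batch_size for v in batch.values())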
Example 3
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_algo: base_class.BaseAlgorithm,
        discrim: discrim_nets.DiscrimNet,
        expert_data: Union[datasets.Dataset[types.Transitions],
                           types.Transitions],
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
        device: Union[str, th.device] = "auto",
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_algo: The generator RL algorithm that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
            discrim: The discriminator network. This will be moved to the same
              device as `gen_algo`.
            expert_data: Either a `Dataset` of expert `Transitions`, or an instance of
              `Transitions` to be automatically converted into a
              `Dataset[Transitions]`.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transition
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half are
              generator samples.)
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches, and an optimizer update
              is applied using the gradient computed from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer class used for discriminator training.
            disc_opt_kwargs: Keyword arguments passed to the discriminator optimizer
              constructor.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a TensorBoard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.venv_train`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._global_step = 0
        self._disc_step = 0

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size
        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self.gen_algo = gen_algo
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self.discrim = discrim.to(self.gen_algo.device)
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._disc_opt = self._disc_opt_cls(self.discrim.parameters(),
                                            **self._disc_opt_kwargs)

        if self._init_tensorboard:
            logging.info("building summary directory at " + self._log_dir)
            summary_dir = os.path.join(self._log_dir, "summary")
            os.makedirs(summary_dir, exist_ok=True)
            self._summary_writer = thboard.SummaryWriter(summary_dir)

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.venv_train = self.venv_test = self.venv
        else:
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_test)

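        # BufferingWrapper records rollouts before VecNormalize is applied, so the
        # buffered samples keep unnormalized observations.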
        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
        self.gen_algo.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)

        if isinstance(expert_data, types.Transitions):
            # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
            expert_data = datasets.TransitionsDictDatasetAdaptor(
                expert_data,  # pytype: disable=wrong-arg-types
            )
        self._expert_dataset = expert_data

        expert_ds_size = self._expert_dataset.size()
        if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
            warnings.warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "every discrim batch.",
                category=RuntimeWarning,
            )
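Conceptually, `reward_wrapper.RewardVecEnvWrapper` as used above replaces the environment's reward with the learned reward while leaving everything else untouched. Below is a minimal sketch of that idea on top of Stable Baselines 3's `VecEnvWrapper`; it is illustrative, not the `imitation` implementation, and the `reward_fn` signature `(obs, acts, next_obs, dones) -> rewards` is an assumption of the sketch:

import numpy as np
from stable_baselines3.common.vec_env import VecEnvWrapper


class ReplaceRewardWrapper(VecEnvWrapper):
    """Discard the wrapped VecEnv's rewards and recompute them from reward_fn."""

    def __init__(self, venv, reward_fn):
        super().__init__(venv)
        self.reward_fn = reward_fn
        self._last_obs = None
        self._last_acts = None

    def reset(self):
        self._last_obs = self.venv.reset()
        return self._last_obs

    def step_async(self, actions):
        self._last_acts = actions
        self.venv.step_async(actions)

    def step_wait(self):
        next_obs, _, dones, infos = self.venv.step_wait()
        # Environment rewards are ignored; the learned reward is used instead.
        rews = np.asarray(
            self.reward_fn(self._last_obs, self._last_acts, next_obs, dones)
        )
        self._last_obs = next_obs
        return next_obs, rews, dones, infos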