Code example #1
    def set_expert_dataset(
        self,
        expert_data: Union[
            types.TransitionsMinimal, datasets.Dataset[types.TransitionsMinimal],
        ],
    ):
        """Replace the current expert dataset with a new one.

        Useful for DAgger and other interactive algorithms.

        Args:
             expert_data: Either a `Dataset[types.TransitionsMinimal]` for which
                 `.size()` is not None, or an instance of `TransitionsMinimal`, which
                 is automatically converted to a shuffled, epoch-order
                 `Dataset[types.TransitionsMinimal]`.
        """
        if isinstance(expert_data, types.TransitionsMinimal):
            trans = expert_data
            expert_dataset = datasets.TransitionsDictDatasetAdaptor(
                trans, datasets.EpochOrderDictDataset
            )
        else:
            assert isinstance(expert_data, datasets.Dataset)
            expert_dataset = expert_data
        assert expert_dataset.size() is not None
        self.expert_dataset = expert_dataset
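
A minimal usage sketch of the method above, assuming it lives on `bc.BC` (constructed as in example #3 below) and reusing the rollout path from example #2; exact constructor keywords are taken from those snippets:

import gym

from imitation.algorithms import bc
from imitation.data import datasets, rollout, types

env = gym.make("CartPole-v1")
trajs = types.load("tests/data/expert_models/cartpole_0/rollouts/final.pkl")
trans = rollout.flatten_trajectories(trajs)

trainer = bc.BC(env.observation_space, env.action_space, expert_data=trans)

# Raw transitions: automatically wrapped in a shuffled, epoch-order dataset.
trainer.set_expert_dataset(trans)

# Pre-built dataset: must report a non-None size().
ds = datasets.TransitionsDictDatasetAdaptor(trans, datasets.EpochOrderDictDataset)
trainer.set_expert_dataset(ds)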
Code example #2
def trainer(_algorithm_cls, _parallel: bool, tmpdir: str,
            _convert_dataset: bool):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    trajs = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    if _convert_dataset:
        trans = rollout.flatten_trajectories(trajs)
        expert_data = datasets.TransitionsDictDatasetAdaptor(trans)
    else:
        expert_data = rollout.flatten_trajectories(trajs)

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_policy = util.init_rl(venv, verbose=1)

    return _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        gen_policy=gen_policy,
        log_dir=tmpdir,
    )
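
A hypothetical invocation of this factory. `adversarial.GAIL` stands in for any `_algorithm_cls` that accepts the `venv`/`expert_data`/`gen_policy`/`log_dir` keywords passed above; whether GAIL's constructor in this snapshot takes exactly these arguments is an assumption:

import tempfile

from imitation.algorithms import adversarial

with tempfile.TemporaryDirectory() as tmpdir:
    gail_trainer = trainer(
        adversarial.GAIL, _parallel=False, tmpdir=tmpdir, _convert_dataset=True
    )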
Code example #3
def trainer(request, session, venv):
    convert_dataset = request.param
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    if convert_dataset:
        data = datasets.TransitionsDictDatasetAdaptor(
            data, datasets.EpochOrderDictDataset)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=data)
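
Since this fixture reads `request.param`, it is presumably parametrized over the two dataset forms; a sketch of the decorator it would carry (the exact `params` list is an assumption):

import pytest

# Assumed decorator: request.param toggles the convert_dataset branch above.
@pytest.fixture(params=[True, False])
def trainer(request, session, venv):
    ...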
Code example #4
File: test_bc.py  Project: whoiszyc/imitation-1
def test_train_from_random_dict_dataset(venv):
    # make sure that we can construct BC instance & train from a RandomDictDataset
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    data = datasets.TransitionsDictDatasetAdaptor(data,
                                                  datasets.RandomDictDataset)
    trainer = bc.BC(venv.observation_space,
                    venv.action_space,
                    expert_data=data)
    trainer.train(n_epochs=1)
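
The same adaptor backs either sampling strategy. A short sketch contrasting the two dict-dataset classes seen on this page; the `sample(n)` call and the exact sampling semantics are assumptions about this snapshot of the library:

random_ds = datasets.TransitionsDictDatasetAdaptor(data, datasets.RandomDictDataset)
epoch_ds = datasets.TransitionsDictDatasetAdaptor(data, datasets.EpochOrderDictDataset)

# Assumed Dataset API: sample(n) returns a batch of n transitions.
batch = random_ds.sample(32)  # presumably independent uniform draws
batch = epoch_ds.sample(32)   # presumably shuffled, epoch-ordered draws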
Code example #5
File: test_data.py  Project: whoiszyc/imitation-1
    def trans_ds_rew(self, transitions_rew, dict_dataset_params):
        dict_dataset_cls, dict_dataset_kwargs = dict_dataset_params
        return datasets.TransitionsDictDatasetAdaptor(
            transitions_rew, dict_dataset_cls, dict_dataset_kwargs
        )
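
The `dict_dataset_params` fixture is not shown on this page; a hypothetical companion that would satisfy the unpacking above:

import pytest

from imitation.data import datasets

@pytest.fixture(params=[
    (datasets.EpochOrderDictDataset, {}),  # hypothetical parametrization
    (datasets.RandomDictDataset, {}),
])
def dict_dataset_params(request):
    return request.param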
Code example #6
File: adversarial.py  Project: whoiszyc/imitation-1
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_algo: base_class.BaseAlgorithm,
        discrim: discrim_nets.DiscrimNet,
        expert_data: Union[datasets.Dataset[types.Transitions],
                           types.Transitions],
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
        device: Union[str, th.device] = "auto",
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_algo: The generator RL algorithm that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
            discrim: The discriminator network. This will be moved to the same
              device as `gen_algo`.
            expert_data: Either a `Dataset` of expert `Transitions`, or an instance of
              `Transitions` to be automatically converted into a
              `Dataset[Transitions]`.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transition
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half are
              generator samples.)
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches and an Adam update is
              applied to the gradient resulting from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer for discriminator training.
            disc_opt_kwargs: Parameters for discriminator training.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a TensorBoard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.train_env`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._global_step = 0
        self._disc_step = 0

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size
        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self.gen_algo = gen_algo
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self.discrim = discrim.to(self.gen_algo.device)
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._disc_opt = self._disc_opt_cls(self.discrim.parameters(),
                                            **self._disc_opt_kwargs)

        if self._init_tensorboard:
            logging.info("building summary directory at " + self._log_dir)
            summary_dir = os.path.join(self._log_dir, "summary")
            os.makedirs(summary_dir, exist_ok=True)
            self._summary_writer = thboard.SummaryWriter(summary_dir)

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.venv_train = self.venv_test = self.venv
        else:
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.discrim.predict_reward_test)

        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
        self.gen_algo.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)

        if isinstance(expert_data, types.Transitions):
            # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
            expert_data = datasets.TransitionsDictDatasetAdaptor(
                expert_data,  # pytype: disable=wrong-arg-types
            )
        self._expert_dataset = expert_data

        expert_ds_size = self._expert_dataset.size()
        if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
            warnings.warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "every discrim batch.",
                category=RuntimeWarning,
            )
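
A self-contained sketch of the batch-size arithmetic this constructor enforces, using the default values shown above (the concrete expert dataset size is made up for illustration):

disc_batch_size = 2048     # default shown above
disc_minibatch_size = 256  # default shown above

assert disc_batch_size % disc_minibatch_size == 0  # 8 minibatches per disc batch
assert disc_minibatch_size % 2 == 0                # 128 expert + 128 generator

expert_ds_size = 500  # made-up dataset size for illustration
if disc_batch_size // 2 > expert_ds_size:  # 1024 > 500: the warning fires
    print("expert samples will be reused in every discriminator batch")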
Code example #7
    def __init__(
        self,
        venv: vec_env.VecEnv,
        gen_policy: base_class.BaseRLModel,
        discrim: discrim_net.DiscrimNet,
        expert_data: Union[datasets.Dataset[types.Transitions],
                           types.Transitions],
        *,
        log_dir: str = "output/",
        disc_batch_size: int = 2048,
        disc_minibatch_size: int = 256,
        disc_opt_cls: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
        disc_opt_kwargs: Optional[Mapping] = None,
        gen_replay_buffer_capacity: Optional[int] = None,
        init_tensorboard: bool = False,
        init_tensorboard_graph: bool = False,
        debug_use_ground_truth: bool = False,
    ):
        """Builds AdversarialTrainer.

        Args:
            venv: The vectorized environment to train in.
            gen_policy: The generator policy that is trained to maximize
              discriminator confusion. The generator batch size
              `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
            discrim: The discriminator network.
            expert_data: Either a `Dataset` of expert `Transitions`, or an instance of
                `Transitions` to be automatically converted into a
                `Dataset[Transitions]`.
            log_dir: Directory to store TensorBoard logs, plots, etc. in.
            disc_batch_size: The default number of expert and generator transition
              samples to feed to the discriminator in each call to
              `self.train_disc()`. (Half of the samples are expert and half are
              generator samples.)
            disc_minibatch_size: The discriminator minibatch size. Each
              discriminator batch is split into minibatches and an Adam update is
              applied to the gradient resulting from each minibatch. Must evenly
              divide `disc_batch_size`. Must be an even number.
            disc_opt_cls: The optimizer for discriminator training.
            disc_opt_kwargs: Parameters for discriminator training.
            gen_replay_buffer_capacity: The capacity of the
              generator replay buffer (the number of obs-action-obs samples from
              the generator that can be stored).

              By default this is equal to `self.gen_batch_size`, meaning that we
              sample only from the most recent batch of generator samples.
            init_tensorboard: If True, makes various discriminator
              TensorBoard summaries.
            init_tensorboard_graph: If both this and `init_tensorboard` are True,
              then write a TensorBoard graph summary to disk.
            debug_use_ground_truth: If True, use the ground truth reward for
              `self.train_env`.
              This disables the reward wrapping that would normally replace
              the environment reward with the learned reward. This is useful for
              sanity checking that the policy training is functional.
        """
        assert (logger.is_configured()
                ), "Requires call to imitation.util.logger.configure"
        self._sess = tf.get_default_session()
        self._global_step = tf.train.create_global_step()

        assert disc_batch_size % disc_minibatch_size == 0
        assert disc_minibatch_size % 2 == 0, (
            "discriminator minibatch size must be even "
            "(equal split between generator and expert samples)")
        self.disc_batch_size = disc_batch_size
        self.disc_minibatch_size = disc_minibatch_size
        self.debug_use_ground_truth = debug_use_ground_truth
        self.venv = venv
        self._gen_policy = gen_policy
        self._log_dir = log_dir

        # Create graph for optimising/recording stats on discriminator
        self._discrim = discrim
        self._disc_opt_cls = disc_opt_cls
        self._disc_opt_kwargs = disc_opt_kwargs or {}
        self._init_tensorboard = init_tensorboard
        self._init_tensorboard_graph = init_tensorboard_graph
        self._build_graph()
        self._sess.run(tf.global_variables_initializer())

        if debug_use_ground_truth:
            # Would use an identity reward fn here, but RewardFns can't see rewards.
            self.reward_train = self.reward_test = None
            self.venv_train = self.venv_test = self.venv
        else:
            self.reward_train = partial(
                self.discrim.reward_train,
                # The generator policy uses normalized observations
                # but the reward function (self.reward_train) and discriminator use
                # and receive unnormalized observations. Therefore to get the right
                # log action probs for AIRL's ent bonus, we need to normalize obs.
                gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized,
            )
            self.reward_test = self.discrim.reward_test
            self.venv_train = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_train)
            self.venv_test = reward_wrapper.RewardVecEnvWrapper(
                self.venv, self.reward_test)

        self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
        self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
        self.gen_policy.set_env(self.venv_train_norm)

        if gen_replay_buffer_capacity is None:
            gen_replay_buffer_capacity = self.gen_batch_size
        self._gen_replay_buffer = buffer.ReplayBuffer(
            gen_replay_buffer_capacity, self.venv)

        if isinstance(expert_data, types.Transitions):
            # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
            expert_data = datasets.TransitionsDictDatasetAdaptor(
                expert_data,  # pytype: disable=wrong-arg-types
            )
        self._expert_dataset = expert_data

        expert_ds_size = self.expert_dataset.size()
        if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
            warnings.warn(
                "The discriminator batch size is more than twice the number of "
                "expert samples. This means that we will be reusing expert samples "
                "every discrim batch.",
                category=RuntimeWarning,
            )
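
This is the TF1 counterpart of example #6: `gen_policy`/`n_batch` replace `gen_algo`/`n_steps`, `tf.train.AdamOptimizer` replaces `th.optim.Adam`, and a default session plus global step are fetched up front. A sketch of the preconditions this constructor asserts, reusing the `logger.configure` call from example #2 (the output directory is a placeholder):

import tensorflow as tf

from imitation.util import logger

logger.configure("output/", ["stdout"])  # satisfies logger.is_configured()
with tf.Session().as_default():          # satisfies tf.get_default_session()
    pass  # an AdversarialTrainer subclass would be constructed here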