def set_expert_dataset(
    self,
    expert_data: Union[
        types.TransitionsMinimal,
        datasets.Dataset[types.TransitionsMinimal],
    ],
):
    """Swap in a new expert dataset.

    Intended for DAgger and other interactive algorithms that need to
    change the demonstration data between training rounds.

    Args:
        expert_data: Either an instance of `TransitionsMinimal`, which is
            wrapped in a `TransitionsDictDatasetAdaptor` backed by
            `EpochOrderDictDataset`, or a ready-made
            `Dataset[types.TransitionsMinimal]`, which is stored as-is.
            In both cases `.size()` of the resulting dataset must not be
            None.

    Raises:
        AssertionError: If `expert_data` is neither a `TransitionsMinimal`
            nor a `Dataset`, or if the resulting dataset reports a size of
            None.
    """
    is_raw_transitions = isinstance(expert_data, types.TransitionsMinimal)
    if not is_raw_transitions:
        # Anything that is not raw transitions must already be a Dataset.
        assert isinstance(expert_data, datasets.Dataset)

    new_dataset = (
        datasets.TransitionsDictDatasetAdaptor(
            expert_data, datasets.EpochOrderDictDataset
        )
        if is_raw_transitions
        else expert_data
    )

    # Datasets of unknown size are rejected.
    assert new_dataset.size() is not None
    self.expert_dataset = new_dataset
def trainer(_algorithm_cls, _parallel: bool, tmpdir: str, _convert_dataset: bool):
    """Build an instance of `_algorithm_cls` on CartPole expert rollouts.

    Args:
        _algorithm_cls: Callable invoked with `venv`, `expert_data`,
            `gen_policy` and `log_dir` keyword arguments; its result is
            returned.
        _parallel: Forwarded to `util.make_vec_env` as `parallel`.
        tmpdir: Directory used for logger output and as the `log_dir` of
            both the vectorized environment and the trainer.
        _convert_dataset: If True, the flattened transitions are wrapped in
            a `TransitionsDictDatasetAdaptor` before being handed to the
            trainer; otherwise the flattened transitions are passed directly.

    Returns:
        Whatever `_algorithm_cls(...)` returns.
    """
    logger.configure(tmpdir, ["tensorboard", "stdout"])

    expert_trajectories = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    # Flattening is needed in both modes; only the wrapping is optional.
    expert_data = rollout.flatten_trajectories(expert_trajectories)
    if _convert_dataset:
        expert_data = datasets.TransitionsDictDatasetAdaptor(expert_data)

    vec_environment = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )
    generator_policy = util.init_rl(vec_environment, verbose=1)

    trainer_kwargs = dict(
        venv=vec_environment,
        expert_data=expert_data,
        gen_policy=generator_policy,
        log_dir=tmpdir,
    )
    return _algorithm_cls(**trainer_kwargs)
def trainer(request, session, venv):
    """Create a `bc.BC` trainer from the rollouts stored at `ROLLOUT_PATH`.

    Args:
        request: Object whose `param` attribute selects the data format:
            truthy wraps the transitions in a `TransitionsDictDatasetAdaptor`
            backed by `EpochOrderDictDataset`, falsy passes the flattened
            transitions directly.
        session: Not used in the body.
        venv: Supplies the observation and action spaces for the trainer.

    Returns:
        The constructed `bc.BC` instance.
    """
    wrap_in_dataset = request.param
    transitions = rollout.flatten_trajectories(types.load(ROLLOUT_PATH))

    if wrap_in_dataset:
        expert_data = datasets.TransitionsDictDatasetAdaptor(
            transitions, datasets.EpochOrderDictDataset)
    else:
        expert_data = transitions

    return bc.BC(
        venv.observation_space,
        venv.action_space,
        expert_data=expert_data,
    )
def test_train_from_random_dict_dataset(venv):
    """Check that BC can be built from, and trained on, a RandomDictDataset."""
    transitions = rollout.flatten_trajectories(types.load(ROLLOUT_PATH))
    random_dataset = datasets.TransitionsDictDatasetAdaptor(
        transitions, datasets.RandomDictDataset)

    bc_trainer = bc.BC(
        venv.observation_space,
        venv.action_space,
        expert_data=random_dataset,
    )
    # A single epoch is enough to exercise construction and the training loop.
    bc_trainer.train(n_epochs=1)
def trans_ds_rew(self, transitions_rew, dict_dataset_params):
    """Wrap `transitions_rew` in a `TransitionsDictDatasetAdaptor`.

    Args:
        transitions_rew: The transitions object to wrap.
        dict_dataset_params: A two-element iterable holding the dict dataset
            class and the keyword arguments for it, in that order.

    Returns:
        The adaptor built from the transitions, the dataset class and its
        keyword arguments.
    """
    backing_cls, backing_kwargs = dict_dataset_params
    adaptor = datasets.TransitionsDictDatasetAdaptor(
        transitions_rew,
        backing_cls,
        backing_kwargs,
    )
    return adaptor
def __init__(
    self,
    venv: vec_env.VecEnv,
    gen_algo: base_class.BaseAlgorithm,
    discrim: discrim_nets.DiscrimNet,
    expert_data: Union[datasets.Dataset[types.Transitions], types.Transitions],
    *,
    log_dir: str = "output/",
    disc_batch_size: int = 2048,
    disc_minibatch_size: int = 256,
    disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam,
    disc_opt_kwargs: Optional[Mapping] = None,
    gen_replay_buffer_capacity: Optional[int] = None,
    init_tensorboard: bool = False,
    init_tensorboard_graph: bool = False,
    debug_use_ground_truth: bool = False,
    device: Union[str, th.device] = "auto",
):
    """Builds AdversarialTrainer.

    Args:
        venv: The vectorized environment to train in.
        gen_algo: The generator RL algorithm that is trained to maximize
            discriminator confusion. The generator batch size
            `self.gen_batch_size` is inferred from `gen_algo.n_steps`.
        discrim: The discriminator network. This will be moved to the same
            device as `gen_algo`.
        expert_data: Either a `Dataset` of expert `Transitions`, or an instance
            of `Transitions` to be automatically converted into a
            `Dataset[Transitions]`.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        disc_batch_size: The default number of expert and generator transitions
            samples to feed to the discriminator in each call to
            `self.train_disc()`. (Half of the samples are expert and half of
            the samples are generator).
        disc_minibatch_size: The discriminator minibatch size. Each
            discriminator batch is split into minibatches and an Adam update is
            applied on the gradient resulting from each minibatch. Must evenly
            divide `disc_batch_size`. Must be an even number.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training. `None` is
            treated as an empty mapping.
        gen_replay_buffer_capacity: The capacity of the generator replay buffer
            (the number of obs-action-obs samples from the generator that can
            be stored). By default this is equal to `self.gen_batch_size`,
            meaning that we sample only from the most recent batch of generator
            samples.
        init_tensorboard: If True, makes various discriminator
            TensorBoard summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a Tensorboard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.venv_train`. This disables the reward wrapping that would
            normally replace the environment reward with the learned reward.
            This is useful for sanity checking that the policy training is
            functional.
        device: Accepted by the signature but not read anywhere in this
            constructor body; the discriminator is placed on
            `gen_algo.device` regardless of this value.

    Raises:
        AssertionError: If the logger has not been configured, if
            `disc_minibatch_size` does not evenly divide `disc_batch_size`,
            or if `disc_minibatch_size` is odd.
    """
    assert (logger.is_configured()
            ), "Requires call to imitation.util.logger.configure"
    # Step counters start at zero.
    self._global_step = 0
    self._disc_step = 0

    assert disc_batch_size % disc_minibatch_size == 0
    assert disc_minibatch_size % 2 == 0, (
        "discriminator minibatch size must be even "
        "(equal split between generator and expert samples)")
    self.disc_batch_size = disc_batch_size
    self.disc_minibatch_size = disc_minibatch_size
    self.debug_use_ground_truth = debug_use_ground_truth
    self.venv = venv
    self.gen_algo = gen_algo
    self._log_dir = log_dir

    # Create graph for optimising/recording stats on discriminator
    # The discriminator follows the generator's device, so `self.gen_algo`
    # must already be assigned at this point.
    self.discrim = discrim.to(self.gen_algo.device)
    self._disc_opt_cls = disc_opt_cls
    self._disc_opt_kwargs = disc_opt_kwargs or {}
    self._init_tensorboard = init_tensorboard
    self._init_tensorboard_graph = init_tensorboard_graph
    # The optimizer is built over the parameters of the already-moved
    # discriminator.
    self._disc_opt = self._disc_opt_cls(self.discrim.parameters(),
                                        **self._disc_opt_kwargs)

    # `self._summary_writer` is only created when TensorBoard summaries are
    # requested; it is not assigned otherwise.
    if self._init_tensorboard:
        logging.info("building summary directory at " + self._log_dir)
        summary_dir = os.path.join(self._log_dir, "summary")
        os.makedirs(summary_dir, exist_ok=True)
        self._summary_writer = thboard.SummaryWriter(summary_dir)

    if debug_use_ground_truth:
        # Would use an identity reward fn here, but RewardFns can't see rewards.
        self.venv_train = self.venv_test = self.venv
    else:
        # Separate wrappers for training and testing, each driven by the
        # corresponding discriminator reward predictor.
        self.venv_train = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.discrim.predict_reward_train)
        self.venv_test = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.discrim.predict_reward_test)

    # The generator trains on the buffered, normalized view of the training
    # environment.
    self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
    self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
    self.gen_algo.set_env(self.venv_train_norm)

    # `self.gen_batch_size` is not defined in this block -- presumably a
    # property on the class; confirm it is usable at this point.
    if gen_replay_buffer_capacity is None:
        gen_replay_buffer_capacity = self.gen_batch_size
    self._gen_replay_buffer = buffer.ReplayBuffer(
        gen_replay_buffer_capacity, self.venv)

    if isinstance(expert_data, types.Transitions):
        # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
        expert_data = datasets.TransitionsDictDatasetAdaptor(
            expert_data,  # pytype: disable=wrong-arg-types
        )
    self._expert_dataset = expert_data

    # Warn when half of a discriminator batch (the expert share) is larger
    # than the expert dataset. Datasets reporting a size of None are skipped.
    expert_ds_size = self._expert_dataset.size()
    if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
        warnings.warn(
            "The discriminator batch size is more than twice the number of "
            "expert samples. This means that we will be reusing expert samples "
            "every discrim batch.",
            category=RuntimeWarning,
        )
def __init__(
    self,
    venv: vec_env.VecEnv,
    gen_policy: base_class.BaseRLModel,
    discrim: discrim_net.DiscrimNet,
    expert_data: Union[datasets.Dataset[types.Transitions], types.Transitions],
    *,
    log_dir: str = "output/",
    disc_batch_size: int = 2048,
    disc_minibatch_size: int = 256,
    disc_opt_cls: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
    disc_opt_kwargs: Optional[Mapping] = None,
    gen_replay_buffer_capacity: Optional[int] = None,
    init_tensorboard: bool = False,
    init_tensorboard_graph: bool = False,
    debug_use_ground_truth: bool = False,
):
    """Builds AdversarialTrainer.

    Args:
        venv: The vectorized environment to train in.
        gen_policy: The generator policy that is trained to maximize
            discriminator confusion. The generator batch size
            `self.gen_batch_size` is inferred from `gen_policy.n_batch`.
        discrim: The discriminator network.
        expert_data: Either a `Dataset` of expert `Transitions`, or an instance
            of `Transitions` to be automatically converted into a
            `Dataset[Transitions]`.
        log_dir: Directory to store TensorBoard logs, plots, etc. in.
        disc_batch_size: The default number of expert and generator transitions
            samples to feed to the discriminator in each call to
            `self.train_disc()`. (Half of the samples are expert and half of
            the samples are generator).
        disc_minibatch_size: The discriminator minibatch size. Each
            discriminator batch is split into minibatches and an Adam update is
            applied on the gradient resulting from each minibatch. Must evenly
            divide `disc_batch_size`. Must be an even number.
        disc_opt_cls: The optimizer for discriminator training.
        disc_opt_kwargs: Parameters for discriminator training. `None` is
            treated as an empty mapping.
        gen_replay_buffer_capacity: The capacity of the generator replay buffer
            (the number of obs-action-obs samples from the generator that can
            be stored). By default this is equal to `self.gen_batch_size`,
            meaning that we sample only from the most recent batch of generator
            samples.
        init_tensorboard: If True, makes various discriminator
            TensorBoard summaries.
        init_tensorboard_graph: If both this and `init_tensorboard` are True,
            then write a Tensorboard graph summary to disk.
        debug_use_ground_truth: If True, use the ground truth reward for
            `self.venv_train`. This disables the reward wrapping that would
            normally replace the environment reward with the learned reward.
            This is useful for sanity checking that the policy training is
            functional.

    Raises:
        AssertionError: If the logger has not been configured, if
            `disc_minibatch_size` does not evenly divide `disc_batch_size`,
            or if `disc_minibatch_size` is odd.
    """
    assert (logger.is_configured()
            ), "Requires call to imitation.util.logger.configure"
    # Uses whichever TensorFlow session is the default at construction time.
    self._sess = tf.get_default_session()
    self._global_step = tf.train.create_global_step()

    assert disc_batch_size % disc_minibatch_size == 0
    assert disc_minibatch_size % 2 == 0, (
        "discriminator minibatch size must be even "
        "(equal split between generator and expert samples)")
    self.disc_batch_size = disc_batch_size
    self.disc_minibatch_size = disc_minibatch_size
    self.debug_use_ground_truth = debug_use_ground_truth
    self.venv = venv
    # NOTE(review): stored under `_gen_policy` / `_discrim` /
    # `_expert_dataset`, but read back below as `self.gen_policy`,
    # `self.discrim` and `self.expert_dataset` -- presumably read-only
    # properties defined elsewhere on the class; confirm.
    self._gen_policy = gen_policy
    self._log_dir = log_dir

    # Create graph for optimising/recording stats on discriminator
    self._discrim = discrim
    self._disc_opt_cls = disc_opt_cls
    self._disc_opt_kwargs = disc_opt_kwargs or {}
    self._init_tensorboard = init_tensorboard
    self._init_tensorboard_graph = init_tensorboard_graph
    # `_build_graph` is defined outside this block; it runs after the
    # attributes above are set and before variables are initialised.
    self._build_graph()
    self._sess.run(tf.global_variables_initializer())

    if debug_use_ground_truth:
        # Would use an identity reward fn here, but RewardFns can't see rewards.
        self.reward_train = self.reward_test = None
        self.venv_train = self.venv_test = self.venv
    else:
        self.reward_train = partial(
            self.discrim.reward_train,
            # The generator policy uses normalized observations
            # but the reward function (self.reward_train) and discriminator use
            # and receive unnormalized observations. Therefore to get the right
            # log action probs for AIRL's ent bonus, we need to normalize obs.
            gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized,
        )
        self.reward_test = self.discrim.reward_test
        self.venv_train = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.reward_train)
        self.venv_test = reward_wrapper.RewardVecEnvWrapper(
            self.venv, self.reward_test)

    # The generator trains on the buffered, normalized view of the training
    # environment.
    self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train)
    self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering)
    self.gen_policy.set_env(self.venv_train_norm)

    # `self.gen_batch_size` is not defined in this block -- presumably a
    # property on the class; confirm it is usable at this point.
    if gen_replay_buffer_capacity is None:
        gen_replay_buffer_capacity = self.gen_batch_size
    self._gen_replay_buffer = buffer.ReplayBuffer(
        gen_replay_buffer_capacity, self.venv)

    if isinstance(expert_data, types.Transitions):
        # Somehow, pytype doesn't recognize that `expert_data` is Transitions.
        expert_data = datasets.TransitionsDictDatasetAdaptor(
            expert_data,  # pytype: disable=wrong-arg-types
        )
    self._expert_dataset = expert_data

    # Warn when half of a discriminator batch (the expert share) is larger
    # than the expert dataset. Datasets reporting a size of None are skipped.
    expert_ds_size = self.expert_dataset.size()
    if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size:
        warnings.warn(
            "The discriminator batch size is more than twice the number of "
            "expert samples. This means that we will be reusing expert samples "
            "every discrim batch.",
            category=RuntimeWarning,
        )