def test_reward_overwrite():
    """Test that the reward wrapper actually overwrites base rewards."""
    env_name = "Pendulum-v0"
    num_envs = 3
    env = util.make_vec_env(env_name, num_envs)
    reward_fn = FunkyReward()
    wrapped_env = reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
    policy = RandomPolicy(env.observation_space, env.action_space)
    sample_until = rollout.min_episodes(10)
    default_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, env, sample_until))
    wrapped_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, wrapped_env, sample_until))
    # Pendulum-v0 always has negative rewards.
    assert default_stats["return_max"] < 0
    # Our wrapped reward gives between 1 * traj_len and num_envs * traj_len return
    # (trajectories all have a constant length of 200 in Pendulum).
    steps = wrapped_stats["len_mean"]
    assert wrapped_stats["return_min"] == 1 * steps
    assert wrapped_stats["return_max"] == num_envs * steps
    # Check that the original environment reward stored in `infos` is negative
    # (all Pendulum rewards are negative), while the overwritten rewards are
    # non-negative.
    rand_act, _, _, _ = policy.step(wrapped_env.reset())
    _, rew, _, infos = wrapped_env.step(rand_act)
    assert np.all(rew >= 0)
    assert np.all([info_dict["wrapped_env_rew"] < 0 for info_dict in infos])
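# The `FunkyReward` fixture used above is not shown in this section. Judging from
# the assertions (per-env returns ranging from 1 * traj_len to num_envs * traj_len),
# a compatible reward callable might look like the sketch below. The
# (obs, acts, next_obs, dones) -> rewards signature is an assumption inferred from
# how `RewardVecEnvWrapper` invokes its reward function in these snippets; the
# class name is hypothetical.
import numpy as np


class FunkyRewardSketch:
    """Hypothetical reward fn: vectorized env index i (1-based) earns reward i per step."""

    def __call__(self, obs, acts, next_obs, dones):
        # One reward per vectorized environment in the batch, so per-env returns
        # scale with the env index, matching the test's min/max return checks.
        batch_size = len(obs)
        return np.arange(1, batch_size + 1, dtype=float)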
def __init__(
    self,
    venv: VecEnv,
    rollouts: Sequence[types.Trajectory],
    imitation_trainer: BaseRLModel,
    *,
    standardise_inputs: bool = True,
    kernel: str = "gaussian",
    kernel_bandwidth: float = 0.5,
    density_type: str = STATE_ACTION_DENSITY,
    is_stationary: bool = False,
):
    r"""Family of simple imitation learning baseline algorithms that apply RL
    to maximise a rough density estimate of the demonstration trajectories.

    Specifically, it constructs a non-parametric estimate of `p(s)`, `p(s,s')`,
    `p_t(s,a)`, etc. (depending on options), then rewards the imitation learner
    with `r_t(s,a,s') = \log p_t(s,a,s')` (or `\log p(s,s')`, or whatever the
    user wants the model to condition on).

    Args:
        venv: environment to train on.
        rollouts: list of expert trajectories to imitate.
        imitation_trainer: RL algorithm & initial policy that will be used to
            train the imitation learner.
        standardise_inputs, kernel, kernel_bandwidth, density_type,
            is_stationary: these are passed directly to `DensityReward`;
            refer to the documentation for that class.
    """
    self.venv = venv
    self.imitation_trainer = imitation_trainer
    self.reward_fn = DensityReward(
        trajectories=rollouts,
        density_type=density_type,
        obs_space=self.venv.observation_space,
        act_space=self.venv.action_space,
        is_stationary=is_stationary,
        kernel=kernel,
        kernel_bandwidth=kernel_bandwidth,
        standardise_inputs=standardise_inputs,
    )
    self.wrapped_env = reward_wrapper.RewardVecEnvWrapper(
        self.venv, self.reward_fn)
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)
    with self.graph.as_default():
        self.sess.run(tf.global_variables_initializer())
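# `DensityReward` itself is not shown in this section. As a rough illustration of
# the r(s, a) = log p(s, a) idea described in the docstring above, here is a
# minimal, self-contained sketch using scikit-learn's KernelDensity. The class
# name and its flattening/preprocessing are assumptions for illustration, not the
# library's actual implementation.
import numpy as np
from sklearn.neighbors import KernelDensity


class StateActionDensityRewardSketch:
    def __init__(self, demo_obs: np.ndarray, demo_acts: np.ndarray,
                 kernel: str = "gaussian", bandwidth: float = 0.5):
        # Fit a single stationary KDE over concatenated (s, a) vectors from the
        # demonstrations.
        flat = np.concatenate(
            [demo_obs.reshape(len(demo_obs), -1),
             demo_acts.reshape(len(demo_acts), -1)], axis=1)
        self._kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(flat)

    def __call__(self, obs, acts, next_obs, dones):
        # Reward each transition by the log-density of its (s, a) pair under the
        # demonstration distribution.
        query = np.concatenate(
            [obs.reshape(len(obs), -1), acts.reshape(len(acts), -1)], axis=1)
        return self._kde.score_samples(query)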
def wrap_env_train_reward(self, env):
    """Returns the given Env wrapped with a reward function that returns
    the AIRL training reward (discriminator confusion).

    The wrapped `Env`'s reward is directly evaluated from the reward network,
    and therefore changes whenever `self.train()` is called.

    Args:
        env (str, Env, or VecEnv): The Env that we want to wrap. If a string
            environment name or an Env is given, then we first convert it to a
            VecEnv before continuing.

    Returns:
        wrapped_env (VecEnv): The wrapped environment with a new reward.
    """
    env = util.maybe_load_env(env, vectorize=True)
    if self.debug_use_ground_truth:
        return env
    else:
        return reward_wrapper.RewardVecEnvWrapper(
            env, self._policy_train_reward_fn)
def wrap_env_test_reward(self, env):
    """Returns the given Env wrapped with a reward function that returns
    the reward learned by this Trainer.

    The wrapped `Env`'s reward is directly evaluated from the reward network,
    and therefore changes whenever `self.train()` is called.

    Args:
        env (str, Env, or VecEnv): The Env that should be wrapped. If a string
            environment name or an Env is given, then we first convert it to a
            VecEnv before continuing.

    Returns:
        wrapped_env (VecEnv): The wrapped environment with a new reward.
    """
    env = util.maybe_load_env(env, vectorize=True)
    if self.debug_use_ground_truth:
        return env
    else:
        return reward_wrapper.RewardVecEnvWrapper(env, self._test_reward_fn)
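# A short usage sketch of the two wrappers above: train the generator policy
# against the (shaped) training reward, then roll it out under the learned test
# reward. `trainer`, `policy`, and "CartPole-v1" are assumed placeholders; only
# `wrap_env_train_reward` / `wrap_env_test_reward` come from the snippets above.
venv_train = trainer.wrap_env_train_reward("CartPole-v1")
venv_test = trainer.wrap_env_test_reward("CartPole-v1")

policy.set_env(venv_train)
policy.learn(total_timesteps=10_000)  # optimise against the training reward

obs = venv_test.reset()
for _ in range(1000):
    act, _ = policy.predict(obs)
    # `learned_rew` is the Trainer's learned reward, not the ground-truth reward.
    obs, learned_rew, dones, infos = venv_test.step(act)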
def adversarial_learning( venv, expert=None, expert_venv=None, expert_trajectories=None, state_only=False, policy_fn=get_ppo, total_timesteps=20000, gen_batch_size=200, disc_batch_size=100, updates_per_batch=2, policy_lr=1e-3, reward_lr=1e-3, is_airl=True, **kwargs, ): # Set up generator gen_policy = policy_fn(venv, learning_rate=policy_lr) policy = gen_policy # Set up discriminator if is_airl: rn = BasicShapedRewardNet( venv.observation_space, venv.action_space, theta_units=[32, 32], phi_units=[32, 32], scale=True, state_only=state_only, ) discrim = DiscrimNetAIRL(rn, entropy_weight=1.0) else: rn = None discrim = DiscrimNetGAIL(venv.observation_space, venv.action_space) # Set up optimizer train_op = tf.train.AdamOptimizer(learning_rate=reward_lr).minimize( tf.reduce_mean(discrim.disc_loss)) # Set up environment reward reward_train = functools.partial( discrim.reward_train, gen_log_prob_fn=gen_policy.action_probability) venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_train) venv_train_buffering = BufferingWrapper(venv_train) gen_policy.set_env(venv_train_buffering) # possibly redundant # Set up replay buffers gen_replay_buffer_capacity = 20 * gen_batch_size gen_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity, venv) if expert_trajectories is not None: expert_transitions = flatten_trajectories(expert_trajectories) exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_transitions) else: exp_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity, venv) # Start training sess = tf.get_default_session() sess.run(tf.global_variables_initializer()) num_epochs = int(np.ceil(total_timesteps / gen_batch_size)) for epoch in range(num_epochs): # Train gen gen_policy.learn(total_timesteps=gen_batch_size, reset_num_timesteps=True) gen_replay_buffer.store(venv_train_buffering.pop_transitions()) if expert_trajectories is None: exp_replay_buffer.store( flatten_trajectories( sample_trajectories(expert_venv, expert, n_timesteps=gen_batch_size))) # Train disc for _ in range(updates_per_batch): disc_minibatch_size = disc_batch_size // updates_per_batch half_minibatch = disc_minibatch_size // 2 gen_samples = gen_replay_buffer.sample(half_minibatch) expert_samples = exp_replay_buffer.sample(half_minibatch) obs = np.concatenate([gen_samples.obs, expert_samples.obs]) acts = np.concatenate([gen_samples.acts, expert_samples.acts]) next_obs = np.concatenate( [gen_samples.next_obs, expert_samples.next_obs]) labels = np.concatenate( [np.ones(half_minibatch), np.zeros(half_minibatch)]) log_act_prob = gen_policy.action_probability(obs, actions=acts, logp=True) log_act_prob = log_act_prob.reshape((disc_minibatch_size, )) _, logits_v, loss_v = sess.run( [ train_op, discrim._disc_logits_gen_is_high, discrim._disc_loss, ], feed_dict={ discrim.obs_ph: obs, discrim.act_ph: acts, discrim.next_obs_ph: next_obs, discrim.labels_gen_is_one_ph: labels, discrim.log_policy_act_prob_ph: log_act_prob, }, ) results = {} results["reward_model"] = rn results["discrim"] = discrim results["policy"] = gen_policy return results
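# The discriminator update above feeds generator samples (label 1) and expert
# samples (label 0) into `discrim.disc_loss`. A minimal numpy restatement of that
# binary logistic objective, assuming `_disc_logits_gen_is_high` means larger
# logits indicate "generator", is sketched below; the function name is
# illustrative only.
import numpy as np


def disc_logistic_loss_sketch(logits_gen_is_high: np.ndarray,
                              labels_gen_is_one: np.ndarray) -> float:
    """Mean sigmoid cross-entropy between discriminator logits and {0, 1} labels."""
    probs_gen = 1.0 / (1.0 + np.exp(-logits_gen_is_high))
    losses = -(labels_gen_is_one * np.log(probs_gen)
               + (1 - labels_gen_is_one) * np.log(1 - probs_gen))
    return float(np.mean(losses))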
def __init__( self, venv: vec_env.VecEnv, gen_algo: on_policy_algorithm.OnPolicyAlgorithm, discrim: discrim_nets.DiscrimNet, expert_data: Union[Iterable[Mapping], types.Transitions], expert_batch_size: int, n_disc_updates_per_round: int = 2, *, log_dir: str = "output/", normalize_obs: bool = True, normalize_reward: bool = True, disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam, disc_opt_kwargs: Optional[Mapping] = None, gen_replay_buffer_capacity: Optional[int] = None, init_tensorboard: bool = False, init_tensorboard_graph: bool = False, debug_use_ground_truth: bool = False, ): """Builds AdversarialTrainer. Args: venv: The vectorized environment to train in. gen_algo: The generator RL algorithm that is trained to maximize discriminator confusion. The generator batch size `self.gen_batch_size` is inferred from `gen_algo.n_steps`. discrim: The discriminator network. This will be moved to the same device as `gen_algo`. expert_data: Either a `torch.utils.data.DataLoader`-like object or an instance of `Transitions` which is automatically converted into a shuffled version of the former type. If the argument passed is a `DataLoader`, then it must yield batches of expert data via its `__iter__` method. Each batch is a dictionary whose keys "obs", "acts", "next_obs", and "dones", correspond to Tensor or NumPy array values each with batch dimension equal to `expert_batch_size`. If any batch dimension doesn't equal `expert_batch_size` then a `ValueError` is raised. If the argument is a `Transitions` instance, then `len(expert_data)` must be at least `expert_batch_size`. expert_batch_size: The number of samples in each batch yielded from the expert data loader. The discriminator batch size is twice this number because each discriminator batch contains a generator sample for every expert sample. n_discrim_updates_per_round: The number of discriminator updates after each round of generator updates in AdversarialTrainer.learn(). log_dir: Directory to store TensorBoard logs, plots, etc. in. normalize_obs: Whether to normalize observations with `VecNormalize`. normalize_reward: Whether to normalize rewards with `VecNormalize`. disc_opt_cls: The optimizer for discriminator training. disc_opt_kwargs: Parameters for discriminator training. gen_replay_buffer_capacity: The capacity of the generator replay buffer (the number of obs-action-obs samples from the generator that can be stored). By default this is equal to `self.gen_batch_size`, meaning that we sample only from the most recent batch of generator samples. init_tensorboard: If True, makes various discriminator TensorBoard summaries. init_tensorboard_graph: If both this and `init_tensorboard` are True, then write a Tensorboard graph summary to disk. debug_use_ground_truth: If True, use the ground truth reward for `self.train_env`. This disables the reward wrapping that would normally replace the environment reward with the learned reward. This is useful for sanity checking that the policy training is functional. """ assert ( logger.is_configured() ), "Requires call to imitation.util.logger.configure" self._global_step = 0 self._disc_step = 0 self.n_disc_updates_per_round = n_disc_updates_per_round if expert_batch_size <= 0: raise ValueError(f"expert_batch_size={expert_batch_size} must be positive.") self.expert_batch_size = expert_batch_size if isinstance(expert_data, types.Transitions): if len(expert_data) < expert_batch_size: raise ValueError( "Provided Transitions instance as `expert_data` argument but " "len(expert_data) < expert_batch_size. 
" f"({len(expert_data)} < {expert_batch_size})." ) self.expert_data_loader = th_data.DataLoader( expert_data, batch_size=expert_batch_size, collate_fn=types.transitions_collate_fn, shuffle=True, drop_last=True, ) else: self.expert_data_loader = expert_data self._endless_expert_iterator = util.endless_iter(self.expert_data_loader) self.debug_use_ground_truth = debug_use_ground_truth self.venv = venv self.gen_algo = gen_algo self._log_dir = log_dir # Create graph for optimising/recording stats on discriminator self.discrim = discrim.to(self.gen_algo.device) self._disc_opt_cls = disc_opt_cls self._disc_opt_kwargs = disc_opt_kwargs or {} self._init_tensorboard = init_tensorboard self._init_tensorboard_graph = init_tensorboard_graph self._disc_opt = self._disc_opt_cls( self.discrim.parameters(), **self._disc_opt_kwargs ) if self._init_tensorboard: logging.info("building summary directory at " + self._log_dir) summary_dir = os.path.join(self._log_dir, "summary") os.makedirs(summary_dir, exist_ok=True) self._summary_writer = thboard.SummaryWriter(summary_dir) self.venv_buffering = wrappers.BufferingWrapper(self.venv) self.venv_norm_obs = vec_env.VecNormalize( self.venv_buffering, norm_reward=False, norm_obs=normalize_obs, ) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. self.venv_wrapped = self.venv_norm_obs self.gen_callback = None else: self.venv_wrapped = reward_wrapper.RewardVecEnvWrapper( self.venv_norm_obs, self.discrim.predict_reward_train ) self.gen_callback = self.venv_wrapped.make_log_callback() self.venv_train = vec_env.VecNormalize( self.venv_wrapped, norm_obs=False, norm_reward=normalize_reward ) self.gen_algo.set_env(self.venv_train) if gen_replay_buffer_capacity is None: gen_replay_buffer_capacity = self.gen_batch_size self._gen_replay_buffer = buffer.ReplayBuffer( gen_replay_buffer_capacity, self.venv )
def __init__( self, venv: VecEnv, gen_policy: BaseRLModel, discrim: discrim_net.DiscrimNet, expert_demos: rollout.Transitions, *, log_dir: str = 'output/', disc_batch_size: int = 2048, disc_minibatch_size: int = 256, disc_opt_cls: tf.train.Optimizer = tf.train.AdamOptimizer, disc_opt_kwargs: dict = {}, gen_replay_buffer_capacity: Optional[int] = None, init_tensorboard: bool = False, init_tensorboard_graph: bool = False, debug_use_ground_truth: bool = False, ): """Builds Trainer. Args: venv: The vectorized environment to train in. gen_policy: The generator policy that is trained to maximize discriminator confusion. The generator batch size `self.gen_batch_size` is inferred from `gen_policy.n_batch`. discrim: The discriminator network. For GAIL, use a DiscrimNetGAIL. For AIRL, use a DiscrimNetAIRL. expert_demos: Transitions from an expert dataset. log_dir: Directory to store TensorBoard logs, plots, etc. in. disc_batch_size: The default number of expert and generator transitions samples to feed to the discriminator in each call to `self.train_disc()`. (Half of the samples are expert and half of the samples are generator). disc_minibatch_size: The discriminator minibatch size. Each discriminator batch is split into minibatches and an Adam update is applied on the gradient resulting form each minibatch. Must evenly divide `disc_batch_size`. Must be an even number. disc_opt_cls: The optimizer for discriminator training. disc_opt_kwargs: Parameters for discriminator training. gen_replay_buffer_capacity: The capacity of the generator replay buffer (the number of obs-action-obs samples from the generator that can be stored). By default this is equal to `20 * self.gen_batch_size`. init_tensorboard: If True, makes various discriminator TensorBoard summaries. init_tensorboard_graph: If both this and `init_tensorboard` are True, then write a Tensorboard graph summary to disk. debug_use_ground_truth: If True, use the ground truth reward for `self.train_env`. This disables the reward wrapping that would normally replace the environment reward with the learned reward. This is useful for sanity checking that the policy training is functional. """ assert util.logger.is_configured(), ("Requires call to " "imitation.util.logger.configure") self._sess = tf.get_default_session() self._global_step = tf.train.create_global_step() assert disc_batch_size % disc_minibatch_size == 0 assert disc_minibatch_size % 2 == 0, ( "discriminator minibatch size must be even " "(equal split between generator and expert samples)") self.disc_batch_size = disc_batch_size self.disc_minibatch_size = disc_minibatch_size self.debug_use_ground_truth = debug_use_ground_truth self.venv = venv self._expert_demos = expert_demos self._gen_policy = gen_policy self._log_dir = log_dir # Create graph for optimising/recording stats on discriminator self._discrim = discrim self._disc_opt_cls = disc_opt_cls self._disc_opt_kwargs = disc_opt_kwargs self._init_tensorboard = init_tensorboard self._init_tensorboard_graph = init_tensorboard_graph self._build_graph() self._sess.run(tf.global_variables_initializer()) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. 
self.reward_train = self.reward_test = None self.venv_train = self.venv_test = self.venv else: self.reward_train = partial( self.discrim.reward_train, gen_log_prob_fn=self._gen_policy.action_probability) self.reward_test = self.discrim.reward_test self.venv_train = reward_wrapper.RewardVecEnvWrapper( self.venv, self.reward_train) self.venv_test = reward_wrapper.RewardVecEnvWrapper( self.venv, self.reward_test) self.venv_train_norm = VecNormalize(self.venv_train) self.venv_train_norm_buffering = BufferingWrapper(self.venv_train_norm) self.gen_policy.set_env(self.venv_train_norm_buffering) if gen_replay_buffer_capacity is None: gen_replay_buffer_capacity = 20 * self.gen_batch_size self._gen_replay_buffer = buffer.ReplayBuffer( gen_replay_buffer_capacity, self.venv) self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos) if self.disc_batch_size // 2 > len(self._exp_replay_buffer): warn( "The discriminator batch size is more than twice the number of " "expert samples. This means that we will be reusing samples every " "discrim batch.")
def eval_policy( _run, _seed: int, env_name: str, eval_n_timesteps: Optional[int], eval_n_episodes: Optional[int], num_vec: int, parallel: bool, render: bool, render_fps: int, log_dir: str, policy_type: str, policy_path: str, reward_type: Optional[str] = None, reward_path: Optional[str] = None, max_episode_steps: Optional[int] = None, ): """Rolls a policy out in an environment, collecting statistics. Args: _seed: generated by Sacred. env_name: Gym environment identifier. eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly one of `eval_n_episodes` and `eval_n_timesteps`. eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly one of `eval_n_episodes` and `eval_n_timesteps`. num_vec: Number of environments to run simultaneously. parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise, uses `DummyVecEnv`. max_episode_steps: If not None, then environments are wrapped by TimeLimit so that they have at most `max_episode_steps` steps per episode. render: If True, renders interactively to the screen. log_dir: The directory to log intermediate output to. (As of 2019-07-19 this is just episode-by-episode reward from bench.Monitor.) policy_type: A unique identifier for the saved policy, defined in POLICY_CLASSES. policy_path: A path to the serialized policy. reward_type: If specified, overrides the environment reward with a reward of this. reward_path: If reward_type is specified, the path to a serialized reward of `reward_type` to override the environment reward with. Returns: Return value of `imitation.util.rollout.rollout_stats()`. """ os.makedirs(log_dir, exist_ok=True) sacred_util.build_sacred_symlink(log_dir, _run) tf.logging.set_verbosity(tf.logging.INFO) tf.logging.info('Logging to %s', log_dir) sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes) venv = util.make_vec_env(env_name, num_vec, seed=_seed, parallel=parallel, log_dir=log_dir, max_episode_steps=max_episode_steps) venv = VecNormalize(venv, training=False, norm_reward=False) venv = venv.load(policy_path + "/vec_normalize.pkl", venv) if render: venv = InteractiveRender(venv, render_fps) # TODO(adam): add support for videos using VideoRecorder? with contextlib.ExitStack() as stack: if reward_type is not None: reward_fn_ctx = load_reward(reward_type, reward_path, venv) reward_fn = stack.enter_context(reward_fn_ctx) venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn) tf.logging.info( f"Wrapped env in reward {reward_type} from {reward_path}.") with serialize.load_policy(policy_type, policy_path, venv) as policy: trajs = rollout.generate_trajectories(policy, venv, sample_until) return rollout.rollout_stats(trajs)
def __init__( self, venv: vec_env.VecEnv, gen_algo: base_class.BaseAlgorithm, discrim: discrim_nets.DiscrimNet, expert_data: Union[datasets.Dataset[types.Transitions], types.Transitions], *, log_dir: str = "output/", disc_batch_size: int = 2048, disc_minibatch_size: int = 256, disc_opt_cls: Type[th.optim.Optimizer] = th.optim.Adam, disc_opt_kwargs: Optional[Mapping] = None, gen_replay_buffer_capacity: Optional[int] = None, init_tensorboard: bool = False, init_tensorboard_graph: bool = False, debug_use_ground_truth: bool = False, device: Union[str, th.device] = "auto", ): """Builds AdversarialTrainer. Args: venv: The vectorized environment to train in. gen_algo: The generator RL algorithm that is trained to maximize discriminator confusion. The generator batch size `self.gen_batch_size` is inferred from `gen_algo.n_steps`. discrim: The discriminator network. This will be moved to the same device as `gen_algo`. expert_data: Either a `Dataset` of expert `Transitions`, or an instance of `Transitions` to be automatically converted into a `Dataset[Transitions]`. log_dir: Directory to store TensorBoard logs, plots, etc. in. disc_batch_size: The default number of expert and generator transitions samples to feed to the discriminator in each call to `self.train_disc()`. (Half of the samples are expert and half of the samples are generator). disc_minibatch_size: The discriminator minibatch size. Each discriminator batch is split into minibatches and an Adam update is applied on the gradient resulting form each minibatch. Must evenly divide `disc_batch_size`. Must be an even number. disc_opt_cls: The optimizer for discriminator training. disc_opt_kwargs: Parameters for discriminator training. gen_replay_buffer_capacity: The capacity of the generator replay buffer (the number of obs-action-obs samples from the generator that can be stored). By default this is equal to `self.gen_batch_size`, meaning that we sample only from the most recent batch of generator samples. init_tensorboard: If True, makes various discriminator TensorBoard summaries. init_tensorboard_graph: If both this and `init_tensorboard` are True, then write a Tensorboard graph summary to disk. debug_use_ground_truth: If True, use the ground truth reward for `self.train_env`. This disables the reward wrapping that would normally replace the environment reward with the learned reward. This is useful for sanity checking that the policy training is functional. 
""" assert (logger.is_configured() ), "Requires call to imitation.util.logger.configure" self._global_step = 0 self._disc_step = 0 assert disc_batch_size % disc_minibatch_size == 0 assert disc_minibatch_size % 2 == 0, ( "discriminator minibatch size must be even " "(equal split between generator and expert samples)") self.disc_batch_size = disc_batch_size self.disc_minibatch_size = disc_minibatch_size self.debug_use_ground_truth = debug_use_ground_truth self.venv = venv self.gen_algo = gen_algo self._log_dir = log_dir # Create graph for optimising/recording stats on discriminator self.discrim = discrim.to(self.gen_algo.device) self._disc_opt_cls = disc_opt_cls self._disc_opt_kwargs = disc_opt_kwargs or {} self._init_tensorboard = init_tensorboard self._init_tensorboard_graph = init_tensorboard_graph self._disc_opt = self._disc_opt_cls(self.discrim.parameters(), **self._disc_opt_kwargs) if self._init_tensorboard: logging.info("building summary directory at " + self._log_dir) summary_dir = os.path.join(self._log_dir, "summary") os.makedirs(summary_dir, exist_ok=True) self._summary_writer = thboard.SummaryWriter(summary_dir) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. self.venv_train = self.venv_test = self.venv else: self.venv_train = reward_wrapper.RewardVecEnvWrapper( self.venv, self.discrim.predict_reward_train) self.venv_test = reward_wrapper.RewardVecEnvWrapper( self.venv, self.discrim.predict_reward_test) self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train) self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering) self.gen_algo.set_env(self.venv_train_norm) if gen_replay_buffer_capacity is None: gen_replay_buffer_capacity = self.gen_batch_size self._gen_replay_buffer = buffer.ReplayBuffer( gen_replay_buffer_capacity, self.venv) if isinstance(expert_data, types.Transitions): # Somehow, pytype doesn't recognize that `expert_data` is Transitions. expert_data = datasets.TransitionsDictDatasetAdaptor( expert_data, # pytype: disable=wrong-arg-types ) self._expert_dataset = expert_data expert_ds_size = self._expert_dataset.size() if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size: warnings.warn( "The discriminator batch size is more than twice the number of " "expert samples. This means that we will be reusing expert samples " "every discrim batch.", category=RuntimeWarning, )
def preferences( venv, expert=None, evaluate_trajectories_fn=None, n_pairs_per_batch=50, n_timesteps_per_query=None, reward_lr=1e-3, policy_lr=1e-3, policy_epoch_timesteps=200, total_timesteps=10000, state_only=False, use_rnd_bonus=False, rnd_lr=1e-3, rnd_coeff=0.5, normalize_extrinsic=False, egreedy_sampling=False, **kwargs, ): if n_pairs_per_batch is None: horizon = get_horizon(venv) n_pairs_per_batch = (n_timesteps_per_query / (2 * horizon)) if evaluate_trajectories_fn is None: reward_eval_fn = reward_eval_path_fn(venv) evaluate_trajectories_fn = get_eval_trajectories_fn(reward_eval_fn) # Create reward model rn = BasicShapedRewardNet( venv.observation_space, venv.action_space, theta_units=[32, 32], phi_units=[32, 32], scale=True, state_only=state_only, ) # Compute trajectory probabilities preferences_ph = tf.placeholder( shape=(None, 2), dtype=tf.float32, name="preferences", ) num_segments = 2 * tf.shape(preferences_ph)[0] rewards_out = tf.reshape(rn.reward_output_train, [num_segments, -1]) returns_out = tf.reduce_sum(rewards_out, axis=1) returns = tf.reshape(returns_out, shape=[-1, 2]) log_probs = tf.nn.log_softmax(returns, axis=1) # Write loss and optimizer op loss = (-1) * tf.reduce_sum(log_probs * preferences_ph) optimizer = tf.train.AdamOptimizer(learning_rate=reward_lr) reward_train_op = optimizer.minimize(loss) base_extrinsic_reward_fn = get_reward_fn_from_model(rn) if not use_rnd_bonus: reward_fn = base_extrinsic_reward_fn else: # Random network distillation bonus rnd_size = 50 inputs = [rn.obs_inp, rn.act_inp] inputs = [tf.layers.flatten(x) for x in inputs] inputs = tf.concat(inputs, axis=1) rnd_target_net = build_mlp([32, 32, 32], output_size=rnd_size) rnd_target = sequential(inputs, rnd_target_net) rnd_pred_net = build_mlp([32, 32, 32], output_size=rnd_size) rnd_pred = sequential(inputs, rnd_pred_net) rnd_loss = tf.reduce_mean((tf.stop_gradient(rnd_target) - rnd_pred)**2) rnd_optimizer = tf.train.AdamOptimizer(learning_rate=rnd_lr) rnd_train_op = rnd_optimizer.minimize(rnd_loss) runn_rnd_rews = RunningMeanVar(alpha=0.01) def rnd_reward_fn(obs, acts=None, *args, **kwargs): if acts is None: acts = [venv.action_space.sample()] int_rew = sess.run(rnd_loss, feed_dict={ rn.obs_ph: obs, rn.act_ph: acts }) int_rew_old = int_rew int_rew = runn_rnd_rews.exp_update(int_rew) return int_rew if normalize_extrinsic: runn_ext_rews = RunningMeanVar(alpha=0.01) def extrinsic_reward_fn(*args, **kwargs): ext_rew = base_extrinsic_reward_fn(*args, **kwargs) if normalize_extrinsic: ext_rew = runn_ext_rews.exp_update(ext_rew) return ext_rew def reward_fn(*args, **kwargs): return extrinsic_reward_fn( *args, **kwargs) + rnd_coeff * rnd_reward_fn(*args, **kwargs) # Create learner from reward model venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn) policy = PPO2(MlpPolicy, venv_train, learning_rate=policy_lr) # Start training sess = tf.get_default_session() sess.run(tf.global_variables_initializer()) sampling_policy = make_egreedy(policy, venv) if egreedy_sampling else policy num_epochs = int(np.ceil(total_timesteps / policy_epoch_timesteps)) for epoch in range(num_epochs): trajectories = sample_trajectories(venv, sampling_policy, 2 * n_pairs_per_batch) segments = get_segments(trajectories) seg_returns = evaluate_trajectories_fn(segments) seg_returns = seg_returns.reshape(-1, 2) preferences = np.stack( [ seg_returns[:, 0] > seg_returns[:, 1], seg_returns[:, 1] > seg_returns[:, 0], ], axis=1, ) obs = np.concatenate([seg.obs for seg in segments]) acts = np.concatenate([seg.acts for seg in 
segments]) next_obs = np.concatenate([seg.next_obs for seg in segments]) ops = [reward_train_op] if use_rnd_bonus: ops.append(rnd_train_op) sess.run( ops, feed_dict={ rn.obs_ph: obs, rn.act_ph: acts, rn.next_obs_ph: next_obs, preferences_ph: preferences, }, ) policy.learn(total_timesteps=policy_epoch_timesteps) results = {} results["reward_model"] = rn results["policy"] = policy return results
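# The preference loss constructed above is the standard Bradley-Terry /
# cross-entropy objective over segment returns: P(seg_1 preferred over seg_2) =
# softmax([R_1, R_2])[0]. A small numpy restatement of that computation (for
# illustration only; the training code above is the TF1 version) is:
import numpy as np


def preference_loss_sketch(seg_returns: np.ndarray,
                           preferences: np.ndarray) -> float:
    """seg_returns and preferences are arrays of shape (n_pairs, 2)."""
    # Log-softmax over each pair of segment returns.
    shifted = seg_returns - seg_returns.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Negative log-likelihood of the observed preferences.
    return float(-(preferences * log_probs).sum())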
def eval_policy( _run, _seed: int, env_name: str, eval_n_timesteps: Optional[int], eval_n_episodes: Optional[int], num_vec: int, parallel: bool, render: bool, render_fps: int, videos: bool, video_kwargs: Mapping[str, Any], log_dir: str, policy_type: str, policy_path: str, reward_type: Optional[str] = None, reward_path: Optional[str] = None, max_episode_steps: Optional[int] = None, ): """Rolls a policy out in an environment, collecting statistics. Args: _seed: generated by Sacred. env_name: Gym environment identifier. eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly one of `eval_n_episodes` and `eval_n_timesteps`. eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly one of `eval_n_episodes` and `eval_n_timesteps`. num_vec: Number of environments to run simultaneously. parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise, uses `DummyVecEnv`. max_episode_steps: If not None, then environments are wrapped by TimeLimit so that they have at most `max_episode_steps` steps per episode. render: If True, renders interactively to the screen. render_fps: The target number of frames per second to render on screen. videos: If True, saves videos to `log_dir`. video_kwargs: Keyword arguments passed through to `video_wrapper.VideoWrapper`. log_dir: The directory to log intermediate output to, such as episode reward. policy_type: A unique identifier for the saved policy, defined in POLICY_CLASSES. policy_path: A path to the serialized policy. reward_type: If specified, overrides the environment reward with a reward of this. reward_path: If reward_type is specified, the path to a serialized reward of `reward_type` to override the environment reward with. Returns: Return value of `imitation.util.rollout.rollout_stats()`. """ os.makedirs(log_dir, exist_ok=True) sacred_util.build_sacred_symlink(log_dir, _run) logging.basicConfig(level=logging.INFO) logging.info("Logging to %s", log_dir) sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes) post_wrappers = [video_wrapper_factory(log_dir, **video_kwargs) ] if videos else None venv = util.make_vec_env( env_name, num_vec, seed=_seed, parallel=parallel, log_dir=log_dir, max_episode_steps=max_episode_steps, post_wrappers=post_wrappers, ) try: if render: # As of July 31, 2020, DummyVecEnv rendering only works with num_vec=1 # due to a bug on Stable Baselines 3. venv = InteractiveRender(venv, render_fps) if reward_type is not None: reward_fn = load_reward(reward_type, reward_path, venv) venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn) logging.info( f"Wrapped env in reward {reward_type} from {reward_path}.") policy = serialize.load_policy(policy_type, policy_path, venv) trajs = rollout.generate_trajectories(policy, venv, sample_until) return rollout.rollout_stats(trajs) finally: venv.close()
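# `rollout.rollout_stats` is used throughout these snippets (the test at the top
# reads "return_min", "return_max", and "len_mean" from its output). A simplified
# sketch of that aggregation, assuming each trajectory exposes a `rews` array, is:
import numpy as np


def rollout_stats_sketch(trajectories) -> dict:
    returns = np.array([traj.rews.sum() for traj in trajectories])
    lengths = np.array([len(traj.rews) for traj in trajectories])
    return {
        "n_traj": len(trajectories),
        "return_min": returns.min(),
        "return_mean": returns.mean(),
        "return_max": returns.max(),
        "len_mean": lengths.mean(),
    }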
def __init__(self, venv: VecEnv, gen_policy: BaseRLModel, discrim: discrim_net.DiscrimNet, expert_demos: rollout.Transitions, *, disc_opt_cls: tf.train.Optimizer = tf.train.AdamOptimizer, disc_opt_kwargs: dict = {}, n_disc_samples_per_buffer: int = 200, gen_replay_buffer_capacity: Optional[int] = None, init_tensorboard: bool = False, init_tensorboard_graph: bool = False, debug_use_ground_truth: bool = False): """Builds Trainer. Args: venv: The vectorized environment to train in. gen_policy: The generator policy that is trained to maximize discriminator confusion. discrim: The discriminator network. For GAIL, use a DiscrimNetGAIL. For AIRL, use a DiscrimNetAIRL. expert_demos: Transitions from an expert dataset. disc_opt_cls: The optimizer for discriminator training. disc_opt_kwargs: Parameters for discriminator training. n_disc_samples_per_buffer: The number of obs-act-obs triples sampled from each replay buffer (expert and generator) during each step of discriminator training. This is also the number of triples stored in the replay buffer after each epoch of generator training. gen_replay_buffer_capacity: The capacity of the generator replay buffer (the number of obs-action-obs samples from the generator that can be stored). By default this is equal to `20 * n_disc_samples_per_buffer`. init_tensorboard: If True, makes various discriminator TensorBoard summaries. init_tensorboard_graph: If both this and `init_tensorboard` are True, then write a Tensorboard graph summary to disk. debug_use_ground_truth: If True, use the ground truth reward for `self.train_env`. This disables the reward wrapping that would normally replace the environment reward with the learned reward. This is useful for sanity checking that the policy training is functional. """ self._sess = tf.get_default_session() self._global_step = tf.train.create_global_step() self._n_disc_samples_per_buffer = n_disc_samples_per_buffer self.debug_use_ground_truth = debug_use_ground_truth self.venv = venv self._expert_demos = expert_demos self._gen_policy = gen_policy # Discriminator and reward output self._discrim = discrim self._disc_opt_cls = disc_opt_cls self._disc_opt_kwargs = disc_opt_kwargs with tf.variable_scope("trainer"): with tf.variable_scope("discriminator"): self._build_disc_train() self._init_tensorboard = init_tensorboard self._init_tensorboard_graph = init_tensorboard_graph if init_tensorboard: with tf.name_scope("summaries"): self._build_summarize() self._sess.run(tf.global_variables_initializer()) if debug_use_ground_truth: self.venv_train = self.venv_test = self.venv else: reward_train = partial( self.discrim.reward_train, gen_log_prob_fn=self._gen_policy.action_probability) self.venv_train = reward_wrapper.RewardVecEnvWrapper( self.venv, reward_train) self.venv_test = reward_wrapper.RewardVecEnvWrapper( self.venv, self.discrim.reward_test) if gen_replay_buffer_capacity is None: gen_replay_buffer_capacity = 20 * self._n_disc_samples_per_buffer self._gen_replay_buffer = buffer.ReplayBuffer( gen_replay_buffer_capacity, self.venv) self._populate_gen_replay_buffer() self._exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_demos) if n_disc_samples_per_buffer > len(self._exp_replay_buffer): warn("The discriminator batch size is larger than the number of " "expert samples.")
def __init__( self, venv: vec_env.VecEnv, gen_policy: base_class.BaseRLModel, discrim: discrim_net.DiscrimNet, expert_data: Union[datasets.Dataset[types.Transitions], types.Transitions], *, log_dir: str = "output/", disc_batch_size: int = 2048, disc_minibatch_size: int = 256, disc_opt_cls: Type[tf.train.Optimizer] = tf.train.AdamOptimizer, disc_opt_kwargs: Optional[Mapping] = None, gen_replay_buffer_capacity: Optional[int] = None, init_tensorboard: bool = False, init_tensorboard_graph: bool = False, debug_use_ground_truth: bool = False, ): """Builds AdversarialTrainer. Args: venv: The vectorized environment to train in. gen_policy: The generator policy that is trained to maximize discriminator confusion. The generator batch size `self.gen_batch_size` is inferred from `gen_policy.n_batch`. discrim: The discriminator network. expert_data: Either a `Dataset` of expert `Transitions`, or an instance of `Transitions` to be automatically converted into a `Dataset[Transitions]`. log_dir: Directory to store TensorBoard logs, plots, etc. in. disc_batch_size: The default number of expert and generator transitions samples to feed to the discriminator in each call to `self.train_disc()`. (Half of the samples are expert and half of the samples are generator). disc_minibatch_size: The discriminator minibatch size. Each discriminator batch is split into minibatches and an Adam update is applied on the gradient resulting form each minibatch. Must evenly divide `disc_batch_size`. Must be an even number. disc_opt_cls: The optimizer for discriminator training. disc_opt_kwargs: Parameters for discriminator training. gen_replay_buffer_capacity: The capacity of the generator replay buffer (the number of obs-action-obs samples from the generator that can be stored). By default this is equal to `self.gen_batch_size`, meaning that we sample only from the most recent batch of generator samples. init_tensorboard: If True, makes various discriminator TensorBoard summaries. init_tensorboard_graph: If both this and `init_tensorboard` are True, then write a Tensorboard graph summary to disk. debug_use_ground_truth: If True, use the ground truth reward for `self.train_env`. This disables the reward wrapping that would normally replace the environment reward with the learned reward. This is useful for sanity checking that the policy training is functional. """ assert (logger.is_configured() ), "Requires call to imitation.util.logger.configure" self._sess = tf.get_default_session() self._global_step = tf.train.create_global_step() assert disc_batch_size % disc_minibatch_size == 0 assert disc_minibatch_size % 2 == 0, ( "discriminator minibatch size must be even " "(equal split between generator and expert samples)") self.disc_batch_size = disc_batch_size self.disc_minibatch_size = disc_minibatch_size self.debug_use_ground_truth = debug_use_ground_truth self.venv = venv self._gen_policy = gen_policy self._log_dir = log_dir # Create graph for optimising/recording stats on discriminator self._discrim = discrim self._disc_opt_cls = disc_opt_cls self._disc_opt_kwargs = disc_opt_kwargs or {} self._init_tensorboard = init_tensorboard self._init_tensorboard_graph = init_tensorboard_graph self._build_graph() self._sess.run(tf.global_variables_initializer()) if debug_use_ground_truth: # Would use an identity reward fn here, but RewardFns can't see rewards. 
self.reward_train = self.reward_test = None self.venv_train = self.venv_test = self.venv else: self.reward_train = partial( self.discrim.reward_train, # The generator policy uses normalized observations # but the reward function (self.reward_train) and discriminator use # and receive unnormalized observations. Therefore to get the right # log action probs for AIRL's ent bonus, we need to normalize obs. gen_log_prob_fn=self._gen_log_action_prob_from_unnormalized, ) self.reward_test = self.discrim.reward_test self.venv_train = reward_wrapper.RewardVecEnvWrapper( self.venv, self.reward_train) self.venv_test = reward_wrapper.RewardVecEnvWrapper( self.venv, self.reward_test) self.venv_train_buffering = wrappers.BufferingWrapper(self.venv_train) self.venv_train_norm = vec_env.VecNormalize(self.venv_train_buffering) self.gen_policy.set_env(self.venv_train_norm) if gen_replay_buffer_capacity is None: gen_replay_buffer_capacity = self.gen_batch_size self._gen_replay_buffer = buffer.ReplayBuffer( gen_replay_buffer_capacity, self.venv) if isinstance(expert_data, types.Transitions): # Somehow, pytype doesn't recognize that `expert_data` is Transitions. expert_data = datasets.TransitionsDictDatasetAdaptor( expert_data, # pytype: disable=wrong-arg-types ) self._expert_dataset = expert_data expert_ds_size = self.expert_dataset.size() if expert_ds_size is not None and self.disc_batch_size // 2 > expert_ds_size: warnings.warn( "The discriminator batch size is more than twice the number of " "expert samples. This means that we will be reusing expert samples " "every discrim batch.", category=RuntimeWarning, )