def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
  """Briefly train with AIRL (or GAIL), then use the learned reward to wrap a
  duplicate environment. Finally, use that learned reward to train a policy.
  """
  trainer = init_test_trainer(env, use_gail)
  trainer.train(n_epochs=1)
  learned_reward_env = trainer.wrap_env_test_reward(env)
  policy = util.init_rl(env)
  policy.set_env(learned_reward_env)
  policy.learn(10)
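# `init_test_trainer` is not defined in this section. A minimal sketch of what
# such a helper might look like, assuming the trajectory-based `init_trainer`
# defined later in this file and a hypothetical test rollout location; this is
# illustrative only, not the actual fixture.
def init_test_trainer(env_name: str, use_gail: bool):
  trajectories = rollout.load_trajectories(
      f"tests/data/rollouts/{env_name}_*.pkl")  # hypothetical path
  return init_trainer(env_name,
                      trajectories,
                      log_dir="output/tests",  # hypothetical log directory
                      use_gail=use_gail)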
def add_data_ep_reward(self, env, name):
  """Calculate and record average episode returns."""
  sample_until = util.rollout.min_episodes(self.n_episodes_per_reward_data)

  gen_policy = self.trainer.gen_policy
  gen_ret = util.rollout.mean_return(gen_policy, env, sample_until)
  self.gen_ep_reward[name].append(gen_ret)
  tf.logging.info("generator return: {}".format(gen_ret))

  rand_policy = util.init_rl(self.trainer.venv)
  rand_ret = util.rollout.mean_return(rand_policy, env, sample_until)
  self.rand_ep_reward[name].append(rand_ret)
  tf.logging.info("random return: {}".format(rand_ret))
def ep_reward_plot_add_data(env, name):
  """Calculate and record average episode returns."""
  gen_policy = trainer.gen_policy
  gen_ret = util.rollout.mean_return(
      gen_policy, env, n_episodes=n_episodes_per_reward_data)
  gen_ep_reward[name].append(gen_ret)
  tf.logging.info("generator return: {}".format(gen_ret))

  rand_policy = util.init_rl(trainer.env)
  rand_ret = util.rollout.mean_return(
      rand_policy, env, n_episodes=n_episodes_per_reward_data)
  rand_ep_reward[name].append(rand_ret)
  tf.logging.info("random return: {}".format(rand_ret))

  if expert_policy is not None:
    exp_ret = util.rollout.mean_return(
        expert_policy, env, n_episodes=n_episodes_per_reward_data)
    exp_ep_reward[name].append(exp_ret)
    tf.logging.info("exp return: {}".format(exp_ret))
def test_density_trainer(density_type, is_stationary):
  env_id = 'Pendulum-v0'
  rollouts = rollout.load_trajectories(f"tests/data/rollouts/{env_id}_*.pkl")
  env = util.make_vec_env(env_id, 2)
  imitation_trainer = util.init_rl(env)
  density_trainer = DensityTrainer(env,
                                   rollouts=rollouts,
                                   imitation_trainer=imitation_trainer,
                                   density_type=density_type,
                                   is_stationary=is_stationary,
                                   kernel='gaussian')
  novice_stats = density_trainer.test_policy()
  density_trainer.train_policy(2000)
  good_stats = density_trainer.test_policy()
  # Novice is bad.
  assert novice_stats["return_mean"] < -500
  # Density is also pretty bad, but shouldn't make things more than 50% worse.
  # It would be nice to have a less flaky/more meaningful test here.
  assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
def add_data_ep_reward(self, epoch):
  """Calculate and record average episode returns."""
  if epoch in self.ep_reward_X:
    # Don't calculate ep reward twice.
    return
  self.ep_reward_X.append(epoch)

  gen_policy = self.trainer.gen_policy
  rand_policy = util.init_rl(self.trainer.venv)
  sample_until = util.rollout.min_episodes(self.n_episodes_per_reward_data)
  trajs_rand = util.rollout.generate_trajectories(
      rand_policy, self.venv_norm_obs, sample_until)
  trajs_gen = util.rollout.generate_trajectories(
      gen_policy, self.venv_norm_obs, sample_until)

  for reward_fn, reward_name in [
      (None, "Ground Truth Reward"),
      (self.trainer.reward_train, "Train Reward"),
      (self.trainer.reward_test, "Test Reward"),
  ]:
    if reward_fn is None:
      trajs_rand_rets = [np.sum(traj.rews) for traj in trajs_rand]
      trajs_gen_rets = [np.sum(traj.rews) for traj in trajs_gen]
    else:
      trajs_rand_rets = [
          np.sum(util.rollout.recalc_rewards_traj(traj, reward_fn))
          for traj in trajs_rand
      ]
      trajs_gen_rets = [
          np.sum(util.rollout.recalc_rewards_traj(traj, reward_fn))
          for traj in trajs_gen
      ]

    gen_ret = np.mean(trajs_gen_rets)
    rand_ret = np.mean(trajs_rand_rets)
    self.gen_ep_reward[reward_name].append(gen_ret)
    self.rand_ep_reward[reward_name].append(rand_ret)
    tf.logging.info(f"{reward_name} generator return: {gen_ret}")
    tf.logging.info(f"{reward_name} random return: {rand_ret}")
def test_density_trainer(density_type, is_stationary):
  env_name = 'Pendulum-v0'
  with open("tests/data/expert_models/pendulum_0/rollouts/final.pkl",
            "rb") as f:
    rollouts = pickle.load(f)
  env = util.make_vec_env(env_name, 2)
  imitation_trainer = util.init_rl(env)
  density_trainer = DensityTrainer(env,
                                   rollouts=rollouts,
                                   imitation_trainer=imitation_trainer,
                                   density_type=density_type,
                                   is_stationary=is_stationary,
                                   kernel='gaussian')
  novice_stats = density_trainer.test_policy()
  density_trainer.train_policy(2000)
  good_stats = density_trainer.test_policy()
  # Novice is bad.
  assert novice_stats["return_mean"] < -500
  # Density is also pretty bad, but shouldn't make things more than 50% worse.
  # It would be nice to have a less flaky/more meaningful test here.
  assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[rollout.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
  """Builds an AdversarialTrainer, ready to be trained on a vectorized
  environment and expert demonstrations.

  Args:
    env_name: The string id of a gym environment.
    expert_trajectories: Demonstrations from expert.
    seed: Random seed.
    log_dir: Directory for logging output. Will generate a unique
      sub-directory within this directory for all output.
    use_gail: If True, then train using GAIL. If False, then train using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
      this episode length before returning.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
      argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    init_rl_kwargs: Keyword arguments passed to `init_rl`, used to initialize
      the RL algorithm.
  """
  util.logger.configure(folder=log_dir, format_strs=['tensorboard', 'stdout'])
  env = util.make_vec_env(env_name, num_vec, seed=seed, parallel=parallel,
                          log_dir=log_dir,
                          max_episode_steps=max_episode_steps)
  gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

  if use_gail:
    discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                         env.action_space,
                                         scale=scale,
                                         **discrim_kwargs)
  else:
    rn = BasicShapedRewardNet(env.observation_space,
                              env.action_space,
                              scale=scale,
                              **reward_kwargs)
    discrim = discrim_net.DiscrimNetAIRL(rn,
                                         entropy_weight=airl_entropy_weight,
                                         **discrim_kwargs)

  expert_demos = util.rollout.flatten_trajectories(expert_trajectories)
  trainer = AdversarialTrainer(env, gen_policy, discrim, expert_demos,
                               log_dir=log_dir, **trainer_kwargs)
  return trainer
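# A rough usage sketch for `init_trainer` above. The rollout glob, log
# directory, and epoch count are assumptions for illustration;
# `util.rollout.load_trajectories` and `trainer.train(n_epochs=...)` are the
# same calls used elsewhere in this file.
def example_init_trainer_usage():
  expert_trajectories = util.rollout.load_trajectories(
      "expert_models/cartpole_0/rollouts/*.pkl")  # hypothetical glob
  trainer = init_trainer("CartPole-v1",
                         expert_trajectories,
                         log_dir="output/airl_example",  # hypothetical
                         use_gail=False)
  trainer.train(n_epochs=10)
  return trainer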
def init_trainer(
    env_id: str,
    rollout_glob: str,
    *,
    n_expert_demos: Optional[int] = None,
    seed: int = 0,
    log_dir: Optional[str] = None,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_n_files: int = 1,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    make_blank_policy_kwargs: dict = {},
):
  """Builds a Trainer, ready to be trained on a vectorized environment
  and expert demonstrations.

  Args:
    env_id: The string id of a gym environment.
    rollout_glob: Argument for `imitation.util.rollout.load_trajectories`.
    n_expert_demos: The number of expert trajectories to actually use after
      loading them via `load_trajectories`. If None, then use all available
      trajectories. If `n_expert_demos` is an `int`, then use exactly
      `n_expert_demos` trajectories, erroring if there aren't enough
      trajectories. If there are surplus trajectories, then use the first
      `n_expert_demos` trajectories and drop the rest.
    seed: Random seed.
    log_dir: Directory for logging output.
    use_gail: If True, then train using GAIL. If False, then train using AIRL.
    num_vec: The number of vectorized environments.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_n_files: If provided, then only load the most recent `max_n_files`
      files, as sorted by modification times.
    scale: If True, then scale input Tensors to the interval [0, 1].
    airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
      argument of `DiscrimNetAIRL.__init__`.
    trainer_kwargs: Arguments for the Trainer constructor.
    reward_kwargs: Arguments for the `*RewardNet` constructor.
    discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
    make_blank_policy_kwargs: Keyword arguments passed to `make_blank_policy`,
      used to initialize the trainer.
  """
  env = util.make_vec_env(env_id, num_vec, seed=seed, parallel=parallel,
                          log_dir=log_dir)
  gen_policy = util.init_rl(env, verbose=1, **make_blank_policy_kwargs)

  if use_gail:
    discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                         env.action_space,
                                         scale=scale,
                                         **discrim_kwargs)
  else:
    rn = BasicShapedRewardNet(env.observation_space,
                              env.action_space,
                              scale=scale,
                              **reward_kwargs)
    discrim = discrim_net.DiscrimNetAIRL(rn,
                                         entropy_weight=airl_entropy_weight,
                                         **discrim_kwargs)

  expert_demos = util.rollout.load_trajectories(rollout_glob,
                                                max_n_files=max_n_files)
  if n_expert_demos is not None:
    assert len(expert_demos) >= n_expert_demos
    expert_demos = expert_demos[:n_expert_demos]

  expert_rollouts = util.rollout.flatten_trajectories(expert_demos)[:3]
  trainer = Trainer(env, gen_policy, discrim, expert_rollouts,
                    **trainer_kwargs)
  return trainer
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  Checkpoints:
    At applicable training steps `step` (where step is either an integer or
    "final"):
      - Policies are saved to `{log_dir}/policies/{step}/`.
      - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
    env_name: The gym.Env name. Loaded as VecEnv.
    total_timesteps: Number of training timesteps in `model.learn()`.
    log_dir: The root directory to save metrics and checkpoints to.
    num_vec: Number of environments in VecEnv.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_episode_steps: If not None, then environments are wrapped by
      TimeLimit so that they have at most `max_episode_steps` steps per
      episode.
    normalize: If True, then rescale observations and reward.
    normalize_kwargs: kwargs for `VecNormalize`.
    init_rl_kwargs: kwargs for `init_rl`.
    n_episodes_eval: The number of episodes to average over when calculating
      the average ground truth reward return of the final policy.
    reward_type: If provided, then load the serialized reward of this type,
      wrapping the environment in this reward. This is useful to test whether
      a reward model transfers. For more information, see
      `imitation.rewards.serialize.load_reward`.
    reward_path: A specifier, such as a path to a file on disk, used by
      reward_type to load the reward model. For more information, see
      `imitation.rewards.serialize.load_reward`.
    rollout_save_interval: The number of training updates in between
      intermediate rollout saves. If the argument is nonpositive, then don't
      save intermediate updates.
    rollout_save_final: If True, then save rollouts right after training is
      finished.
    rollout_save_n_timesteps: The minimum number of timesteps saved in every
      file. Could be more than `rollout_save_n_timesteps` because trajectories
      are saved by episode rather than by transition. Must set exactly one of
      `rollout_save_n_timesteps` and `rollout_save_n_episodes`.
    rollout_save_n_episodes: The number of episodes saved in every file. Must
      set exactly one of `rollout_save_n_timesteps` and
      `rollout_save_n_episodes`.
    policy_save_interval: The number of training updates between saves. Has
      the same semantics as `rollout_save_interval`.
    policy_save_final: If True, then save the policy right after training is
      finished.
    init_tensorboard: If True, then write tensorboard logs to
      `{log_dir}/sb_tb` and "output/summary/...".

  Returns:
    The return value of `rollout_stats()` using the final policy.
  """
  os.makedirs(log_dir, exist_ok=True)
  sacred_util.build_sacred_symlink(log_dir, _run)

  sample_until = util.rollout.make_sample_until(rollout_save_n_timesteps,
                                                rollout_save_n_episodes)
  eval_sample_until = util.rollout.min_episodes(n_episodes_eval)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    if init_tensorboard:
      sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
      init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    log_callbacks = []
    with contextlib.ExitStack() as stack:
      if reward_type is not None:
        reward_fn_ctx = load_reward(reward_type, reward_path, venv)
        reward_fn = stack.enter_context(reward_fn_ctx)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        log_callbacks.append(venv.log_callback)
        tf.logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

      vec_normalize = None
      if normalize:
        venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

      policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

      # Make callback to save intermediate artifacts during training.
      step = 0

      def callback(locals_: dict, _) -> bool:
        nonlocal step
        step += 1
        policy = locals_['self']

        # TODO(adam): make logging frequency configurable
        for callback in log_callbacks:
          callback(sb_logger)

        if rollout_save_interval > 0 and step % rollout_save_interval == 0:
          save_path = osp.join(rollout_dir, f"{step}.pkl")
          util.rollout.save(save_path, policy, venv, sample_until)
        if policy_save_interval > 0 and step % policy_save_interval == 0:
          output_dir = os.path.join(policy_dir, f'{step:05d}')
          serialize.save_stable_model(output_dir, policy, vec_normalize)
        return True  # Continue training.

      policy.learn(total_timesteps, callback=callback)

      # Save final artifacts after training is complete.
      if rollout_save_final:
        save_path = osp.join(rollout_dir, "final.pkl")
        util.rollout.save(save_path, policy, venv, sample_until)
      if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)

      # Final evaluation of expert policy.
      trajs = util.rollout.generate_trajectories(policy, venv,
                                                 eval_sample_until)
      stats = util.rollout.rollout_stats(trajs)

  return stats
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: Optional[str] = None,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):
    - Policies are saved to `{log_dir}/policies/{step}/`.
    - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
    env_name: The gym.Env name. Loaded as VecEnv.
    total_timesteps: Number of training timesteps in `model.learn()`.
    log_dir: The root directory to save metrics and checkpoints to.
    num_vec: Number of environments in VecEnv.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    max_episode_steps: If not None, then environments are wrapped by
      TimeLimit so that they have at most `max_episode_steps` steps per
      episode.
    normalize: If True, then rescale observations and reward.
    make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
    reward_type: If provided, then load the serialized reward of this type,
      wrapping the environment in this reward. This is useful to test whether
      a reward model transfers. For more information, see
      `imitation.rewards.serialize.load_reward`.
    reward_path: A specifier, such as a path to a file on disk, used by
      reward_type to load the reward model. For more information, see
      `imitation.rewards.serialize.load_reward`.
    rollout_save_interval: The number of training updates in between
      intermediate rollout saves. If the argument is nonpositive, then don't
      save intermediate updates.
    rollout_save_final: If True, then save rollouts right after training is
      finished.
    rollout_save_n_timesteps: The minimum number of timesteps saved in every
      file. Could be more than `rollout_save_n_timesteps` because trajectories
      are saved by episode rather than by transition. Must set exactly one of
      `rollout_save_n_timesteps` and `rollout_save_n_episodes`.
    rollout_save_n_episodes: The number of episodes saved in every file. Must
      set exactly one of `rollout_save_n_timesteps` and
      `rollout_save_n_episodes`.
    policy_save_interval: The number of training updates between saves. Has
      the same semantics as `rollout_save_interval`.
    policy_save_final: If True, then save the policy right after training is
      finished.
  """
  _validate_traj_generate_params(rollout_save_n_timesteps,
                                 rollout_save_n_episodes)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir,
                             max_episode_steps=max_episode_steps)

    log_callbacks = []
    with contextlib.ExitStack() as stack:
      if reward_type is not None:
        reward_fn_ctx = load_reward(reward_type, reward_path, venv)
        reward_fn = stack.enter_context(reward_fn_ctx)
        venv = RewardVecEnvWrapper(venv, reward_fn)
        log_callbacks.append(venv.log_callback)
        tf.logging.info(
            f"Wrapped env in reward {reward_type} from {reward_path}.")

      vec_normalize = None
      if normalize:
        venv = vec_normalize = VecNormalize(venv)

      policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

      # Make callback to save intermediate artifacts during training.
      step = 0

      def callback(locals_: dict, _) -> bool:
        nonlocal step
        step += 1
        policy = locals_['self']

        # TODO(adam): make logging frequency configurable
        for callback in log_callbacks:
          callback(sb_logger)

        if rollout_save_interval > 0 and step % rollout_save_interval == 0:
          util.rollout.save(rollout_dir, policy, venv, step,
                            n_timesteps=rollout_save_n_timesteps,
                            n_episodes=rollout_save_n_episodes)
        if policy_save_interval > 0 and step % policy_save_interval == 0:
          output_dir = os.path.join(policy_dir, f'{step:05d}')
          serialize.save_stable_model(output_dir, policy, vec_normalize)
        return True  # Continue training.

      policy.learn(total_timesteps, callback=callback)

      # Save final artifacts after training is complete.
      if rollout_save_final:
        util.rollout.save(rollout_dir, policy, venv, "final",
                          n_timesteps=rollout_save_n_timesteps,
                          n_episodes=rollout_save_n_episodes)
      if policy_save_final:
        output_dir = os.path.join(policy_dir, "final")
        serialize.save_stable_model(output_dir, policy, vec_normalize)
def rollouts_and_policy(
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: Optional[str] = None,
    num_vec: int = 8,
    parallel: bool = False,
    normalize: bool = True,
    make_blank_policy_kwargs: dict = {},
    rollout_save_interval: int = 0,
    rollout_save_final: bool = False,
    rollout_save_n_timesteps: Optional[int] = None,
    rollout_save_n_episodes: Optional[int] = None,
    policy_save_interval: int = -1,
    policy_save_final: bool = True,
) -> None:
  """Trains an expert policy from scratch and saves the rollouts and policy.

  At applicable training steps `step` (where step is either an integer or
  "final"):
    - Policies are saved to `{log_dir}/policies/{step}/`.
    - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

  Args:
    env_name: The gym.Env name. Loaded as VecEnv.
    total_timesteps: Number of training timesteps in `model.learn()`.
    log_dir: The root directory to save metrics and checkpoints to.
    num_vec: Number of environments in VecEnv.
    parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
    normalize: If True, then rescale observations and reward.
    make_blank_policy_kwargs: Kwargs for `make_blank_policy`.
    rollout_save_interval: The number of training updates in between
      intermediate rollout saves. If the argument is nonpositive, then don't
      save intermediate updates.
    rollout_save_final: If True, then save rollouts right after training is
      finished.
    rollout_save_n_timesteps: The minimum number of timesteps saved in every
      file. Could be more than `rollout_save_n_timesteps` because trajectories
      are saved by episode rather than by transition. Must set exactly one of
      `rollout_save_n_timesteps` and `rollout_save_n_episodes`.
    rollout_save_n_episodes: The number of episodes saved in every file. Must
      set exactly one of `rollout_save_n_timesteps` and
      `rollout_save_n_episodes`.
    policy_save_interval: The number of training updates between saves. Has
      the same semantics as `rollout_save_interval`.
    policy_save_final: If True, then save the policy right after training is
      finished.
  """
  _validate_traj_generate_params(rollout_save_n_timesteps,
                                 rollout_save_n_episodes)

  with util.make_session():
    tf.logging.set_verbosity(tf.logging.INFO)
    sb_logger.configure(folder=osp.join(log_dir, 'rl'),
                        format_strs=['tensorboard', 'stdout'])

    rollout_dir = osp.join(log_dir, "rollouts")
    policy_dir = osp.join(log_dir, "policies")
    os.makedirs(rollout_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)

    venv = util.make_vec_env(env_name, num_vec, seed=_seed,
                             parallel=parallel, log_dir=log_dir)
    vec_normalize = None
    if normalize:
      venv = vec_normalize = VecNormalize(venv)

    policy = util.init_rl(venv, verbose=1, **make_blank_policy_kwargs)

    # Make callback to save intermediate artifacts during training.
    step = 0
    rollout_ok = rollout_save_interval > 0
    policy_ok = policy_save_interval > 0

    def callback(locals_: dict, _) -> bool:
      nonlocal step
      step += 1
      policy = locals_['self']
      if rollout_ok and step % rollout_save_interval == 0:
        util.rollout.save(rollout_dir, policy, venv, step,
                          n_timesteps=rollout_save_n_timesteps,
                          n_episodes=rollout_save_n_episodes)
      if policy_ok and step % policy_save_interval == 0:
        output_dir = os.path.join(policy_dir, f'{step:05d}')
        serialize.save_stable_model(output_dir, policy, vec_normalize)
      return True

    policy.learn(total_timesteps, callback=callback)

    # Save final artifacts after training is complete.
    if rollout_save_final:
      util.rollout.save(rollout_dir, policy, venv, "final",
                        n_timesteps=rollout_save_n_timesteps,
                        n_episodes=rollout_save_n_episodes)
    if policy_save_final:
      output_dir = os.path.join(policy_dir, "final")
      serialize.save_stable_model(output_dir, policy, vec_normalize)