def trainer(_algorithm_cls, _parallel: bool, tmpdir: str, _convert_dataset: bool):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    trajs = types.load("tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    if _convert_dataset:
        trans = rollout.flatten_trajectories(trajs)
        expert_data = datasets.TransitionsDictDatasetAdaptor(trans)
    else:
        expert_data = rollout.flatten_trajectories(trajs)

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )
    gen_policy = util.init_rl(venv, verbose=1)

    return _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        gen_policy=gen_policy,
        log_dir=tmpdir,
    )
def trainer(request, session, venv):
    convert_dataset = request.param
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    if convert_dataset:
        data = datasets.TransitionsDictDatasetAdaptor(
            data, datasets.EpochOrderDictDataset)
    return bc.BC(venv.observation_space, venv.action_space, expert_data=data)
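# Hedged note: for both branches of the fixture above to be exercised, it is
# presumably parametrized over `convert_dataset`, e.g.:
#
# @pytest.fixture(params=[True, False])
# def trainer(request, session, venv):
#     ...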
def _load_all_demos(self):
    num_demos_by_round = []
    for round_num in range(self._last_loaded_round + 1, self.round_num + 1):
        round_dir = self._demo_dir_path_for_round(round_num)
        demo_paths = self._get_demo_paths(round_dir)
        self._all_demos.extend(_load_trajectory(p) for p in demo_paths)
        num_demos_by_round.append(len(demo_paths))
    tf.logging.info(f"Loaded {len(self._all_demos)} total")
    demo_transitions = rollout.flatten_trajectories(self._all_demos)
    return demo_transitions, num_demos_by_round
def test_train_from_random_dict_dataset(venv):
    # Make sure that we can construct a BC instance & train from a RandomDictDataset.
    rollouts = types.load(ROLLOUT_PATH)
    data = rollout.flatten_trajectories(rollouts)
    data = datasets.TransitionsDictDatasetAdaptor(data, datasets.RandomDictDataset)
    trainer = bc.BC(venv.observation_space, venv.action_space, expert_data=data)
    trainer.train(n_epochs=1)
def f(total_timesteps: int) -> types.Transitions:
    trajs = trajectory_callable(
        sample_until=rollout.min_timesteps(total_timesteps))
    trans = rollout.flatten_trajectories(trajs)
    assert len(trans) >= total_timesteps
    as_dict = dataclasses.asdict(trans)
    truncated = {k: arr[:total_timesteps] for k, arr in as_dict.items()}
    return dataclasses.replace(trans, **truncated)
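# Hedged usage sketch: `f` returns exactly `total_timesteps` transitions by
# truncating the flattened rollouts (this assumes `trajectory_callable` is
# bound in the enclosing scope, as in the original fixture).
trans = f(64)
assert len(trans) == 64  # Transitions defines __len__ as the number of steps.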
def test_potential_shaping_cycle(graph, session, venv, potential_cls,
                                 discount: float, num_episodes: int = 10) -> None:
    """Test that potential shaping is constant on any fixed-length cycle.

    Specifically, performs rollouts of a random policy in the environment,
    fixing the starting state for each trajectory at the all-zero state. Then
    computes episode returns and checks that they are all equal.

    Requires the environment to be fixed-length; otherwise the episode return
    will vary (except in the undiscounted case).
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    trajectories = rollout.generate_trajectories(
        policy, venv, sample_until=rollout.min_episodes(num_episodes))
    transitions = rollout.flatten_trajectories(trajectories)

    # Make initial state fixed as all-zero.
    # Note we don't need to change the final state, since `dones` being `True`
    # should force the potential to be zero at those states.
    obs = np.array(transitions.obs)
    idxs = np.where(transitions.dones)[0] + 1
    idxs = np.pad(idxs[:-1], (1, 0), "constant")
    obs[idxs, :] = 0
    transitions = dataclasses.replace(transitions, obs=obs)

    with graph.as_default(), session.as_default():
        reward_model = potential_cls(venv.observation_space, venv.action_space,
                                     discount=discount)
        session.run(tf.global_variables_initializer())
        rews = rewards.evaluate_models({"m": reward_model}, transitions)

    rets = rewards.compute_return_from_rews(rews, transitions.dones,
                                            discount=discount)["m"]
    if discount == 1.0:
        assert np.allclose(rets, 0.0, atol=1e-5)
    assert np.allclose(rets, np.mean(rets), atol=1e-5)
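# Hedged reasoning sketch for the test above: a potential-shaping reward has
# the form F(s, s') = discount * phi(s') - phi(s) (Ng et al., 1999), so the
# discounted episode return telescopes to
#     discount**T * phi(s_T) - phi(s_0).
# Zeroing the potential at terminal states and pinning s_0 to the all-zero
# state therefore makes the return identical across fixed-length episodes,
# which is exactly what the final `allclose` assertions check.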
def train_bc(env, experiment_path):
    """Train BC on rollouts in the experiment path, save checkpoints and
    evaluate those checkpoints.

    Based on code here:
    https://github.com/HumanCompatibleAI/imitation/blob/master/src/imitation/scripts/train_adversarial.py
    """
    rollout_file = os.path.join(experiment_path, ROLLOUTS_FILE)
    bc_model_directory = os.path.join(experiment_path, BC_MODEL_DIRECTORY)
    bc_log_directory = os.path.join(experiment_path, BC_LOG_DIRECTORY)

    if os.path.isdir(bc_log_directory):
        print("Skipping BC training (log directory exists)")
        return

    os.makedirs(bc_model_directory, exist_ok=True)
    os.makedirs(bc_log_directory, exist_ok=True)

    logger.configure(bc_log_directory)

    expert_trajs = types.load(rollout_file)
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    env = gym.make(env)
    trainer = BC(env.observation_space,
                 env.action_space,
                 expert_data=expert_transitions,
                 policy_class=MlpPolicy,
                 device="cpu",
                 ent_weight=0.0)
    env.close()

    def callback(locals):
        path = os.path.join(bc_model_directory,
                            "epoch_{}".format(locals["epoch_num"]))
        trainer.save_policy(path)

    trainer.save_policy(os.path.join(experiment_path, "start_bc"))
    trainer.train(BC_TRAIN_EPOCHS, on_epoch_end=callback)
    # Save trained policy.
    trainer.save_policy(os.path.join(experiment_path, "final_bc"))
def trainer(batch_size, venv, expert_data_type):
    rollouts = types.load(ROLLOUT_PATH)
    trans = rollout.flatten_trajectories(rollouts)
    if expert_data_type == "data_loader":
        expert_data = th_data.DataLoader(
            trans,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=types.transitions_collate_fn,
        )
    elif expert_data_type == "ducktyped_data_loader":
        expert_data = DucktypedDataset(trans, batch_size)
    elif expert_data_type == "transitions":
        expert_data = trans
    else:  # pragma: no cover
        raise ValueError(expert_data_type)

    return bc.BC(
        venv.observation_space,
        venv.action_space,
        expert_data=expert_data,
    )
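# Hedged sketch of what the "ducktyped_data_loader" branch above relies on:
# bc.BC does not require an actual DataLoader, only an iterable of dict
# batches with "obs" and "acts" tensors. A minimal stand-in (assuming that
# interface; the real test suite's `DucktypedDataset` may differ) could be:
import torch as th

class DucktypedDataset:
    """Iterable of Dict[str, Tensor] batches; deliberately not a DataLoader."""

    def __init__(self, transitions, batch_size: int):
        self.trans = transitions
        self.batch_size = batch_size

    def __iter__(self):
        for start in range(0, len(self.trans) - self.batch_size, self.batch_size):
            end = start + self.batch_size
            batch = dict(obs=self.trans.obs[start:end],
                         acts=self.trans.acts[start:end])
            yield {k: th.from_numpy(v) for k, v in batch.items()}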
def compute_return_of_models(
    models: Mapping[K, RewardModel],
    trajectories: Sequence[types.Trajectory],
    discount: float = 1.0,
) -> Mapping[K, np.ndarray]:
    """Computes the returns of each trajectory under each model.

    Args:
        models: A collection of reward models.
        trajectories: A sequence of trajectories.
        discount: The discount rate; defaults to undiscounted.

    Returns:
        A collection of NumPy arrays containing the returns from each model.
    """
    # Reward models are Markovian so only operate on a timestep at a time,
    # expecting input shape (batch_size, ) + {obs,act}_shape. Flatten the
    # trajectories to accommodate this.
    transitions = rollout.flatten_trajectories(trajectories)
    preds = evaluate_models(models, transitions)
    return compute_return_from_rews(preds, transitions.dones, discount)
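# Hedged usage sketch (`model_a` and `model_b` are hypothetical RewardModel
# instances; `trajectories` is a Sequence[types.Trajectory]):
rets = compute_return_of_models({"a": model_a, "b": model_b},
                                trajectories, discount=0.99)
assert rets["a"].shape == rets["b"].shape  # one return per trajectory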
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[types.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on expert demonstrations.

    Args:
        env_name: The string id of a gym environment.
        expert_trajectories: Demonstrations from expert.
        seed: Random seed.
        log_dir: Directory for logging output. Will generate a unique
            sub-directory within this directory for all output.
        use_gail: If True, then train using GAIL. If False, then train
            using AIRL.
        num_vec: The number of vectorized environments.
        parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
        max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper
            with this episode length before returning.
        scale: If True, then scale input Tensors to the interval [0, 1].
        airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
            argument of `DiscrimNetAIRL.__init__`.
        trainer_kwargs: Arguments for the Trainer constructor.
        reward_kwargs: Arguments for the `*RewardNet` constructor.
        discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
        init_rl_kwargs: Keyword arguments passed to `init_rl`, used to
            initialize the RL algorithm.
    """
    logger.configure(folder=log_dir, format_strs=["tensorboard", "stdout"])
    env = util.make_vec_env(
        env_name,
        num_vec,
        seed=seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env, gen_policy, discrim, expert_demos,
                                 log_dir=log_dir, **trainer_kwargs)
    return trainer
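# Hedged usage sketch: building and running a GAIL trainer from saved
# demonstrations. The rollout path, timestep budget, and `train()` signature
# are assumptions based on the surrounding snippets.
trajs = types.load("tests/data/expert_models/cartpole_0/rollouts/final.pkl")
gail_trainer = init_trainer("CartPole-v1", trajs, log_dir="/tmp/gail_example",
                            use_gail=True, num_vec=2)
gail_trainer.train(total_timesteps=2048)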
def convert_traj_to_coords_filtered(trajs: Sequence[types.Trajectory]):
    # `filter_trans_by_act` and `act` are free variables from the enclosing scope.
    trans = rollout.flatten_trajectories(trajs)
    obs = trans.obs
    if filter_trans_by_act:
        obs = obs[trans.acts == act]
    return obs[:, 0], obs[:, 1]
import pathlib
import pickle
import tempfile

from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.util import logger, util

# Load pickled test demonstrations.
with open("tests/data/expert_models/cartpole_0/rollouts/final.pkl", "rb") as f:
    # This is a list of `imitation.data.types.Trajectory`, where
    # every instance contains observations and actions for a single expert
    # demonstration.
    trajectories = pickle.load(f)

# Convert List[types.Trajectory] to an instance of
# `imitation.data.types.Transitions`. This is a more general dataclass
# containing unordered (observation, actions, next_observation) transitions.
transitions = rollout.flatten_trajectories(trajectories)

venv = util.make_vec_env("CartPole-v1", n_envs=2)

tempdir = tempfile.TemporaryDirectory(prefix="quickstart")
tempdir_path = pathlib.Path(tempdir.name)
print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")

# Train BC on expert data.
# BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates
# over dictionaries containing observations and actions.
logger.configure(tempdir_path / "BC/")
bc_trainer = bc.BC(venv.observation_space, venv.action_space,
                   expert_data=transitions)
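# Continuation sketch (hedged: the epoch and episode counts are illustrative,
# and "return_mean" as a `rollout_stats` key is an assumption): train the BC
# policy and estimate its mean return.
bc_trainer.train(n_epochs=1)
sample_until = rollout.min_episodes(3)
trajs = rollout.generate_trajectories(bc_trainer.policy, venv,
                                      sample_until=sample_until)
print("BC mean return:", rollout.rollout_stats(trajs)["return_mean"])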
def expert_transitions():
    trajs = types.load("tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    trans = rollout.flatten_trajectories(trajs)
    return trans
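# Hedged usage sketch: `Transitions` is a dataclass of aligned arrays, so a
# consumer of this helper can index its fields directly.
trans = expert_transitions()
assert len(trans.obs) == len(trans.acts) == len(trans.dones)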
def make_trainer():
    env_name = "CartPole-v1"
    env = util.make_vec_env(env_name, 2)
    rollouts = types.load(ROLLOUT_PATH)
    rollouts = rollout.flatten_trajectories(rollouts)
    return bc.BCTrainer(env, expert_demos=rollouts)
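# Hedged usage sketch: `BCTrainer` is the older (pre-`bc.BC`) API used in this
# snippet; the exact `train()` signature here is an assumption.
trainer = make_trainer()
trainer.train(n_epochs=1)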
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name,
                       env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where
        # every instance contains observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)

    # Convert List[types.Trajectory] to an instance of
    # `imitation.data.types.Transitions`. This is a more general dataclass
    # containing unordered (observation, actions, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name,
                                             imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data.
        # BC also accepts as `expert_data` any PyTorch-style DataLoader that
        # iterates over dictionaries containing observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space, venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)
    elif imitation_algo_name == 'GAIL':
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net': ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            })
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        # Mirror the GAIL branch so `trainer` is defined below; the original
        # omitted this assignment, which would raise a NameError for AIRL.
        trainer = airl_trainer.gen_algo

    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)
    th.save(trainer.policy,
            "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))
    return trained_ret_mean
def train(
    _run,
    _seed: int,
    algorithm: str,
    env_name: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    rollout_path: str,
    n_expert_demos: Optional[int],
    log_dir: str,
    total_timesteps: int,
    n_episodes_eval: int,
    init_tensorboard: bool,
    checkpoint_interval: int,
    gen_batch_size: int,
    init_rl_kwargs: Mapping,
    algorithm_kwargs: Mapping[str, Mapping],
    discrim_net_kwargs: Mapping[str, Mapping],
) -> dict:
    """Train an adversarial-network-based imitation learning algorithm.

    Checkpoints:
        - DiscrimNets are saved to `f"{log_dir}/checkpoints/{step}/discrim/"`,
          where step is either the training round or "final".
        - Generator policies are saved to
          `f"{log_dir}/checkpoints/{step}/gen_policy/"`.

    Args:
        _seed: Random seed.
        algorithm: A case-insensitive string determining which adversarial
            imitation learning algorithm is executed. Either "airl" or "gail".
        env_name: The environment to train in.
        num_vec: Number of `gym.Env` to vectorize.
        parallel: Whether to use "true" parallelism. If True, then use
            `SubProcVecEnv`. Otherwise, use `DummyVecEnv` which steps through
            environments serially.
        max_episode_steps: If not None, then a TimeLimit wrapper is applied to
            each environment to artificially limit the maximum number of
            timesteps in an episode.
        rollout_path: Path to pickle containing list of Trajectories. Used as
            expert demonstrations.
        n_expert_demos: The number of expert trajectories to actually use
            after loading them from `rollout_path`. If None, then use all
            available trajectories. If `n_expert_demos` is an `int`, then use
            exactly `n_expert_demos` trajectories, erroring if there aren't
            enough trajectories. If there are surplus trajectories, then use
            the first `n_expert_demos` trajectories and drop the rest.
        log_dir: Directory to save models and other logging to.
        total_timesteps: The number of transitions to sample from the
            environment during training.
        n_episodes_eval: The number of episodes to average over when
            calculating the average episode reward of the imitation policy
            for return.
        init_tensorboard: If True, then write tensorboard logs to
            `{log_dir}/sb_tb`.
        checkpoint_interval: Save the discriminator and generator models every
            `checkpoint_interval` rounds and after training is complete. If 0,
            then only save weights after training is complete. If <0, then
            don't save weights at all.
        gen_batch_size: Batch size for generator updates. Sacred automatically
            uses this to calculate `n_steps` in `init_rl_kwargs`. In the
            script body, this is only used in sanity checks.
        init_rl_kwargs: Keyword arguments for `init_rl`, the RL algorithm
            initialization utility function.
        algorithm_kwargs: Keyword arguments for the `GAIL` or `AIRL`
            constructor that can apply to either constructor. Unlike a regular
            kwargs argument, this argument can only have the following keys:
            "shared", "airl", and "gail". `algorithm_kwargs["airl"]`, if it is
            provided, is a kwargs `Mapping` passed to the `AIRL` constructor
            when `algorithm == "airl"`. Likewise `algorithm_kwargs["gail"]` is
            passed to the `GAIL` constructor when `algorithm == "gail"`.
            `algorithm_kwargs["shared"]`, if provided, is passed to both the
            `AIRL` and `GAIL` constructors. Duplicate keyword argument keys
            between `algorithm_kwargs["shared"]` and `algorithm_kwargs["airl"]`
            (or "gail") lead to an error.
        discrim_net_kwargs: Keyword arguments for the `DiscrimNet`
            constructor. Unlike a regular kwargs argument, this argument can
            only have the following keys: "shared", "airl", "gail". These keys
            have the same meaning as they do in `algorithm_kwargs`.
    Returns:
        A dictionary with two keys. "imit_stats" gives the return value of
        `rollout_stats()` on rollouts from the test-reward-wrapped
        environment, using the final policy (remember that the ground-truth
        reward can be recovered from the "monitor_return" key). "expert_stats"
        gives the return value of `rollout_stats()` on the expert
        demonstrations loaded from `rollout_path`.
    """
    if gen_batch_size % num_vec != 0:
        raise ValueError(
            f"num_vec={num_vec} must evenly divide gen_batch_size={gen_batch_size}."
        )

    allowed_keys = {"shared", "gail", "airl"}
    if not discrim_net_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid discrim_net_kwargs.keys()={discrim_net_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )
    if not algorithm_kwargs.keys() <= allowed_keys:
        raise ValueError(
            f"Invalid algorithm_kwargs.keys()={algorithm_kwargs.keys()}. "
            f"Allowed keys: {allowed_keys}"
        )

    if not os.path.exists(rollout_path):
        raise ValueError(f"File at rollout_path={rollout_path} does not exist.")

    expert_trajs = types.load(rollout_path)
    if n_expert_demos is not None:
        if not len(expert_trajs) >= n_expert_demos:
            raise ValueError(
                f"Want to use n_expert_demos={n_expert_demos} trajectories, but "
                f"only {len(expert_trajs)} are available via {rollout_path}."
            )
        expert_trajs = expert_trajs[:n_expert_demos]
    expert_transitions = rollout.flatten_trajectories(expert_trajs)

    total_timesteps = int(total_timesteps)

    logging.info("Logging to %s", log_dir)
    logger.configure(log_dir, ["tensorboard", "stdout"])
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    # if init_tensorboard:
    #     tensorboard_log = osp.join(log_dir, "sb_tb")
    # else:
    #     tensorboard_log = None

    gen_algo = util.init_rl(
        # FIXME(sam): ignoring tensorboard_log is a hack to prevent SB3 from
        # re-configuring the logger (SB3 issue #109). See init_rl() for details.
        # TODO(shwang): Let's get rid of init_rl after SB3 issue #109 is fixed?
        # Besides sidestepping #109, init_rl is just a stub function.
        venv,
        **init_rl_kwargs,
    )

    discrim_kwargs_shared = discrim_net_kwargs.get("shared", {})
    discrim_kwargs_algo = discrim_net_kwargs.get(algorithm, {})
    final_discrim_kwargs = dict(**discrim_kwargs_shared, **discrim_kwargs_algo)

    algorithm_kwargs_shared = algorithm_kwargs.get("shared", {})
    algorithm_kwargs_algo = algorithm_kwargs.get(algorithm, {})
    final_algorithm_kwargs = dict(
        **algorithm_kwargs_shared,
        **algorithm_kwargs_algo,
    )

    if algorithm.lower() == "gail":
        algo_cls = adversarial.GAIL
    elif algorithm.lower() == "airl":
        algo_cls = adversarial.AIRL
    else:
        raise ValueError(f"Invalid value algorithm={algorithm}.")

    trainer = algo_cls(
        venv=venv,
        expert_data=expert_transitions,
        gen_algo=gen_algo,
        log_dir=log_dir,
        discrim_kwargs=final_discrim_kwargs,
        **final_algorithm_kwargs,
    )

    def callback(round_num):
        if checkpoint_interval > 0 and round_num % checkpoint_interval == 0:
            save(trainer, os.path.join(log_dir, "checkpoints", f"{round_num:05d}"))

    trainer.train(total_timesteps, callback)

    # Save final artifacts.
    if checkpoint_interval >= 0:
        save(trainer, os.path.join(log_dir, "checkpoints", "final"))

    # Final evaluation of imitation policy.
    results = {}
    sample_until_eval = rollout.min_episodes(n_episodes_eval)
    trajs = rollout.generate_trajectories(
        trainer.gen_algo, trainer.venv_train_norm, sample_until=sample_until_eval
    )
    results["expert_stats"] = rollout.rollout_stats(expert_trajs)
    results["imit_stats"] = rollout.rollout_stats(trajs)
    return results
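# For reference (hedged): `rollout_stats` returns aggregate statistics such as
# "n_traj", "return_mean", and "return_std", so the two entries above can be
# compared directly, e.g.
#   results["imit_stats"]["return_mean"] vs results["expert_stats"]["return_mean"]
# The exact key set is an assumption.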