from math import log2

import stable_baselines3 as SBL3

# `dummy_env` and `cache_PPO` are module-level globals; a sketch of what they
# might look like follows this function.


def agent_PPO(prompt, num_legal_actions, num_possible_obs):
    dummy_env.set_meta(num_legal_actions, num_possible_obs)
    meta = (num_legal_actions, num_possible_obs)
    # The prompt is a flat (reward, observation, action, ...) history, so it
    # holds one observation per three entries (integer division keeps the
    # count an int).
    num_observs = (len(prompt) + 1) // 3
    # Train on the longest power-of-two-sized prefix, so a fresh PPO model is
    # only fit each time the history doubles; in between, the cached model is
    # reused.
    train_on_len = 3 * pow(2, int(log2(num_observs))) - 1
    train_on = prompt[:train_on_len]
    if (train_on, meta) not in cache_PPO:
        rewards = [train_on[i + 0] for i in range(0, train_on_len, 3)]
        observs = [train_on[i + 1] for i in range(0, train_on_len, 3)]
        dummy_env.set_rewards_and_observs(rewards, observs)
        n_steps = len(rewards) - 1
        if n_steps < 2:
            return 0
        A = SBL3.PPO('MlpPolicy', dummy_env, n_steps=n_steps,
                     batch_size=n_steps, seed=0)
        A.learn(len(rewards) - 1)
        cache_PPO[(train_on, meta)] = A
    else:
        A = cache_PPO[(train_on, meta)]
    action, _ = A.predict(prompt[-1])
    return action
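# A minimal sketch of the replay environment and cache that `agent_PPO`
# assumes. Everything here is inferred from the calls above (`set_meta`,
# `set_rewards_and_observs`); the real `dummy_env` may differ.
import gym
from gym import spaces


class ReplayEnv(gym.Env):
    """Replays a recorded (reward, observation) history, ignoring the agent's
    actions, so that PPO can be fit to a fixed prompt prefix."""

    def set_meta(self, num_legal_actions, num_possible_obs):
        self.action_space = spaces.Discrete(num_legal_actions)
        self.observation_space = spaces.Discrete(num_possible_obs)

    def set_rewards_and_observs(self, rewards, observs):
        self.rewards, self.observs = rewards, observs
        self.t = 0

    def reset(self):
        self.t = 0
        return self.observs[0]

    def step(self, action):
        self.t += 1
        done = self.t >= len(self.observs) - 1
        return self.observs[self.t], self.rewards[self.t], done, {}


# Module-level globals used by agent_PPO.
dummy_env = ReplayEnv()
cache_PPO = {}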
import stable_baselines3


def test_stable_PPO(env_name, request):
    env = request.getfixturevalue(env_name)
    model = stable_baselines3.PPO(
        'MlpPolicy', env, verbose=1, learning_rate=0.0003, n_steps=2048,
        batch_size=64, n_epochs=10, gamma=0.99, gae_lambda=0.95,
        clip_range=0.2, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5)
    # `TIMESTEPS` is a module-level constant defined elsewhere in the suite.
    model.learn(total_timesteps=TIMESTEPS)

    # Check model state.
    assert model.action_space == env.action_space
    assert model.env.action_space == env.action_space
    assert isinstance(model.policy,
                      stable_baselines3.common.policies.ActorCriticPolicy)

    # Check the model works for one step of the (pre-VecEnv) Gym API.
    obs = env.reset()
    a, _ = model.predict(obs)
    obs, reward, done, info = env.step(a)
    assert reward is not None and reward < 0
    assert a is not None
    assert isinstance(done, bool)
    assert info['timestep'] == 1

    env.close()
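# Illustrative wiring for the test above. The fixture name, env class, and
# TIMESTEPS value are assumptions for a self-contained example, not names
# from the source; the asserts above imply an env with negative rewards and
# an `info["timestep"]` counter, so a stock Gym env would not pass unchanged.
import gym
import numpy as np
import pytest
from gym import spaces

TIMESTEPS = 2048  # assumed value; enough for one PPO update with n_steps=2048


class _TimestepEnv(gym.Env):
    """Tiny env matching the asserts: negative reward, info["timestep"]."""

    action_space = spaces.Discrete(2)
    observation_space = spaces.Box(-1.0, 1.0, shape=(4,), dtype=np.float32)

    def reset(self):
        self.t = 0
        return self.observation_space.sample()

    def step(self, action):
        self.t += 1
        obs = self.observation_space.sample()
        return obs, -1.0, self.t >= 200, {"timestep": self.t}


@pytest.fixture
def timestep_env():
    return _TimestepEnv()

# The test is then parametrized over fixture names, e.g.:
# @pytest.mark.parametrize("env_name", ["timestep_env"])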
# Train BC on expert data.
# BC also accepts as `expert_data` any PyTorch-style DataLoader that iterates
# over dictionaries containing observations and actions.
logger.configure(tempdir_path / "BC/")
bc_trainer = bc.BC(venv.observation_space, venv.action_space,
                   expert_data=transitions)
bc_trainer.train(n_epochs=1)

# Train GAIL on expert data.
# GAIL and AIRL also accept as `expert_data` any PyTorch-style DataLoader that
# iterates over dictionaries containing observations, actions, and
# next_observations.
logger.configure(tempdir_path / "GAIL/")
gail_trainer = adversarial.GAIL(
    venv,
    expert_data=transitions,
    expert_batch_size=32,
    gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
)
gail_trainer.train(total_timesteps=2048)

# Train AIRL on expert data.
logger.configure(tempdir_path / "AIRL/")
airl_trainer = adversarial.AIRL(
    venv,
    expert_data=transitions,
    expert_batch_size=32,
    gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
)
airl_trainer.train(total_timesteps=2048)
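# The snippet above assumes `transitions`, `venv`, and `tempdir_path` are
# already defined. A sketch of that setup, mirroring the pattern used in
# `imitation_learning` below; the pickle path and env id are placeholders.
import pathlib
import pickle
import tempfile

import stable_baselines3 as sb3
from imitation.algorithms import adversarial, bc
from imitation.data import rollout
from imitation.util import logger, util

# Load pickled expert demonstrations (a list of
# `imitation.data.types.Trajectory`) and flatten them into transitions.
with open("expert_trajectories.pkl", "rb") as f:  # placeholder path
    trajectories = pickle.load(f)
transitions = rollout.flatten_trajectories(trajectories)

venv = util.make_vec_env("CartPole-v1", n_envs=2)  # placeholder env id

tempdir = tempfile.TemporaryDirectory(prefix="quickstart")
tempdir_path = pathlib.Path(tempdir.name)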
def imitation_learning(expert_traj_path, imitation_algo_name, rl_algo_name, env_name):
    # Load pickled expert demonstrations.
    with open(expert_traj_path, "rb") as f:
        # This is a list of `imitation.data.types.Trajectory`, where each
        # instance contains the observations and actions for a single expert
        # demonstration.
        trajectories = pickle.load(f)

    # Convert List[types.Trajectory] to an instance of
    # `imitation.data.types.Transitions`, a more general dataclass containing
    # unordered (observation, action, next_observation) transitions.
    transitions = rollout.flatten_trajectories(trajectories)

    venv = util.make_vec_env(env_name, n_envs=2)

    # tempdir = tempfile.TemporaryDirectory(prefix="il_results/{}_{}".format(rl_algo_name, env_name))
    # tempdir_path = pathlib.Path(tempdir.name)
    # print(f"All Tensorboards and logging are being written inside {tempdir_path}/.")
    log_path = "il_results/{}_{}/{}/".format(rl_algo_name, env_name, imitation_algo_name)

    if imitation_algo_name == 'BC':
        # Train BC on expert data. BC also accepts as `expert_data` any
        # PyTorch-style DataLoader that iterates over dictionaries containing
        # observations and actions.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        trainer = bc.BC(venv.observation_space, venv.action_space,
                        expert_data=transitions)
        trainer.train(n_epochs=100, log_interval=1)
    elif imitation_algo_name == 'GAIL':
        # Train GAIL on expert data, with a custom discriminator network.
        logger.configure(log_path, format_strs=["stdout", "tensorboard"])
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
            discrim_kwargs={
                'discrim_net': ActObsMLP(
                    action_space=venv.action_space,
                    observation_space=venv.observation_space,
                    hid_sizes=(32, 32),
                )
            },
        )
        gail_trainer.train(total_timesteps=2048)
        trainer = gail_trainer.gen_algo
    elif imitation_algo_name == 'AIRL':
        # Train AIRL on expert data.
        logger.configure(log_path)
        airl_trainer = adversarial.AIRL(
            venv,
            expert_data=transitions,
            expert_batch_size=32,
            gen_algo=sb3.PPO("MlpPolicy", venv, verbose=1, n_steps=1024),
        )
        airl_trainer.train(total_timesteps=2048)
        # Without this assignment, `trainer` would be unbound in the AIRL
        # branch and the evaluation below would fail.
        trainer = airl_trainer.gen_algo

    # Estimate the trained policy's mean return over at least 15 episodes.
    sample_until = rollout.min_episodes(15)
    trained_ret_mean = rollout.mean_return(trainer.policy, venv, sample_until)

    # trainer.save_policy("{}/bc_policy.pth.tar".format(log_path))
    th.save(trainer.policy, "{}/{}_policy.pth.tar".format(log_path, imitation_algo_name))
    return trained_ret_mean
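# Hypothetical invocation of the helper above; the trajectory path and env id
# are placeholders, not names from the source.
mean_ret = imitation_learning(
    expert_traj_path="expert_data/cartpole.pkl",
    imitation_algo_name="GAIL",
    rl_algo_name="PPO",
    env_name="CartPole-v1",
)
print("Mean return of the imitation policy:", mean_ret)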