import gym

from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset


def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92,
                 max_kl=0.001, expert_dataset=dataset,
                 hidden_size_adversary=64, verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
import gym
import numpy as np

from stable_baselines import GAIL
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.gail import ExpertDataset


def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    # Optionally load the expert trajectories into memory instead of
    # reading them lazily from disk.
    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None

    dataset = ExpertDataset(traj_data=traj_data, expert_path=expert_path,
                            traj_limitation=10, sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92,
                 max_kl=0.001, expert_dataset=dataset,
                 hidden_size_adversary=64, verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
from stable_baselines import GAIL


def load_bc_model_from_path(model_name):
    # NOTE: the lowest-loss and highest-accuracy models were also saved,
    # and can be found in the same directory with special suffixes.
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")
    return model, bc_params
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO

    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
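# A minimal usage sketch for load_model above; "gail_pendulum.zip" is a
# hypothetical checkpoint path produced by model.save(), not a file that
# ships with this code.
model = load_model("gail_pendulum.zip", "GAIL")
if model is None:
    raise ValueError("unsupported algorithm name")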
import tqdm

from stable_baselines import GAIL


def eval_with_standard_baselines(n_games, model_name, display=False):
    """Method to evaluate agent performance with stable-baselines
    infrastructure, just to make sure everything is compatible and
    integrating correctly."""
    bc_metadata = load_pickle(BC_SAVE_DIR + model_name + "/bc_metadata")
    bc_params = bc_metadata["bc_params"]
    model = GAIL.load(BC_SAVE_DIR + model_name + "/model")

    gym_env = init_gym_env(bc_params)

    tot_rew = 0
    for i in tqdm.trange(n_games):
        obs, _ = gym_env.reset()
        done = False
        while not done:
            # Both players are controlled by the same learned policy.
            ob0, ob1 = obs
            a0 = stable_baselines_predict_fn(model, ob0)
            a1 = stable_baselines_predict_fn(model, ob1)
            joint_action = (a0, a1)
            (obs, _), rewards, done, info = gym_env.step(joint_action)
            tot_rew += rewards

    print("avg reward", tot_rew / n_games)
    return tot_rew / n_games
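# The two behavior-cloning snippets above assume a load_pickle helper and a
# BC_SAVE_DIR constant from their surrounding module. A minimal sketch of
# what those might look like (the names and path here are assumptions, not
# part of stable-baselines):
import pickle

BC_SAVE_DIR = "data/bc_runs/"  # hypothetical save directory


def load_pickle(path):
    # Thin pickle wrapper, matching how the snippets above call it.
    with open(path, "rb") as f:
        return pickle.load(f)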
import gym
import tensorflow as tf

from stable_baselines import GAIL
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.gail import ExpertDataset


def train_agent_with_gail(load):
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs, layers=[128, 128])

    if not load:
        # Train GAIL from scratch on the recorded LQR expert trajectories.
        expert_data = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, expert_data, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT + "/trained_models/TDRL/f16/gail/128_128")
    else:
        model = GAIL.load(ROOT + "/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            # tf.all_variables() was removed in TF 1.x; use tf.global_variables()
            # to list the policy variables restored from the checkpoint.
            print(tf.global_variables())
    return model
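# The function above assumes "./lqr_export.npz" already exists and that ROOT
# is defined in the surrounding module. A hypothetical sketch of producing
# such a file with generate_expert_traj, which also accepts a plain callable
# as the expert; lqr_controller below is a stand-in, not the real LQR expert:
import gym

from stable_baselines.gail import generate_expert_traj

env = gym.make("F16GCAS-v0")


def lqr_controller(obs):
    # Stand-in expert: replace with the actual LQR control law.
    return env.action_space.sample()


# Writes ./lqr_export.npz in the format ExpertDataset expects.
generate_expert_traj(lqr_controller, 'lqr_export', env, n_episodes=10)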
                    type=int, help='Number of games to test.')
parser.add_argument('-s', '--save', default=True, type=bool)
# Note: argparse's type=bool treats any non-empty string (even 'False') as True.
args = parser.parse_args()

sys.path.append('/Users/cusgadmin/Documents/UCB/Academics/SSastry/'
                'Multi_agent_competition')
os.chdir('/Users/cusgadmin/Documents/UCB/Academics/SSastry/'
         'Multi_agent_competition/')

print(colored('Testing learnt policy from model file {} for {} games!'.format(
    args.model, args.num_test), 'red'))
start_time = time.time()

model = GAIL.load(args.model)
env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
g = 1
obs = env.reset(ep=g)
e_win_games = 0
env.render(mode='human', highlight=True, ep=g)

if args.save:
    # Record the rendered frames to an mp4 with matplotlib's FFMpegWriter.
    metadata = dict(title='Game')
    writer = FFMpegWriter(fps=5, metadata=metadata)
    writer.setup(env.window.fig, "test_game.mp4", 300)
    writer.grab_frame()

while True:
    action, _states = model.predict(obs)
    obs, rewards, done, e_win = env.step(action)
    env.render(mode='human', highlight=True, ep=g)
    if args.save:
import gym

from stable_baselines import GAIL, SAC
from stable_baselines.gail import ExpertDataset, generate_expert_traj

# Generate expert trajectories (train expert)
model = SAC('MlpPolicy', 'Pendulum-v0', verbose=1)
generate_expert_traj(model, 'expert_pendulum', n_timesteps=100, n_episodes=10)

# Load the expert dataset
dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)

model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1)
# Note: in practice, you need to train for 1M steps to have a working policy
model.learn(total_timesteps=100000)
model.save("gail_pendulum")

del model  # remove to demonstrate saving and loading

model = GAIL.load("gail_pendulum")

env = gym.make('Pendulum-v0')
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
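# Optional companion step: stable-baselines models inherit pretrain() from
# BaseRLModel, so the generator policy can be warm-started with behavior
# cloning on the same ExpertDataset before adversarial training. A minimal
# sketch; whether the warm start helps is task-dependent:
from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset

dataset = ExpertDataset(expert_path='expert_pendulum.npz', traj_limitation=10, verbose=1)
model = GAIL('MlpPolicy', 'Pendulum-v0', dataset, verbose=1)

model.pretrain(dataset, n_epochs=100)  # behavior-cloning warm start
model.learn(total_timesteps=100000)   # then adversarial imitation as usual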