import gym

from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset


def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    # Load a limited number of recorded expert trajectories from the .npz file
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy', env, adversary_entcoeff=0.0, lam=0.92, max_kl=0.001,
                 expert_dataset=dataset, hidden_size_adversary=64, verbose=0)
    model.learn(1000)
    model.save("GAIL-{}".format(env_id))

    # Reload the saved model and continue training
    model = GAIL.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    # Run the learnt policy for a few steps
    obs = env.reset()
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()

    del dataset, model
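# A minimal sketch of how the expert .npz consumed above could be produced.
# It assumes the trajectories are recorded with stable-baselines'
# generate_expert_traj helper; the PPO2 expert, the CartPole-v1 env and the
# step/episode counts are illustrative choices, not part of the test above.
import gym

from stable_baselines import PPO2
from stable_baselines.gail import generate_expert_traj

env = gym.make('CartPole-v1')
expert = PPO2('MlpPolicy', env, verbose=0)
expert.learn(10000)
# Writes expert_cartpole.npz, usable as expert_path in test_gail
generate_expert_traj(expert, save_path='expert_cartpole', n_episodes=10)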
import gym
import numpy as np

from stable_baselines import GAIL
from stable_baselines.common.vec_env import DummyVecEnv


def optimize_agent(trial):
    """
    Objective function for Optuna: train a GAIL agent with the sampled
    hyperparameters and return the mean reward over a few evaluation
    episodes. The study should be created with direction='maximize'
    (or this value negated if the study minimises).
    """
    model_params = optimize_GAIL(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)

    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])

    model = GAIL("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARNING GAIL")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("DONE LEARNING GAIL")
    original_env.max_invalid_tries = -1

    # Evaluate the trained policy over a few episodes
    rewards = []
    n_episodes, reward_sum = 0, 0.0
    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)
    return last_reward
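# optimize_GAIL(trial) is referenced above but not shown in this snippet.
# A minimal sketch, assuming it samples GAIL constructor kwargs with Optuna's
# suggest_* API; the parameter ranges below are illustrative assumptions,
# not the original search space.
import optuna


def optimize_GAIL(trial):
    """Sample a set of GAIL hyperparameters for one Optuna trial."""
    return {
        'adversary_entcoeff': trial.suggest_loguniform('adversary_entcoeff', 1e-4, 1e-1),
        'lam': trial.suggest_uniform('lam', 0.8, 1.0),
        'max_kl': trial.suggest_loguniform('max_kl', 1e-4, 1e-2),
        'hidden_size_adversary': trial.suggest_categorical('hidden_size_adversary', [32, 64, 128]),
    }


# optimize_agent returns a mean reward, so the study must maximise it.
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=20)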
          (end_time-start_time)/60, (end_time-start_time)/3600), 'red'))
    print(colored('Trained BC policy', 'red'))
else:  # test
    print(colored('Trained on expert data from {}!'.format(args.exp_file), 'red'))
    # exp_data = np.load(args.exp_file)
    print(colored('Testing learnt policy from model file {} for {} games!'.format(
        args.model, int(args.num_test)), 'red'))
    start_time = time.time()
    model = GAIL.load(args.model)
    env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
    g = 1
    obs = env.reset(ep=g)
    e_win_games = 0
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, e_win = env.step(action)
        if done:
            g += 1
            obs = env.reset(ep=g)
            if g % 100 == 0:
                print('Playing game {}'.format(g))
            if e_win:
                e_win_games += 1
            if g > args.num_test:
                break
    end_time = time.time()
    # print(colored('Expert evader had won {} games!'
    #               .format(len(exp_data['episode_returns'])), 'red'))
    print(colored('BC Evader won {}/{} games = {:.2f}%!'.format(
        e_win_games, int(args.num_test), e_win_games * 100 / args.num_test), 'red'))
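# The evaluation branch above reads args.exp_file, args.model and args.num_test
# from an argument parser that is not part of this excerpt. A hypothetical
# argparse setup matching those attributes (flag names and defaults are
# assumptions, and the original --train/--test switch is not shown):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--exp_file', type=str, help='recorded expert data (.npz)')
parser.add_argument('--model', type=str, help='saved GAIL model to evaluate')
parser.add_argument('--num_test', type=int, default=100, help='number of test games')
args = parser.parse_args()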