import gym

from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset


def test_gail(expert_env):
    # expert_env is expected to yield an environment id and a path to
    # recorded expert trajectories (e.g. as a pytest fixture)
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()

    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
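The expert_path file consumed by ExpertDataset above is assumed to already exist. A minimal sketch of how such a trajectory file could be produced with stable-baselines' generate_expert_traj helper, using CartPole-v1 and a quickly trained PPO2 agent purely as placeholders, is:

import gym

from stable_baselines import PPO2
from stable_baselines.gail import generate_expert_traj

# Placeholder environment and "expert"; a real expert would train far longer.
env = gym.make('CartPole-v1')
expert = PPO2('MlpPolicy', env, verbose=0)
expert.learn(total_timesteps=10000)

# Roll out the expert and save the trajectories to expert_cartpole.npz,
# the .npz format that ExpertDataset loads via expert_path.
generate_expert_traj(expert, 'expert_cartpole', env=env, n_episodes=10)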
Example #2
import gym
import numpy as np

from stable_baselines import GAIL
from stable_baselines.common.vec_env import DummyVecEnv


def optimize_agent(trial):
    """ Train the model and return the mean episode reward.
        Optuna minimises the objective by default, so either negate this
        value or create the study with direction='maximize'.
    """
    # optimize_GAIL is assumed to be defined elsewhere and to return a dict
    # of GAIL hyperparameters sampled from the trial
    model_params = optimize_GAIL(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = GAIL("MlpPolicy", env, verbose=0, **model_params)
    print("DOING LEARING a2c")
    original_env.force_progression = False
    model.learn(int(2e4 * 5), seed=seed)
    print("DONE LEARING a2c")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward, step=n_episodes)  # Trial.report requires a step argument in current Optuna releases

    return last_reward
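The optimize_GAIL helper is not shown in this snippet. A hedged sketch of what it might look like, together with the Optuna study that would drive optimize_agent (created with direction='maximize' so the raw mean reward can be returned), follows; the parameter names and ranges are illustrative assumptions, not values from the original project:

import optuna


def optimize_GAIL(trial):
    """Illustrative GAIL hyperparameter sampling (assumed ranges)."""
    return {
        'gamma': trial.suggest_float('gamma', 0.9, 0.9999, log=True),
        'max_kl': trial.suggest_float('max_kl', 1e-4, 1e-2, log=True),
        'lam': trial.suggest_float('lam', 0.8, 1.0),
        'hidden_size_adversary': trial.suggest_categorical(
            'hidden_size_adversary', [64, 128, 256]),
    }


if __name__ == '__main__':
    # Maximise the mean episode reward returned by optimize_agent.
    study = optuna.create_study(direction='maximize')
    study.optimize(optimize_agent, n_trials=20)
    print('Best parameters:', study.best_params)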
Example #3

	end_time = time.time()
	print(colored('Training took {:.2f} minutes ({:.2f} hours)'.format(
		(end_time-start_time)/60, (end_time-start_time)/3600), 'red'))
	print(colored('Trained BC policy', 'red'))

else: #test
	print(colored('Trained on expert data from {}!'.format(args.exp_file),'red'))
	# exp_data = np.load(args.exp_file)
	print(colored('Testing learnt policy from model file {} for {} games!'.\
		format(args.model,int(args.num_test)),'red'))
	start_time = time.time()
	model = GAIL.load(args.model)
	env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
	g = 1
	obs = env.reset(ep=g)
	e_win_games = 0
	while True:
		action, _states = model.predict(obs)
		obs, rewards, done, e_win = env.step(action)
		if done:
			g += 1
			obs = env.reset(ep=g)
			if g % 100 == 0:
				print('Playing game {}'.format(g))
			if e_win:
				e_win_games += 1
			if g > args.num_test:
				break
	end_time = time.time()
	# print(colored('Expert evader had won {} games!'\
	# 	.format(len(exp_data['episode_returns'])),'red'))
	print(colored('BC Evader won {}/{} games = {:.2f}%!'.format(e_win_games,int(args.num_test),\
		e_win_games*100/args.num_test),'red'))
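Only the test branch of this script is shown; the train branch that produces args.model is cut off above. Under the same stable-baselines API as the earlier examples, a minimal sketch of that training step could look like the following (the timestep budget is a placeholder, and args refers to the script's truncated argparse setup):

import gym

from stable_baselines import GAIL
from stable_baselines.gail import ExpertDataset

# args.exp_file: expert trajectories (.npz); args.model: path the test branch loads
dataset = ExpertDataset(expert_path=args.exp_file, traj_limitation=-1, verbose=1)
env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0')
model = GAIL('MlpPolicy', env, expert_dataset=dataset, verbose=1)
model.learn(total_timesteps=int(1e6))  # placeholder budget
model.save(args.model)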