def run_experiment(Lambda, alpha, twe, trunc_normal, subspaces, num_runs,
                   num_episodes=20000, num_procs=None, name=""):
    """Train ``num_runs`` fresh agents and save their per-episode returns.

    Every run builds a new ``PolicyGradientAgent`` on the module-level
    ``simulator``, trains it through a ``Framework`` and copies
    ``framework.returns`` into one row of the result array.  The array is
    appended to the module-level ``results`` list before training starts
    (so that list sees the rows as they are filled in) and is written to
    ``data/<name>/<random integer>.npy`` after the last run.

    Args:
        Lambda: Forwarded to the agent as ``Lambda``.
        alpha: Forwarded to the agent as both ``alpha_u`` and ``alpha_v``.
        twe: Forwarded to the agent as ``tile_weight_exponent``.
        trunc_normal: Forwarded to the agent as ``trunc_normal``.
        subspaces: Forwarded to the agent as ``subspaces``.
        num_runs: Number of independent training runs (rows of the result).
        num_episodes: Episodes per run (columns of the result); also passed
            to ``Framework``.
        num_procs: Forwarded to ``Framework.train``.
        name: Label printed at the start of each run and used as the
            sub-directory of ``data/`` that receives the output file.

    Returns:
        The ``(num_runs, num_episodes)`` float64 array of returns.

    Raises:
        OSError: If the output directory cannot be created and does not
            already exist.
    """
    returns = np.empty((num_runs, num_episodes), dtype=np.float64)
    # Registered up front: the list holds a reference, so rows written below
    # are visible through ``results`` as well.
    results.append(returns)

    # ``range`` and ``print(...)`` with a single argument behave the same on
    # Python 2 and Python 3.
    for i in range(num_runs):
        print(name)
        agent = PolicyGradientAgent(simulator,
                                    Lambda=Lambda,
                                    alpha_u=alpha,
                                    alpha_v=alpha,
                                    tile_weight_exponent=twe,
                                    trunc_normal=trunc_normal,
                                    subspaces=subspaces)
        agent.persist_state()
        framework = Framework(simulator, agent, num_episodes=num_episodes)
        framework.train(num_procs=num_procs)
        returns[i] = framework.returns

    # NOTE(review): the original line layout was lost; saving once after all
    # runs (rather than inside the loop) is assumed - confirm.
    # A random file name keeps separate invocations with the same ``name``
    # from overwriting each other's output.
    run_id = np.random.randint(sys.maxsize)
    directory = 'data/%s/' % (name)
    filename = directory + ('%d.npy' % (run_id))

    try:
        os.makedirs(directory)
    except OSError:
        # An already-existing directory is fine; any other failure
        # (permissions, invalid path, ...) must not be silently ignored.
        if not os.path.isdir(directory):
            raise

    np.save(filename, returns)
    return returns
import os
import gym
import numpy as np
from PolicyGradientAgent import PolicyGradientAgent

# Set before the environment and the agent are created. "-1" is the
# conventional value for hiding all GPUs from CUDA-based libraries
# (presumably to force CPU execution - confirm this is intended).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

env = gym.make('LunarLander-v2')
action_size = env.action_space.n
# NOTE(review): hard-coded; presumably the length of a LunarLander-v2
# observation - confirm against env.observation_space.
state_size = 8
agent = PolicyGradientAgent(state_size, action_size)

print("Training...")
train_episodes = 5000
# avg_score and loss are initialised here but not used in the visible part
# of the script.
avg_score = 0
loss = 0
for episode in range(train_episodes):
    state = env.reset()
    # Reshape the observation to a (1, state_size) array before it is handed
    # to the agent.
    state = np.reshape(state, [1, state_size])
    cum_reward = 0
    # At most 1000 environment steps per episode.
    for i in range(1000):
        action = agent.act(state, is_training=True)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # NOTE(review): no reward accumulation, state hand-over, `done`
        # check or learning step is visible after this point - confirm the
        # remainder of the loop body.
# NOTE(review): `env` is created before the visible part of this script.
# Print the observation shape and the number of discrete actions.
print(env.observation_space.shape, env.action_space.n)
env.reset()

# Run configuration.
test_mode = False  # when True, a previously saved model is loaded below
num_games = 2500
best_score = -np.inf  # lower than any real score
scores = []
eps_history = []

# Agent dimensions are taken from the environment.
state_dims = env.observation_space.shape[0]
num_actions = env.action_space.n
lr = 0.001
gamma = 0.99
agent = PolicyGradientAgent(lr=lr, gamma=gamma, state_dims=state_dims,
                            num_actions=num_actions, env_name='lunar_lander',
                            checkpoint_dir='temp/')
if test_mode:
    agent.load_model()

# Disabled: wrap the environment in a Monitor writing to 'temp/lunar_lander'.
# env = gym.wrappers.Monitor(env, 'temp/lunar_lander',
#                            video_callable=lambda episode_id: True, force=True)

for count in range(num_games):
    state = env.reset()
    done = False
    score = 0
    while not done:
        # Rendering is requested on every step of every game.
        env.render()
        # NOTE(review): nothing visible here steps the environment or
        # updates `done` - confirm the remainder of the loop body.
def make_framework(Lambda):
    """Return a ``Framework`` wrapping a new agent for the given ``Lambda``.

    A ``PolicyGradientAgent`` is created on the module-level ``simulator``.
    Its state is saved to ``data/saved_state_lambda<Lambda>.npy`` and
    immediately loaded back from that file with ``mmap_mode='r+'``
    (presumably so the state becomes a writable memory map of the file -
    confirm against ``PolicyGradientAgent.load_state``).

    Args:
        Lambda: Forwarded to the agent; its ``str()`` form is also embedded
            in the state file name.

    Returns:
        ``Framework(simulator, agent)`` for the prepared agent.
    """
    learner = PolicyGradientAgent(simulator, Lambda=Lambda)

    # One state file per Lambda value.
    state_path = ''.join(['data/saved_state_lambda', str(Lambda), '.npy'])

    # Save, then re-open the same file.
    learner.save_state(state_path)
    learner.load_state(state_path, mmap_mode='r+')

    return Framework(simulator, learner)