import gym
from collections import deque

# DQNAgent is assumed to live in a local module (the `agents` import used
# elsewhere in this project); adjust the path to match the actual package.
from agents import DQNAgent


def main(num_episodes, render=False):
    # initialize the gym environment and the agent
    # env = gym.make('SpaceInvaders-v0')
    env = gym.make('Breakout-v0')
    state = env.reset()

    # the agent sees a stack of the 5 most recent frames, so the channel
    # dimension of the input shape is multiplied by 5
    state_shape = list(state.shape)
    state_shape[-1] = state_shape[-1] * 5
    agent = DQNAgent(state_shape, env.action_space.n)

    states = deque(maxlen=5)
    max_train_time = 800

    # iterate over episodes
    for e in range(num_episodes):
        # reset the state at the beginning of each game and fill the frame stack
        state = env.reset()
        for i in range(5):
            states.appendleft(state)

        # time_t represents each frame of the game
        num_random = 0
        total_reward = 0.
        for time_t in range(max_train_time):
            # turn this on if you want to render
            if render:
                env.render()

            # decide on an action (epsilon-greedy inside the agent)
            action = agent.act(states)
            if agent.acted_randomly:
                num_random += 1

            # advance the game to the next frame based on the action
            next_state, reward, done, info = env.step(action)
            total_reward += reward

            # remember the previous state, action, reward, and done flag
            agent.remember(states.copy(), action, reward, next_state, done)

            # make next_state the newest frame in the stack
            states.appendleft(next_state)

            # done becomes True when the game ends
            if done:
                # print the score and break out of the loop
                rand_perc = num_random / float(time_t + 1) * 100.  # percentage of random actions
                print("episode: {}/{}, training_time: {}, summed_reward: {}, random_actions: {}%, eps: {}"
                      .format(e, num_episodes, time_t, total_reward, rand_perc, agent.epsilon))
                # train the agent with the experience of the episode
                agent.replay(min(100, time_t))
                break

        # print("epsilon {}".format(agent.epsilon))
        if e % 1000 == 0:
            agent.save("./deep_q_model.h5")
            print("saved model")
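# A minimal entry point for the training loop above. This is only a usage
# sketch: the episode count and render flag are assumed defaults, not values
# taken from the original script (which may parse them from the command line).
if __name__ == "__main__":
    main(num_episodes=10000, render=False)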
import gym
import numpy as np

from agents import QAgent, Agent, RandomAgent, DQNAgent

env = gym.make('LunarLander-v2')
num_episodes = 5000
print_every = 100  # logging interval; value assumed, it was not defined in the original snippet

# LunarLander-v2 has a Box observation space, so the state size comes from its
# shape rather than from `.n` (which only exists for Discrete spaces)
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)

average_reward = []
for episode in range(num_episodes):
    rewards = []
    state = env.reset()
    while True:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        rewards.append(reward)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        if done:
            average_reward.append(np.sum(rewards))
            break

    # monitor progress
    if episode % print_every == 0:
        reward_last_100 = int(np.mean(average_reward[-100:]))  # mean return over the last 100 episodes
        learning_rate = agent.scheduler.get_lr().squeeze()
        print(
            f"Episode {episode}/{num_episodes}, eps: {agent.epsilon:.3f}, "
            f"lr: {learning_rate}, reward: {reward_last_100}")
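# Optional follow-up: plot the per-episode returns collected in
# `average_reward` above. This is a sketch that assumes matplotlib is
# installed; it is not part of the original training script.
import matplotlib.pyplot as plt

plt.plot(average_reward, label="episode return")
plt.xlabel("episode")
plt.ylabel("summed reward")
plt.legend()
plt.savefig("lunarlander_returns.png")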
import sys
from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment  # assumes the `unityagents` package used for the Banana environment

# DQNAgent is assumed to live in a local module; adjust the import path to the project.
from agents import DQNAgent

NUM_EPISODES = 100   # number of evaluation episodes (value assumed; not given in the original snippet)
DEFAULT_EPS = 0.05   # small epsilon for near-greedy play (value assumed)


class Player:
    """Player implementation of DQN and random agents."""

    def __init__(self):
        """Load the Unity Banana environment and the trained DQN agent."""
        self.env = UnityEnvironment(
            file_name="../env/Banana_Linux_NoVis/Banana.x86_64")
        self.brain_name = self.env.brain_names[0]
        brain = self.env.brains[self.brain_name]

        # reset the environment
        env_info = self.env.reset(train_mode=False)[self.brain_name]

        # number of actions
        self.action_size = brain.vector_action_space_size

        # examine the state space
        state = env_info.vector_observations[0]
        state_size = len(state)

        self.agent = DQNAgent(state_size, self.action_size, seed=0)
        self.agent.local_network.load_state_dict(
            torch.load('../saved_models/dqn_banana_best.pth'))

    def play(self):
        """Play using the best DQN agent."""
        scores = []
        scores_window = deque(maxlen=10)
        best_score = -np.inf
        eps = DEFAULT_EPS
        for i in range(NUM_EPISODES):
            env_info = self.env.reset(train_mode=False)[self.brain_name]
            state = env_info.vector_observations[0]
            score = 0
            while True:
                action = self.agent.act(state, eps)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                score += reward
                state = next_state
                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            best_score = max(best_score, score)  # track the best episode score
            if i % 10 == 0:
                print('\rProgress: {}/{}, avg score: {:.2f}'.format(
                    i, NUM_EPISODES, np.mean(scores_window)), end="")
                sys.stdout.flush()
        return scores, best_score

    def play_random(self):
        """Play by choosing random actions."""
        scores = []
        scores_window = deque(maxlen=10)
        best_score = -np.inf
        for i in range(NUM_EPISODES):
            env_info = self.env.reset(train_mode=False)[self.brain_name]
            state = env_info.vector_observations[0]
            score = 0
            while True:
                action = np.random.randint(self.action_size)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations[0]
                reward = env_info.rewards[0]
                done = env_info.local_done[0]
                score += reward
                state = next_state
                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            best_score = max(best_score, score)  # track the best episode score
            if i % 10 == 0:
                print('\rProgress: {}/{}, avg score: {:.2f}'.format(
                    i, NUM_EPISODES, np.mean(scores_window)), end="")
                sys.stdout.flush()
        return scores, best_score
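# Usage sketch for the Player class above (assumes the Banana environment and
# the saved checkpoint referenced in __init__ are available on this machine).
if __name__ == "__main__":
    player = Player()
    dqn_scores, dqn_best = player.play()
    random_scores, random_best = player.play_random()
    print("\nDQN    - mean score: {:.2f}, best episode: {:.2f}".format(
        np.mean(dqn_scores), dqn_best))
    print("Random - mean score: {:.2f}, best episode: {:.2f}".format(
        np.mean(random_scores), random_best))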