def train(agent: DQNAgent, env: Env, episodes: int = 10_000):
    display = False
    progression = tqdm.trange(episodes, desc=f"Training {agent.name}",
                              unit="episode")
    fps = 0
    for episode in progression:
        state = env.reset()
        mean_reward = 0
        return_ = 0
        x_pos = 0
        for step in count(1):
            t = time()
            # Act, store the transition, then learn from the replay buffer.
            action = agent.act(np.asarray(state), explore=True)
            next_state, reward, done, info = env.step(action)
            agent.memorize(
                Experience((state, next_state, action, done, reward)))
            state = next_state
            agent.learn()
            # Running statistics for the progress bar.
            mean_reward += (reward - mean_reward) / step
            return_ += reward
            x_pos = max(x_pos, info["x_pos"])
            fps = fps * 0.9 + 0.1 / (time() - t)
            # Every 100 steps, re-read display.yml to toggle rendering.
            if not step % 100:
                try:
                    display = (yaml.safe_load(
                        (PROJECT_DIRECTORY / "display.yml").read_text()).get(
                            agent.name, {}).get("display", False))
                except Exception:
                    pass
            if display:
                env.render()
            if done or info["flag_get"]:
                break
        progression.set_description(
            f"Training {agent.name}; "
            f"Frames: {agent.step} ({fps:.0f} FPS); "
            f"last progression: {x_pos} ({x_pos/3260:.1%}); "
            f"eps: {agent.eps:.2f}")
        agent.register_episode(
            EpisodeMetrics(episode=episode, x_pos=x_pos, return_=return_,
                           steps=step))
        agent.save_model()
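# The loop above relies on `Experience` and `EpisodeMetrics` types defined
# elsewhere in the project. Their real definitions are not shown here, so the
# following is only a minimal sketch inferred from how they are used above
# (field names are assumptions).
from dataclasses import dataclass


class Experience:
    """One transition, built from a (state, next_state, action, done, reward) tuple."""

    def __init__(self, transition):
        (self.state, self.next_state, self.action,
         self.done, self.reward) = transition


@dataclass
class EpisodeMetrics:
    episode: int
    x_pos: int
    return_: float
    steps: int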
def main():
    env = UnityEnvironment(
        file_name="/home/faten/projects/deep-reinforcement-learning/p1_navigation/Banana_Linux/Banana.x86_64")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    state_size = len(state)

    agent = DQNAgent(state_size, action_size, seed=0)
    scores = train(env, agent)

    # plot the per-episode scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(np.arange(len(scores)), scores)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()

    # reload the trained weights and watch the agent for a few episodes
    agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
    for i in range(3):
        state = env.reset()
        for j in range(200):
            action = agent.act(state)
            env.render()
            state, reward, done, _ = env.step(action)
            if done:
                break
    env.close()
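# Hypothetical entry point; the original snippet defines main() but does not
# show how it is invoked.
if __name__ == "__main__":
    main()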
test_scores_i = []
avg_scores = []
scores = []  # assumed initialisation; per-episode scores appended to below
scores_window = deque(maxlen=100)
config = generate_configuration_qnet(action_size, state_size)
agent = DQNAgent(config)
agent.create_dirs()
eps = config.eps_start
for i_episode in range(1, config.n_episodes + 1):
    # Reset the environment and the score
    env_info = env.reset(train_mode=True)[brain_name]
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = agent.act(state, eps)
        env_info = env.step(action)[brain_name]
        next_state, reward, done = (env_info.vector_observations[0],
                                    env_info.rewards[0],
                                    env_info.local_done[0])
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    # Book-keeping: rolling 100-episode window and epsilon decay
    scores_window.append(score)
    scores.append(score)
    avg_scores.append(np.mean(scores_window))
    eps = max(config.eps_min, config.eps_decay * eps)
    print('\rEpisode {}\tEps {:.2f}\tLast Score: {:.2f}\tAverage Score: {:.2f}'
          .format(i_episode, eps, score, np.mean(scores_window)),
          end="")
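# `generate_configuration_qnet` comes from the surrounding project and is not
# shown here. The loop above only reads the fields accessed in it, so a
# minimal stand-in might look like this; the field names mirror those
# attribute accesses, while the default values are purely illustrative
# assumptions.
from dataclasses import dataclass


@dataclass
class QNetConfig:
    action_size: int
    state_size: int
    n_episodes: int = 2000
    eps_start: float = 1.0
    eps_min: float = 0.01
    eps_decay: float = 0.995


def generate_configuration_qnet(action_size, state_size):
    return QNetConfig(action_size=action_size, state_size=state_size)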
reward_history = []
step_history = []  # assumed initialisation; appended to at the end of each episode
nb_episodes = 1000
episode_reward_average = -1
with tqdm.trange(nb_episodes) as t:
    for episode in t:
        # agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
        episode_reward = []
        step = 0
        # train
        while not done:
            action = agent.act()
            observation, reward, done, info = env.step(action)
            observation = deepcopy(observation)
            # reward = observation[0]
            if done:
                # bonus reward when the episode ends before the 200-step limit
                if step < 199:
                    reward = 100
            agent.observe(observation, reward, done)
            episode_reward.append(reward)
            if done:
                # end-of-episode book-keeping and progress-bar update
                episode_reward_average = (0.01 * np.mean(episode_reward)
                                          + 0.99 * episode_reward_average)
                reward_history.append(np.mean(episode_reward))
                t.set_description('Episode {}, steps:{}, reward:{} '.format(
                    episode, step, np.mean(episode_reward)))
                t.set_postfix(episode_reward=episode_reward_average)
                step_history.append(step)
                break
            step += 1
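# The agent driving the loop above follows an observe/act interface: observe()
# is first called with the initial observation alone, then with
# (observation, reward, done) after every step, and act() takes no arguments,
# acting on the most recently observed state. The concrete class is not shown
# here; this stub only documents the call signatures the loop assumes.
class ObserveActAgent:
    def observe(self, observation, reward=None, done=None):
        """Record the latest observation (and transition, when reward/done are given)."""

    def act(self):
        """Return an action for the most recently observed state."""
        raise NotImplementedError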
import gym
import numpy as np

from agents.dqn_agent import DQNAgent

env = gym.make("LunarLander-v2")
env.seed(0)
agent = DQNAgent(env.action_space.n, env.observation_space.shape[0])
episodes = 400
steps = 3000
loss = []  # per-episode returns
for i_episode in range(episodes):
    # LunarLander observations are 8-dimensional; keep a (1, 8) batch shape
    obv = np.reshape(env.reset(), (1, 8))
    total_reward = 0
    done = False
    for t in range(steps):
        # env.render()
        # print(observation)
        action = agent.act(obv, total_reward, done)
        next_obv, reward, done, info = env.step(action)
        next_obv = np.reshape(next_obv, (1, 8))
        total_reward += reward
        agent.store_transition(obv, action, reward, next_obv, done)
        obv = next_obv
        agent.replay()
        if done:
            print("{}/{}, reward: {} in {} timesteps".format(
                i_episode, episodes, total_reward, t + 1))
            break
    loss.append(total_reward)
    # Average score of the last 100 episodes
    if len(loss) >= 100:
        is_solved = np.mean(loss[-100:])
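# Plotting the learning curve stored in `loss` is not part of the original
# snippet; this is just an illustrative sketch in the same style as the
# plotting code in main() above.
import matplotlib.pyplot as plt

plt.plot(np.arange(len(loss)), loss)
plt.ylabel('Total reward')
plt.xlabel('Episode #')
plt.show()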
def test():
    # set hyperparameters (not really important for running the agent)
    # higher eps. decay rate
    buffer_size = int(1e5)
    batch_size = 64
    gamma = 0.99
    tau = 1e-3
    learning_rate = 5e-4
    eps_start = 1.0
    eps_end = 0.01
    eps_decay = 0.999
    fc1_units = 64
    fc2_units = 64
    q_function_update_fraction = 4
    seed = 0

    ############ THE ENVIRONMENT ###############
    env = UnityEnvironment(file_name='Banana_Linux/Banana.x86_64', seed=seed)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # get the number of agents
    num_agents = len(env_info.agents)
    # get the size of the action space
    action_size = brain.vector_action_space_size
    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]

    # initialize the agent and load the trained weights
    dqn_agent = DQNAgent(name=None,
                         state_size=state_size,
                         action_size=action_size,
                         learning_rate=learning_rate,
                         discount_rate=gamma,
                         eps_start=eps_start,
                         eps_end=eps_end,
                         eps_decay=eps_decay,
                         tau=tau,
                         network_architecture=[fc1_units, fc2_units],
                         experience_replay_buffer_size=buffer_size,
                         experience_replay_buffer_batch_size=batch_size,
                         experience_replay_start_size=3200,
                         q_function_update_fraction=q_function_update_fraction,
                         device='gpu',
                         seed=seed)
    dqn_agent.load_state_dict(torch.load('checkpoint.pth'))

    # run one evaluation episode (train_mode=False) for up to 200 steps
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    for i in range(200):
        actions = dqn_agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        states = next_states
        if np.any(dones):
            break