def demo(self, agent, env, EPISODES, state_size, batch_size, max_steps=500):
    """Run ``agent`` in ``env`` for ``EPISODES`` episodes without any learning.

    Args:
        agent: Object exposing ``act(state)``, which returns the action to take.
        env: Environment exposing ``reset()``, ``render()`` and ``step(action)``;
            ``step`` must return a 4-tuple ``(next_state, reward, done, info)``.
        EPISODES: Number of episodes to play.
        state_size: Number of entries in one observation; every observation is
            reshaped to ``[1, state_size]`` before it is handed to the agent.
        batch_size: Unused; kept so the signature mirrors ``train``.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        The literal string ``'done'`` once all episodes have been played.
    """
    for _ in range(EPISODES):
        state = env.reset()
        # NOTE(review): render() is only called once per episode, right after
        # the reset -- confirm whether per-step rendering was intended.
        env.render()
        state = np.reshape(state, [1, state_size])
        for _step in range(max_steps):
            action = agent.act(state)
            # The reward is not needed here: nothing is stored or learned.
            next_state, _reward, done, _ = env.step(action)
            state = np.reshape(next_state, [1, state_size])
            if done:
                break
    return 'done'
def train(self, agent, env, EPISODES, state_size, batch_size, max_steps=500):
    """Train ``agent`` on ``env`` with experience replay and return the agent.

    Args:
        agent: Object exposing ``act(state)``, ``remember(...)``,
            ``replay(batch_size)``, a sized ``memory`` and a float ``epsilon``.
        env: Environment exposing ``reset()`` and ``step(action)``; ``step``
            must return a 4-tuple ``(next_state, reward, done, info)``.
        EPISODES: Number of training episodes.
        state_size: Number of entries in one observation; every observation is
            reshaped to ``[1, state_size]`` before use.
        batch_size: Replay batch size; replay only starts once the memory holds
            more than this many transitions.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        The same ``agent`` object that was passed in.
    """
    for e in range(EPISODES):
        state = np.reshape(env.reset(), [1, state_size])
        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            if done:
                # The terminating transition is stored with a fixed penalty.
                reward = -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                # The step index at termination is reported as the score.
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, step, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
    return agent
def demo_q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """Run tabular Q-learning on ``env`` and return the learned action values.

    Actions are sampled from the policy built by
    ``make_epsilon_greedy_policy(Q, epsilon, n_actions)``; each update
    bootstraps from the greedy action in the next state.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` and
            ``step(action)``; ``step`` must return a 4-tuple
            ``(next_state, reward, done, info)``. States are used as dict
            keys, so they must be hashable.
        num_episodes: Number of episodes to run.
        discount_factor: Weight applied to the next state's best value.
        alpha: Step size of the TD update.
        epsilon: Exploration parameter forwarded to the policy factory.

    Returns:
        A ``defaultdict`` mapping state -> array of action values.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, n_actions)

    for i_episode in range(num_episodes):
        # Progress report every 100 episodes, useful for debugging.
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes))
            sys.stdout.flush()

        state = env.reset()
        while True:
            # Sample an action from the current policy's distribution.
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            # TD update towards reward + discounted greedy value of next_state.
            best_next_action = np.argmax(Q[next_state])
            # NOTE: this prints the greedy action for next_state, which is not
            # necessarily the action that was just taken.
            print("Decided Action: ", best_next_action)
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha * td_delta

            if done:
                break
            state = next_state

    return Q
def ddpg(env, agent, brain_name, action_size, n_episodes=2000, max_t=1000, n_agent=20):
    """Run the multi-agent training loop and return the per-episode scores.

    Each episode resets the environment, steps all ``n_agent`` agents in
    lockstep, and forwards every agent's transition to ``agent.step``. The
    episode score is the mean of the agents' accumulated rewards. Training
    stops early, after ``agent.save_model()``, once the mean of the most
    recent (up to 100) episode scores reaches 30.

    Params
    ======
        env: environment whose ``reset(train_mode=True)`` and ``step(actions)``
            return a mapping indexed by ``brain_name``; the selected entry
            must expose ``vector_observations``, ``rewards`` and ``local_done``
        agent: object exposing ``noise_reset()``, ``act(states, step)``,
            ``step(state, action, reward, next_state, done, i_agent)`` and
            ``save_model()``
        brain_name: key used to pick the entry from ``env.reset`` / ``env.step``
        action_size: unused; kept for interface compatibility
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        n_agent (int): number of parallel agents in the environment

    Returns
    =======
        list with one mean score per completed episode
    """
    scores = []  # mean score of every episode
    scores_window = deque(maxlen=100)  # last 100 scores
    # Start at -inf so the best score is tracked even if every score is negative.
    best_score = float("-inf")

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.noise_reset()
        agent_scores = [0] * n_agent

        for step in range(max_t):
            actions = agent.act(states, step)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            for i_agent in range(n_agent):
                agent_scores[i_agent] += rewards[i_agent]
                agent.step(states[i_agent], actions[i_agent], rewards[i_agent],
                           next_states[i_agent], dones[i_agent], i_agent)

            states = next_states
            # The episode ends for everyone as soon as any agent is done.
            if any(dones):
                break

        score = np.mean(agent_scores)
        scores_window.append(score)
        scores.append(score)
        best_score = max(best_score, score)
        avg_score = np.mean(scores_window)

        print(
            '\rEpisode {}\t Episode score: {:.2f}\t Average Score: {:.2f}\t Best Score: {:.2f}'
            .format(i_episode, score, avg_score, best_score),
            end="")
        if i_episode % 100 == 0:
            print(
                '\rEpisode {}\t Current score: {:.2f}\t Average Score: {:.2f}'.
                format(i_episode, score, avg_score))
        if avg_score >= 30:
            # The window may not be full yet; clamp so the reported episode
            # count never goes negative.
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(max(i_episode - 100, 0), avg_score))
            agent.save_model()
            break

    env.close()
    return scores
# Reset environment and get initial state current_state = env.reset() # Reset flag and start iterating until episode ends done = False while not done: # This part stays mostly the same, the change is to query a model for Q values if np.random.random() > EPSILON: # Get action from Q table action = np.argmax(agent.get_qs(current_state)) else: # Get random action action = np.random.randint(0, env.action_space.n) new_state, reward, done = env.step(action)[:3] reward = env.compute_reward(new_state) # Transform new continous state to new discrete state and count reward episode_reward += reward if RENDER and episode % RENDER_EVERY == 0: env.render() # Every step we update replay memory and train main network agent.update_replay_memory( (current_state, action, reward, new_state, done)) agent.train(done, step) current_state = new_state step += 1