def play_and_train(env: "gym.Env", agent: "QLearningAgent", t_max: int = 10 ** 4) -> float:
    """Run one full episode (at most t_max steps), acting with `agent` and
    training it on every observed transition.

    Parameters
    ----------
    env : environment with the legacy gym step API
        (``step`` returns ``(state, reward, done, info)``).
    agent : object exposing ``get_action(state)`` and
        ``update(state, action, next_state, reward)``.
        NOTE(review): the update-argument order here is
        (state, action, next_state, reward) — confirm it matches the
        agent's actual signature.
    t_max : maximum number of environment steps in the episode.

    Returns
    -------
    float
        Total (undiscounted) reward accumulated over the episode.
    """
    total_reward = 0.0
    state = env.reset()

    for _ in range(t_max):
        action = agent.get_action(state)
        new_state, reward, done, _ = env.step(action)
        total_reward += reward

        # Train on the transition before advancing the state.
        agent.update(state, action, new_state, reward)

        # Advance once; the original assigned `state = new_state` a second
        # time after the break check, which was redundant and is removed.
        state = new_state
        if done:
            break

    return total_reward
# --- Taxi: set up and train a tabular Q-learning agent --------------------
# NOTE(review): relies on `env`, `n_states`, `QLearningAgent`, `plt`,
# `max_iterations` and `play_and_train` defined elsewhere in the file.

n_actions = env.action_space.n
print('States number = %i, Actions number = %i' % (n_states, n_actions))

# Q-learning hyper-parameters (values from the assignment comments).
alpha = 0.5
get_legal_actions = lambda s: range(n_actions)  # every action legal in every state
epsilon = 0.1
discount = 0.99

agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)

plt.figure(figsize=[10, 4])
rewards = []

# Training loop: one episode per iteration, recording each episode's return.
for i in range(max_iterations):
    # play_and_train() resets the environment itself, so the extra unused
    # `initial_state = env.reset()` the original performed here was
    # redundant and has been removed.
    rewards.append(play_and_train(env, agent))
    # TODO: decay agent.epsilon here to shift from exploration to exploitation.
if done: break return total_reward if __name__ == '__main__': env = gym.make("CartPole-v0").env env.reset() n_actions = env.action_space.n print(env.observation_space.high) print(env.observation_space.low) print('CartPole state: %s' % (env.reset())) agent = QLearningAgent(0.3, 0.5, 1.0, lambda s: range(n_actions)) # (x, x', theta, theta') state_bins = [ # Cart position. discretize_range(-2.4, 2.4, 2), # Cart velocity. discretize_range(-3.0, 3.0, 2), # Pole angle. discretize_range(-0.5, 0.5, 7), # Tip velocity. discretize_range(-2.0, 2.0, 7) ] max_bins = max(len(bin) for bin in state_bins) rewards = [] for i in range(2000):
# --- Taxi-v3: tabular Q-learning training script --------------------------
env = gym.make('Taxi-v3')
env.reset()
env.render()

n_states = env.observation_space.n
n_actions = env.action_space.n
print('States number = %i, Actions number = %i' % (n_states, n_actions))

# Hyper-parameters for the Q-learning agent; all actions are legal in
# every state, so the legal-action callback simply enumerates them.
alpha = 0.5
epsilon = 0.1
discount = 0.99
agent = QLearningAgent(alpha, epsilon, discount, lambda s: range(n_actions))

plt.figure(figsize=[10, 4])
rewards = []

# Training loop: run one episode per iteration and log its total reward.
for i in range(max_iterations):
    rewards.append(play_and_train(env, agent))
    # TODO: decay agent.epsilon here to reduce exploration over time.
break return total_reward if __name__ == '__main__': env = gym.make("CartPole-v0").env env.reset() n_actions = env.action_space.n print(env.observation_space.high) print(env.observation_space.low) print('CartPole state: %s' % (env.reset())) agent = QLearningAgent(alpha=0.3, epsilon=0.5, discount=1.0, get_legal_actions=lambda s: range(n_actions)) # (x, x', theta, theta') state_bins = [ # Cart position. discretize_range(-2.4, 2.4, 2), # Cart velocity. discretize_range(-3.0, 3.0, 2), # Pole angle. discretize_range(-0.5, 0.5, 7), # Tip velocity. discretize_range(-2.0, 2.0, 7) ] max_bins = max(len(bin) for bin in state_bins) rewards = []
# --- CliffWalking: compare Q-learning against SARSA -----------------------
env = CliffWalkingEnv()
env.reset()
env.render()

n_states = env.nS
n_actions = env.nA
print('States number = %i, Actions number = %i' % (n_states, n_actions))

# Shared hyper-parameters for both agents.
alpha = 0.5
get_legal_actions = lambda s: range(n_actions)  # all actions legal everywhere
epsilon = 0.2
discount = 0.99

# BUG FIX: the original passed the undefined name `getActionRange` to both
# constructors even though `get_legal_actions` was defined just above,
# which raised a NameError at runtime.
agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)
agent2 = SarsaAgent(alpha, epsilon, discount, get_legal_actions)

plt.figure(figsize=[10, 4])
rewards1 = []
rewards2 = []

# Training loop: one episode per agent per iteration.
for i in range(max_iterations):
    rewards1.append(play_and_train(env, agent))
    rewards2.append(play_and_train(env, agent2))

    # Every 100 episodes, decay exploration (floored so it never hits zero).
    if (i + 1) % 100 == 0:
        agent.epsilon = max(agent.epsilon * 0.99, 0.00001)
        agent2.epsilon = max(agent2.epsilon * 0.99, 0.00001)
        # agent.alpha = max(agent.alpha * 0.99, 0.00001)
env = gym.make('Taxi-v2').env env.reset() env.render() n_states = env.observation_space.n n_actions = env.action_space.n print('States number = %i, Actions number = %i' % (n_states, n_actions)) # create q learning agent with # alpha=0.5 # get_legal_actions = lambda s: range(n_actions) # epsilon=0.1 # discount=0.99 agent = QLearningAgent(alpha=0.5, get_legal_actions=lambda s: range(n_actions), discount=0.99, epsilon=0.1) plt.figure(figsize=[10, 4]) rewards = [] # Training loop for i in range(max_iterations): # Play & train game # Update rewards # rewards rewards.append(play_and_train(env, agent)) # Decay agent epsilon if i % 100 == 0:
# --- Environment introspection and Q-learning training loop ---------------
# NOTE(review): `env`, `QLearningAgent`, `plt`, `max_iterations` and
# `play_and_train` come from earlier in the file.
env.reset()
env.render()

n_states = env.observation_space.n
n_actions = env.action_space.n
print('States number = %i, Actions number = %i' % (n_states, n_actions))

# Build the agent: alpha=0.5, epsilon=0.1, discount=0.99, with every
# action legal in every state.
epsilon = 0.1
agent = QLearningAgent(0.5, epsilon, 0.99, lambda s: range(n_actions))

plt.figure(figsize=[10, 4])
rewards = []

# Play one episode per iteration, logging each episode's total reward.
for i in range(max_iterations):
    rewards.append(play_and_train(env, agent))
    # TODO: decay agent.epsilon here to reduce exploration over time.