Example #1
            # RL take action and get next state and reward
            _, next_state_index, reward, done = env.step(action)

            # RL choose action based on next state
            next_action = RL.choose_action(str(next_state_index))

            # RL learn from this transition (s, a, r, s', a') ==> Sarsa
            RL.learn(str(state), action, reward, str(next_state_index), next_action)

            # swap state and action
            state = next_state_index
            action = next_action

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = GridWorld()
    RL = Sarsa(actions=list(range(env.n_actions)))

    env.after(10000, update)
    env.mainloop()
    print(RL.q_table)
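
Example #1 calls RL.choose_action and RL.learn but does not show the agent itself. Below is a minimal sketch of what such a Sarsa class could look like, assuming an epsilon-greedy policy, a pandas Q-table indexed by the stringified state, and the standard on-policy update Q(s,a) <- Q(s,a) + alpha*[r + gamma*Q(s',a') - Q(s,a)]. The hyperparameter values and the table layout are assumptions, not taken from the example.

# Hypothetical Sarsa agent matching the calls made in Example #1;
# hyperparameters and the pandas Q-table layout are assumptions.
import numpy as np
import pandas as pd


class Sarsa:
    def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions          # list of action indices
        self.lr = learning_rate         # step size alpha
        self.gamma = reward_decay       # discount factor
        self.epsilon = e_greedy         # probability of exploiting
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def check_state_exist(self, state):
        # Add an all-zero row the first time a state is seen.
        if state not in self.q_table.index:
            self.q_table.loc[state] = [0.0] * len(self.actions)

    def choose_action(self, state):
        self.check_state_exist(state)
        if np.random.uniform() < self.epsilon:
            # Exploit: pick one of the highest-valued actions (ties broken randomly).
            state_actions = self.q_table.loc[state]
            best = state_actions[state_actions == state_actions.max()].index
            return np.random.choice(best)
        # Explore: pick a random action.
        return np.random.choice(self.actions)

    def learn(self, s, a, r, s_, a_):
        self.check_state_exist(s)
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        # On-policy target uses the action actually chosen in the next state (Sarsa).
        q_target = r + self.gamma * self.q_table.loc[s_, a_]
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)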
Example #2
                    _, next_state_index, reward, done = env.step(action)
                    env.render()
                    if next_state_index != 'terminal':
                        next_i, next_j = next_state_index
                    else:
                        next_i, next_j = 0, 0
                    # Record the action value q(s,a) = r + γ*v(s')
                    values.append(reward +
                                  reward_decay * value[next_i, next_j])
                # Bellman optimality equation: take the maximum action value and update v(s)
                new_value[i, j] = np.max(values)
        # Termination condition: stop when the total absolute change is below 1e-4
        if np.sum(np.abs(new_value - value)) < 1e-4:
            draw_image(np.round(new_value, decimals=2))
            plt.title('$v_{*}$')
            plt.show()
            plt.close()
            break
        value = new_value
        print(value)
    # end of game
    print('game over')
    env.destroy()


env = GridWorld(grid_world_h, grid_world_w)
value = np.zeros((grid_world_h, grid_world_w))

env.after(10000, dp)
env.mainloop()
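
Example #2 shows only the inner part of a value-iteration sweep; the loops over grid cells and actions, and the initialisation of new_value, are not shown. The sketch below is a self-contained NumPy version (no GUI) of the same Bellman optimality backup v(s) <- max_a [r + γ*v(s')]. The 4x4 grid, the reward of -1 per step, and the single terminal corner cell are assumptions chosen for illustration, not taken from the example.

# Self-contained value-iteration sketch (no GUI); grid size, rewards,
# and the terminal cell are assumptions chosen for illustration.
import numpy as np

grid_world_h, grid_world_w = 4, 4                # grid size (assumed)
reward_decay = 0.9                               # discount factor γ (assumed)
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]     # up, down, left, right
terminal = (grid_world_h - 1, grid_world_w - 1)  # terminal cell (assumed)


def step(i, j, di, dj):
    # Deterministic move: stay in place when the move would leave the grid.
    ni, nj = i + di, j + dj
    if not (0 <= ni < grid_world_h and 0 <= nj < grid_world_w):
        ni, nj = i, j
    # Reward of -1 per step, 0 once the terminal cell is reached (assumed).
    reward = 0.0 if (ni, nj) == terminal else -1.0
    return ni, nj, reward


value = np.zeros((grid_world_h, grid_world_w))
while True:
    new_value = np.zeros_like(value)
    for i in range(grid_world_h):
        for j in range(grid_world_w):
            if (i, j) == terminal:
                continue                         # terminal state keeps value 0
            values = []
            for di, dj in actions:
                ni, nj, reward = step(i, j, di, dj)
                # Action value q(s,a) = r + γ*v(s')
                values.append(reward + reward_decay * value[ni, nj])
            # Bellman optimality backup: v(s) = max over actions of q(s,a)
            new_value[i, j] = np.max(values)
    # Stop when one full sweep changes the values by less than 1e-4 in total.
    if np.sum(np.abs(new_value - value)) < 1e-4:
        value = new_value
        break
    value = new_value

print(np.round(value, decimals=2))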