Example no. 1
import numpy as np
# `eg` is assumed to be the helper module exposing epsilon_greedy used below
# (a minimal sketch of such a helper is given after this listing).

def sarsa(env, n_iter, gamma, epsilon, eta):
    """Tabular SARSA: on-policy TD control on a deterministic environment."""
    S, A, T, R = env.S, env.A, env.T, env.R
    n_states, n_actions = len(S), len(A)
    Q = np.zeros((n_states, n_actions))
    for episode in range(n_iter):
        env.reset()
        s = env.s_start
        # Choose the first action epsilon-greedily from the current Q estimates.
        a = eg.epsilon_greedy(Q[s], n_actions, epsilon)
        while s not in env.S_end:
            s_next = int(T[s][a])
            a_next = eg.epsilon_greedy(Q[s_next], n_actions, epsilon)
            # SARSA update: move Q(s, a) towards r + gamma * Q(s', a').
            Q[s][a] = (1 - eta) * Q[s][a] + eta * (R[s][a] + gamma * Q[s_next][a_next])
            s = s_next
            a = a_next
    # Return the greedy policy induced by the learned Q table.
    return np.argmax(Q, axis=1)
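Both listings call an eg.epsilon_greedy helper that is not reproduced here. A minimal sketch of what such a helper might look like, assuming it receives the Q-values of the current state, the number of actions and the exploration rate epsilon, is the following (the name and signature come from the calls above; the body is an assumption):

import numpy as np

def epsilon_greedy(q_values, n_actions, epsilon):
    # With probability epsilon pick a random action (exploration),
    # otherwise pick the action with the highest estimated value (exploitation).
    if np.random.rand() < epsilon:
        return np.random.randint(n_actions)
    # np.argmax flattens its input, so this also works when q_values
    # has shape (1, n_actions), as in the network-based loop below.
    return int(np.argmax(q_values))

The loop that follows reuses the same helper, but feeds it the Q-values predicted by a network for the current observation instead of a row of the Q table.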
# Hyperparameters of the epsilon-greedy exploration schedule.
epsilon_max = 1.0
epsilon_min = 0.01
epsilon_decay = 0.99
stop_penalty = -100  # fixed target used when the episode terminates

# Requires TensorFlow 1.x (import tensorflow as tf); State, Action, Target,
# Q_values, training_op, n_iterations, state_size and n_actions are assumed
# to be defined with the network graph earlier in the text.
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    epsilon = epsilon_max
    for iteration in range(n_iterations):
        state = env.reset()
        done = False
        steps = 0
        while not done:
            steps += 1
            s_cur = state.reshape(1, state_size)
            # Predict the Q-values for the current state and act epsilon-greedily.
            Q_s_cur = Q_values.eval(feed_dict={State: s_cur})
            a_cur = eg.epsilon_greedy(Q_s_cur, n_actions, epsilon)
            # Decay epsilon towards epsilon_min as training progresses.
            if epsilon > epsilon_min:
                epsilon *= epsilon_decay
            state, reward, done, info = env.step(a_cur)
            s_next = state.reshape(1, state_size)
            if done:
                # Terminal step: train the network towards a fixed penalty target.
                target = stop_penalty
                sess.run(training_op,
                         feed_dict={
                             State: s_cur,
                             Action: a_cur,
                             Target: target
                         })
                print(steps)
                break
            else: