Esempio n. 1
0
def sarsa_lambda(w, alpha, lambd, nsimus, gamma):
    """Tabular SARSA(lambda) with accumulating eligibility traces.

    Runs ``nsimus`` episodes on world ``w`` using an epsilon-greedy
    behavior policy with epsilon = 1/k on episode k (exploration decays
    over episodes), then returns the deterministic greedy policy
    extracted from the learned action-value table.

    Parameters
    ----------
    w      : world object exposing width, height, reset(), finished,
             take_action(a) and player.cell.pos  (project type)
    alpha  : learning-rate step size
    lambd  : trace-decay parameter (lambda)
    nsimus : number of simulated episodes
    gamma  : discount factor

    Returns
    -------
    A deterministic Policy built from the greedy Q-value table.
    """
    n = w.width * w.height
    Q = np.zeros((n, 4))          # action values: one row per state, 4 actions

    for k in range(1, nsimus + 1):
        w.reset()
        E = np.zeros((n, 4))      # eligibility traces, cleared each episode

        s = w.player.cell.pos
        a = Policy.e_greedy_action_from_qvs(s, 1 / k, Q)

        while not w.finished:
            reward = w.take_action(a)
            s2 = w.player.cell.pos
            a2 = Policy.e_greedy_action_from_qvs(s2, 1 / k, Q)

            # One-step TD error for the (s, a) -> (s2, a2) transition.
            err = reward + gamma * Q[s2][a2] - Q[s][a]
            E[s][a] += 1          # accumulating trace

            # Trace-weighted update over the whole table, vectorized with
            # NumPy — same math as the original per-cell double loop over
            # all (state, action) pairs, without the O(n*4) Python loop.
            Q += alpha * err * E
            E *= gamma * lambd

            s = s2
            a = a2

    return Policy.build_deterministic(Policy.qvs_to_table(Q))
Esempio n. 2
0
    def qvs_to_policy(self, qvs):
        """Extract a deterministic greedy policy from action values.

        For each state of the world, the selected action is the argmax
        of that state's row in ``qvs`` (ties resolve to the lowest
        index, per np.argmax).
        """
        num_states = self.world.width * self.world.height
        greedy = [np.argmax(qvs[state]) for state in range(num_states)]
        return Policy.build_deterministic(greedy)
Esempio n. 3
0
    def policy_iteration(self, policy=None):
        """Run policy iteration until the policy is stable.

        Starting from ``policy`` (or the all-action-0 policy when None),
        alternates 20 sweeps of iterative policy evaluation with greedy
        policy improvement until improvement leaves the policy
        unchanged.

        Parameters
        ----------
        policy : optional initial Policy; defaults to action 0 everywhere.

        Returns
        -------
        The converged deterministic policy.
        """
        w = self.world

        # `is None` rather than `== None`: equality may be overloaded on
        # Policy objects; identity is the correct (and idiomatic) None test.
        if policy is None:
            policy = Policy.build_deterministic([0] * w.width * w.height)

        while True:
            old_policy = policy
            vs = self.iterative_policy_evaluation(policy, 20)
            qvs = self.qvs_from_vs(vs)
            policy = self.qvs_to_policy(qvs)
            # Stop once greedy improvement no longer changes the policy.
            if policy == old_policy:
                break

        return policy
Esempio n. 4
0
def sarsa_offline(w, alpha, nsimus, gamma):
    """Off-policy TD control on world ``w``.

    Acts epsilon-greedily with epsilon = 1/t on episode t, but
    bootstraps on max_a Q(s', a) — the greedy value of the next state —
    which is the off-policy (Q-learning-style) backup rather than the
    on-policy SARSA one. Returns the deterministic greedy policy
    extracted from the learned Q table.

    Parameters
    ----------
    w      : world object (project type) with width, height, reset(),
             finished, take_action(a) and player.cell.pos
    alpha  : learning-rate step size
    nsimus : number of simulated episodes
    gamma  : discount factor
    """
    num_states = w.width * w.height
    Q = np.zeros((num_states, 4))

    for episode in range(1, nsimus + 1):
        w.reset()
        state = w.player.cell.pos
        epsilon = 1 / episode

        while not w.finished:
            action = Policy.e_greedy_action_from_qvs(state, epsilon, Q)
            reward = w.take_action(action)
            next_state = w.player.cell.pos

            # TD target uses the best next-state value (off-policy backup).
            target = reward + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (target - Q[state][action])

            state = next_state

    return Policy.build_deterministic(Policy.qvs_to_table(Q))
Esempio n. 5
0
def sarsa(w, alpha, nsimus, gamma):
    """On-policy SARSA control on world ``w``.

    On episode k the agent acts epsilon-greedily with epsilon = 1/k and
    updates Q(s, a) toward r + gamma * Q(s', a'), where a' is the action
    actually chosen in s' — the on-policy SARSA backup. Returns the
    deterministic greedy policy extracted from the learned Q table.

    Parameters
    ----------
    w      : world object (project type) with width, height, reset(),
             finished, take_action(a) and player.cell.pos
    alpha  : learning-rate step size
    nsimus : number of simulated episodes
    gamma  : discount factor
    """
    num_states = w.width * w.height
    Q = np.zeros((num_states, 4))

    for episode in range(1, nsimus + 1):
        w.reset()
        epsilon = 1 / episode
        state = w.player.cell.pos
        action = Policy.e_greedy_action_from_qvs(state, epsilon, Q)

        while not w.finished:
            reward = w.take_action(action)
            next_state = w.player.cell.pos
            next_action = Policy.e_greedy_action_from_qvs(next_state, epsilon, Q)

            # On-policy TD update toward the actually-selected next action.
            td_target = reward + gamma * Q[next_state][next_action]
            Q[state][action] += alpha * (td_target - Q[state][action])

            state = next_state
            action = next_action

    return Policy.build_deterministic(Policy.qvs_to_table(Q))