Example #1
        terminal = False

        state = game.initialise_state()
        action = epsilon_greedy(state)

        E_matrix = np.zeros_like(theta)

        while not terminal:
            # take action a, observe r, s'
            next_state, reward = game.step(state, action)
            # choose a' from s' using policy from Q

            terminal = next_state.terminal

            if not terminal:
                next_action = epsilon_greedy(next_state)
                delta = reward + Q(next_state, next_action) - Q(state, action)
            else:
                delta = reward - Q(state, action)

            # decay the eligibility trace and add the active features for (s, a)
            E_matrix = lmd * E_matrix + psi(state, action)

            # gradient step on the TD error, weighted by the eligibility trace
            theta += alpha * delta * E_matrix

            if not terminal:
                state = next_state
                action = next_action

    game.visualise(V(generate_Q()))
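
The snippet in Example #1 looks like SARSA(λ) with a linear function approximator, but the helpers it calls (psi, Q, epsilon_greedy) are not shown. A minimal sketch of what they might look like, assuming a feature vector psi(s, a) with the same shape as theta and a fixed exploration rate (the names, sizes and constants below are assumptions, not the original definitions):

import numpy as np

N_FEATURES = 36              # assumed feature-vector size
ACTIONS = (0, 1)             # assumed discrete action set
epsilon = 0.05               # assumed fixed exploration rate
theta = np.zeros(N_FEATURES)

def psi(state, action):
    """Feature vector for (state, action), same shape as theta.

    The actual feature construction depends on the game, so this is
    left as an all-zeros placeholder.
    """
    return np.zeros(N_FEATURES)

def Q(state, action):
    """Linear action-value estimate: theta . psi(s, a)."""
    return np.dot(theta, psi(state, action))

def epsilon_greedy(state):
    """Random action with probability epsilon, otherwise the greedy one."""
    if np.random.rand() < epsilon:
        return np.random.choice(ACTIONS)
    return max(ACTIONS, key=lambda a: Q(state, a))

With definitions like these, Q and the eligibility trace live in the same feature space, which is why the snippet can update theta with alpha * delta * E_matrix.
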
Example #2
        terminal = False
        E_matrix = np.zeros_like(Q_matrix)

        state = game.initialise_state()
        action = epsilon_greedy(allQ(state), allN(state))

        while not terminal:
            next_state, reward = game.step(state, action)

            terminal = next_state.terminal

            if not terminal:
                next_action = epsilon_greedy(allQ(next_state), allN(next_state))
                delta = reward + Q(next_state, next_action) - Q(state, action)
            else:
                delta = reward - Q(state, action)

            # accumulating traces: bump the trace and the visit count for (s, a)
            allE(state)[int(action)] += 1
            allN(state)[int(action)] += 1

            # step size decays with the number of visits to (s, a)
            alpha = 1 / N(state, action)

            # update every entry in proportion to its eligibility, then decay the traces
            Q_matrix += alpha * delta * E_matrix
            E_matrix *= lmd

            if not terminal:
                state = next_state
                action = next_action

    game.visualise(V(Q_matrix))
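
Example #2 reads like the tabular version of the same SARSA(λ) update: the action values, visit counts and traces are tables, and allQ, allN, allE return the row for a given state. A sketch of those lookups under that assumption (the table shapes, the N0 constant and the hypothetical state.index attribute are guesses, not the original code):

import numpy as np

N_STATES, N_ACTIONS = 200, 2         # assumed table shapes
N0 = 100.0                           # assumed exploration constant

Q_matrix = np.zeros((N_STATES, N_ACTIONS))
N_matrix = np.zeros((N_STATES, N_ACTIONS))
E_matrix = np.zeros((N_STATES, N_ACTIONS))

def allQ(state):
    """Action values for this state (a mutable row view into Q_matrix)."""
    return Q_matrix[state.index]

def allN(state):
    """Visit counts for this state (a mutable row view into N_matrix)."""
    return N_matrix[state.index]

def allE(state):
    """Eligibility traces for this state (a mutable row view into E_matrix)."""
    return E_matrix[state.index]

def Q(state, action):
    return allQ(state)[int(action)]

def N(state, action):
    return allN(state)[int(action)]

def epsilon_greedy(q_row, n_row):
    """Exploration decays with visits to the state: eps = N0 / (N0 + N(s))."""
    eps = N0 / (N0 + n_row.sum())
    if np.random.rand() < eps:
        return np.random.randint(len(q_row))
    return int(np.argmax(q_row))

Because allE and allN return row views rather than copies, the in-place += 1 updates in the snippet write straight back into the underlying tables.
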
Example #3
        while not terminal:
            state, reward = game.step(state, action)
            action = softmax_policy(state, theta)

            terminal = state.terminal

            if terminal:
                # history interleaves (state, action, reward), so these slices
                # recover the visited (s, a) pairs and the rewards
                state_action_pairs = zip(history[0::3], history[1::3])

                history.append(reward)
                history.append(state)

                # undiscounted return of the whole episode
                Gt = sum(history[2::3])

                for s, a in state_action_pairs:
                    increment_n(s, a)
                    alpha = 1 / N(s, a)
                    # policy-gradient step: score function scaled by (return - baseline)
                    advantage = Gt - Q(s, a, theta)
                    theta += alpha * score_function(s, a, theta) * advantage

            else:
                history.append(reward)
                history.append(state)
                history.append(action)

        if k % 10000 == 0:
            print("MSE: " +
                  str(round(np.sum((Q_star - generate_Q(theta))**2), 2)))

    game.visualise(V(generate_Q(theta)))
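
Example #3 looks like a Monte Carlo policy-gradient (REINFORCE-style) update with a softmax policy. A sketch of the softmax_policy and score_function it relies on, assuming a linear softmax over a feature vector psi(s, a) like the one sketched after Example #1, and taking theta . psi as the Q baseline (all of this is an assumption about helpers that are not shown):

import numpy as np

ACTIONS = (0, 1)                     # assumed discrete action set
N_FEATURES = 36                      # assumed feature-vector size

def psi(state, action):
    """Placeholder feature vector; plays the same role as in the Example #1 sketch."""
    return np.zeros(N_FEATURES)

def action_probabilities(state, theta):
    """Softmax over the linear preferences h(s, a) = theta . psi(s, a)."""
    h = np.array([np.dot(theta, psi(state, a)) for a in ACTIONS])
    e = np.exp(h - h.max())          # subtract the max for numerical stability
    return e / e.sum()

def softmax_policy(state, theta):
    """Sample an action from pi(. | s; theta)."""
    return np.random.choice(ACTIONS, p=action_probabilities(state, theta))

def score_function(state, action, theta):
    """Gradient of log pi(a | s): psi(s, a) - sum_b pi(b | s) psi(s, b)."""
    probs = action_probabilities(state, theta)
    expected = sum(p * psi(state, b) for p, b in zip(probs, ACTIONS))
    return psi(state, action) - expected

def Q(state, action, theta):
    """Baseline used in the update, taken here to be the linear preference."""
    return np.dot(theta, psi(state, action))

With this score function, theta += alpha * score_function(s, a, theta) * (Gt - Q(s, a, theta)) is the usual REINFORCE-with-baseline step for a softmax policy.
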