Example #1
def Q(init, M, f):
    # solve: run Q-learning on the (possibly abstracted) MDP M, starting from
    # the Q values of the initial policy.
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    Q_init = utils.bellman_operator(M.P, M.r, V_init, M.discount)
    Q_star = utils.solve(ss.q_learning(M, 0.01), Q_init)[-1]
    # lift: map the optimal abstract state values back to the ground state space via f.T
    return np.dot(f.T, np.max(Q_star, axis=1, keepdims=True))
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0) # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)

    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :],axis=-1) - np.max(Q_star[None, :, :],axis=-1))  # |S| x |S|. preserves optimal action's value

    ### init (random) policy abstraction: group states whose values under the initial policy are close
    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]

    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(
        similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same so we probs wont see any difference...')
    print('\nAbstraction:', optimal_f.shape)

    truth = abs.PI(init, mdp, np.eye(mdp.S))
    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
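A minimal way to exercise this routine (a sketch, not part of the original source): it assumes the same utils module as the surrounding examples, builds a small random MDP as Example #10 does, and passes a handful of random policies as pis, which is only consumed by the commented-out all-policy branch.

mdp = utils.build_random_mdp(32, 2, 0.5)  # hypothetical sizes: 32 states, 2 actions
pis = [np.array(utils.random_policy(32, 2)) for _ in range(4)]

for name, solver, err in onoffpolicy_abstraction(mdp, pis):
    print('{:<24} {:<8} err={:.4f}'.format(name, solver, err))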
Example #3
# left
P[:, :, 3] = np.array([
    [1, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 0, 0, 0],
])

# rewards. 6 x 4
r = np.array([
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [1, 0, 0, 0],  # rewarded for going up at the finish.
    [1, 0, 0, 0],
])

# initial distribution
d0 = np.array([[0.5, 0.5, 0, 0, 0, 0]])

pi = np.array(utils.random_policy(6, 4))
pi[[0, 2, 4]] = pi[[1, 3, 5]]  # make paired states (0,1), (2,3), (4,5) share the same action distribution
V = utils.value_functional(P, r, pi, 0.5)
Q_t = utils.bellman_operator(P, r, V, 0.5)
# print(np.sum(P, axis=-1))
print(Q_t)
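Note that the all-zero rows in the slice above are expected under the column-stochastic indexing P[s', s, a] that the slice suggests (every current-state column sums to one). A quick sanity check of that assumption:

assert np.allclose(np.sum(P[:, :, 3], axis=0), 1)  # each current-state column of the 'left' slice sums to 1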
Example #4
def T(V):  # greedy backup: map the current value estimate (|S| x 1) to max-over-actions values
    assert V.shape[1] == 1
    return np.max(utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount), axis=1, keepdims=True)
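This is the value-iteration style backup. A minimal fixed-point sketch using it, assuming mdp is in scope as in the other fragments (the loop below is illustrative, not the repository's utils.solve):

V = np.random.random((mdp.S, 1))  # arbitrary |S| x 1 starting point
for _ in range(1000):
    V_next = T(V)
    if np.max(np.abs(V_next - V)) < 1e-8:  # stop once V is (numerically) a fixed point of T
        break
    V = V_next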
Example #5
def T(V):  # full Bellman backup: map the current value estimate (|S| x 1) to Q values (|S| x |A|)
    assert V.shape[1] == 1
    return utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
Example #6
def q_learning(mdp, lr):
    pi = lambda Q: utils.onehot(np.argmax(Q, axis=1), Q.shape[1])  # greedy (implicit in the max below)
    T = lambda Q: utils.bellman_operator(mdp.P, mdp.r, np.max(Q, axis=1, keepdims=True), mdp.discount)  # keep V as |S| x 1
    U = lambda Q: Q + lr * (T(Q) - Q)  # damped step towards the optimality backup
    return jit(U)
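Example #1 shows how such update functions are consumed; a sketch of the same pattern, assuming utils.solve iterates an update function and returns the list of iterates as it does there (the same usage applies to sarsa below):

mdp = utils.build_random_mdp(512, 2, 0.5)  # as in Example #10
init = np.random.random((mdp.S, mdp.A))
init = init / np.sum(init, axis=1, keepdims=True)
Q_init = utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, init, mdp.discount), mdp.discount)
Q_star = utils.solve(q_learning(mdp, 0.01), Q_init)[-1]  # cf. ss.q_learning in Example #1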
Example #7
def sarsa(mdp, lr):
    pi = lambda Q: utils.onehot(np.argmax(Q, axis=1), Q.shape[1])  # greedy
    T = lambda Q: utils.bellman_operator(mdp.P, mdp.r, np.einsum('jk,jk->j', Q, pi(Q))[:, None], mdp.discount)  # on-policy backup; keep V as |S| x 1
    U = lambda Q: Q + lr * (T(Q) - Q)
    return jit(U)
Example #8
def update_fn(cores):
    # Policy-gradient style update on the parameters ('cores') of the policy logits.
    V = utils.value_functional(mdp.P, mdp.r, utils.softmax(build(cores), axis=1), mdp.discount)
    Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
    A = Q - V  # advantages, |S| x |A|
    grads = [np.einsum('ijkl,ij->kl', d, A) for d in dlogpi_dw(cores)]
    # gradient ascent step with norm clipping plus a small entropy-gradient term
    return [c + lr * utils.clip_by_norm(g, 100) + 1e-6 * dH
            for c, g, dH in zip(cores, grads, dHdw(cores))]
Example #9
def update_fn(pi):
    # Policy iteration step: evaluate pi, then act greedily with respect to its Q values.
    V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
    Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
    return utils.onehot(np.argmax(Q, axis=1), mdp.A)  # greedy update
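A minimal loop driving this update to a fixed point (the surrounding examples do this via utils.solve with ss.policy_iteration; the sketch below only assumes mdp and update_fn are in scope):

pi = np.array(utils.random_policy(mdp.S, mdp.A))  # random starting policy, as in Example #3
for _ in range(1000):
    pi_next = update_fn(pi)
    if np.allclose(pi_next, pi):  # the greedy policy stopped changing
        break
    pi = pi_next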
Example #10
    # (tail of a solver that, like Q in Example #1, lifts V_star back to the ground state space)
    return np.dot(f.T, V_star)


if __name__ == '__main__':

    tol = 0.01

    n_states, n_actions = 512, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]

    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)

    similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]),
                            axis=-1)  # |S| x |S|
    optimal_idx, optimal_abstracted_mdp, optimal_f = build_state_abstraction(
        similar_states, mdp, tol)

    truth = PI(init, mdp, np.eye(mdp.S))
    approx = Q(init[optimal_idx], optimal_abstracted_mdp, optimal_f)

    print(
        '\n', 'bound >= max|V* - V|', '\n',
        '{} >= {}'.format(2 * tol / (1 - mdp.discount)**2,
                          np.max(np.abs(truth - approx))))