Example #1
def graph_PI():
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    pis = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(pis)))

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure(figsize=(16,16))
        nx.draw(G, pos, node_color=a, node_size=150)
        # plt.show()
        plt.savefig('figs/pi_graphs/{}.png'.format(i))
        plt.close()
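Most of the examples on this page lean on utils.value_functional for closed-form policy evaluation, which is not shown here. Below is a minimal, hypothetical sketch of what it presumably computes, assuming the transition-tensor convention P[s_next, s, a] implied by the einsum 'ijk,jk->ij' in Example #7; the real helper may differ.

import numpy as np

def value_functional_sketch(P, r, pi, discount):
    """Closed-form policy evaluation: V = (I - discount * P_pi^T)^{-1} r_pi.

    Assumed shapes: P is (|S|, |S|, |A|) with P[s_next, s, a],
    r is (|S|, |A|), pi is (|S|, |A|); returns a (|S|, 1) column vector.
    """
    P_pi = np.einsum('ijk,jk->ij', P, pi)          # (|S|, |S|), P_pi[s_next, s]
    r_pi = np.sum(r * pi, axis=1, keepdims=True)   # (|S|, 1) expected reward per state
    n = P_pi.shape[0]
    return np.linalg.solve(np.eye(n) - discount * P_pi.T, r_pi)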
Example #2
def generate_pi(mdp, c):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])[:,:,0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)
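A hypothetical driver for generate_pi, assuming the same utils module as above: overlay a few policy-iteration trajectories (each starting from a fresh random policy inside generate_pi) on the value plane of a single random 2-state MDP, one colour per run.

mdp = utils.build_random_mdp(2, 2, 0.9)   # 2 states, so values live in a plane
plt.figure(figsize=(8, 8))
for c in ['b', 'g', 'r']:
    generate_pi(mdp, c)
plt.legend()
plt.show()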
Example #3
def policy_iteration(mdp, pis):
    # pi_star = utils.solve(ss.policy_iteration(mdp), pis[0])[-1]

    lens, pi_stars = [], []

    for pi in pis:
        pi_traj = clip_solver_traj(utils.solve(ss.policy_iteration(mdp), pi))
        pi_star = pi_traj[-1]

        pi_stars.append(pi_star)
        lens.append(len(pi_traj))

    return lens, pi_stars
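clip_solver_traj is not defined anywhere on this page. A plausible sketch, assuming it trims a solver trajectory at the first repeated iterate so that len(pi_traj) measures iterations to convergence; the real helper may differ.

import numpy as np

def clip_solver_traj_sketch(traj, tol=1e-8):
    # Cut the trajectory once consecutive iterates stop changing.
    for i in range(1, len(traj)):
        if np.allclose(traj[i], traj[i - 1], atol=tol):
            return traj[:i + 1]
    return traj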
Example #4
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0) # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)

    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :],axis=-1) - np.max(Q_star[None, :, :],axis=-1))  # |S| x |S|. preserves optimal action's value

    ### init-policy abstraction: merge states whose values under the random init policy are close
    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]

    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(
        similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same so we probs wont see any difference...')
    print('\nAbstraction:', optimal_f.shape)

    truth = abs.PI(init, mdp, np.eye(mdp.S))
    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
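abs.build_state_abstraction is likewise not shown. Given how its outputs are used above (init[idx, :] selects representative states, and f.T lifts abstract values or policies back to the ground state space), one plausible sketch of the grouping step is below; the greedy merge rule is an assumption, and building the abstracted MDP's P and r is omitted.

import numpy as np

def build_state_abstraction_sketch(similar_states, n_states, tol):
    # Greedily merge each state into the first representative within tol of it.
    reps, assignment = [], np.zeros(n_states, dtype=int)
    for s in range(n_states):
        for k, rep in enumerate(reps):
            if similar_states[s, rep] < tol:
                assignment[s] = k
                break
        else:
            assignment[s] = len(reps)
            reps.append(s)
    # f is (|S_abs| x |S|): f[k, s] = 1 iff ground state s maps to abstract state k.
    f = np.zeros((len(reps), n_states))
    f[assignment, np.arange(n_states)] = 1.0
    return np.array(reps), f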
Example #5
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0],
                v_star[1, 0],
                c='m',
                alpha=0.5,
                marker='x',
                label='mdp')
    plt.scatter(v_u_star[0, 0],
                v_u_star[1, 0],
                c='g',
                alpha=0.5,
                marker='x',
                label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')
    plt.legend()
    plt.show()
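utils.gen_grid_policies(7) presumably sweeps the two free probabilities of a 2-state / 2-action policy over a grid; a minimal sketch under that assumption (the real helper may parameterise policies differently). Evaluating every policy in the grid with value_functional is what traces out the scattered value polytope above.

import numpy as np

def gen_grid_policies_sketch(n):
    # n x n grid over pi(a0|s0) and pi(a0|s1); each entry is a (2, 2) policy matrix.
    ps = np.linspace(0.0, 1.0, n)
    return [np.array([[p0, 1.0 - p0],
                      [p1, 1.0 - p1]])
            for p0 in ps for p1 in ps]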
Example #6
def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs
        # with p set to the random dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P,
                      np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate both policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star,
                                          mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r,
                                                 pi_u_star_random,
                                                 mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())  # 1e-3 is passed as the relative tolerance
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()
Example #7
def mdp_lmdp_optimality():
    n_states, n_actions = 2, 2

    n = 5
    plt.figure(figsize=(8, 16))
    plt.suptitle('Optimal control (LMDP) vs optimal policy (MDP)')  # figure-level title; plt.title here would be overwritten by the subplots
    for i in range(n):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)

        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        P_pi_star = np.einsum('ijk,jk->ij', mdp.P, pi_star)
        plt.subplot(n, 2, 2 * i + 1)
        plt.imshow(u)
        plt.subplot(n, 2, 2 * i + 2)
        plt.imshow(P_pi_star)
    plt.savefig('figs/lmdp_mdp_optimal_dynamics.png')
    plt.show()
Example #8
def PI(init, M, f):
    pi_star = utils.solve(ss.policy_iteration(M), np.log(init))[-1]
    return utils.value_functional(M.P, M.r, np.dot(f.T, pi_star), M.discount)
Example #9
def VI(V_init, M, f):
    # NOTE: this example is cut off above; the name and signature are inferred
    # from the solver list [abs.Q, abs.SARSA, abs.VI] in Example #4.
    V_star = utils.solve(ss.value_iteration(M, 0.01), V_init)[-1]

    # lift
    return np.dot(f.T, V_star)


if __name__ == '__main__':

    tol = 0.01

    n_states, n_actions = 512, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]

    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)

    similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]),
                            axis=-1)  # |S| x |S|
    optimal_idx, optimal_abstracted_mdp, optimal_f = build_state_abstraction(
        similar_states, mdp, tol)

    truth = PI(init, mdp, np.eye(mdp.S))
    approx = Q(init[optimal_idx], optimal_abstracted_mdp, optimal_f)

    print(np.max(np.abs(truth - approx)))  # max deviation of the abstracted solution from the ground truth