Example 1
def graph_PI():
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    pis = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(pis)))

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure(figsize=(16,16))
        nx.draw(G, pos, node_color=a, node_size=150)
        # plt.show()
        plt.savefig('figs/pi_graphs/{}.png'.format(i))
        plt.close()
Example 2
def graph_PG():
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]
    pis = pis[0:20]

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
Example 3
def generate_model_cs():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?
    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)

    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)
Example 4
def value_graph():

    # vs = [np.sum(utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()**2) for pi in det_pis]
    # plt.figure(figsize=(16,16))
    # nx.draw(G, pos, node_color=vs, node_size=150)
    # plt.savefig('figs/pi_graphs/val.png')
    # plt.close()

    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    # how does discount affect these!?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    vs = np.stack(values).reshape((n, n_states))
    W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
    A = graph.mdp_topology(det_pis)
    adj = A*W
    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)

    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()
Example 5
def generate_iteration_figures(mdp, pis, iteration_fn, name):
    """
    How many steps to converge to the optima from different starting points.
    """
    n = 3
    lrs = np.linspace(0.0001, 0.1, n**2)  # n**2 = 9 learning rates from 1e-4 to 1e-1
    plt.figure(figsize=(16, 16))
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    Vs = value(np.stack(pis))[:, :, 0]

    # pool = multiprocessing.Pool(n**2)
    # # couldn't serialise the mdp collection, so just unwrap them here.
    # lens_n_pi_stars = pool.map(iteration_fn, [(mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr) for lr in lrs])
    # for i, lr, results in zip(range(n**2), lrs, lens_n_pi_stars):
    #     len, pi_star = results

    for i, lr in enumerate(lrs):
        print('\n{}: {}\n'.format(i, lr))
        lens, pi_stars = iteration_fn(
            (mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr))

        plt.subplot(n, n, i + 1)
        plt.title('Learning rate: {}'.format(lr))
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iteration-lrs/0-{}.png'.format(name))
Example 6
def model_iteration(mdp, lr, pis):
    V_true = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    V_guess = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, mdp.discount), in_axes=(None, None, 0))


    def loss_fn(params):
        p_logits, r = parse_model_params(mdp.S, mdp.A, params)
        return np.sum((V_true(pis) - V_guess(utils.softmax(p_logits), r, pis))**2)

    dLdp = grad(loss_fn)

    @jit
    def update_fn(params):
        return params - lr*utils.clip_by_norm(dLdp(params), 100)

    return update_fn
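
A self-contained sketch of the same value-matching idea in plain JAX, assuming the closed-form policy evaluation V = (I - gamma * P_pi^T)^{-1} r_pi that utils.value_functional appears to compute; all names below are illustrative, not the repo's API.

# Value-matching model learning, minimal sketch (assumptions noted above).
import jax.numpy as jnp
from jax import grad, nn, random

S, A, gamma = 3, 2, 0.9

def value(P, r, pi):
    # Closed-form evaluation with P[s', s, a] = p(s'|s, a), r: (S, A), pi: (S, A).
    P_pi = jnp.einsum('ijk,jk->ij', P, pi)          # state-to-state transitions under pi
    r_pi = jnp.sum(pi * r, axis=1)                  # expected reward per state
    return jnp.linalg.solve(jnp.eye(S) - gamma * P_pi.T, r_pi)

k1, k2, k3, k4 = random.split(random.PRNGKey(0), 4)
P_true = nn.softmax(random.normal(k1, (S, S, A)), axis=0)
r_true = random.normal(k2, (S, A))
pis = nn.softmax(random.normal(k3, (8, S, A)), axis=-1)  # policies to match values on

def loss_fn(params):
    p_logits, r = params
    errs = [value(nn.softmax(p_logits, axis=0), r, pi) - value(P_true, r_true, pi)
            for pi in pis]
    return sum(jnp.sum(e**2) for e in errs)

params = (random.normal(k4, (S, S, A)), jnp.zeros((S, A)))
lr = 0.01
for _ in range(100):
    g = grad(loss_fn)(params)
    params = tuple(p - lr * gi for p, gi in zip(params, g))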
Example 7
def Q(init, M, f):

    # solve
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    Q_init = utils.bellman_operator(M.P, M.r, V_init, M.discount)
    Q_star = utils.solve(ss.q_learning(M, 0.01), Q_init)[-1]
    # lift
    return np.dot(f.T, np.max(Q_star, axis=1, keepdims=True))
Example 8
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0) # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)

    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :],axis=-1) - np.max(Q_star[None, :, :],axis=-1))  # |S| x |S|. preserves optimal action's value

    #
    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]

    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(
        similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same so we probs wont see any difference...')
    print('\nAbstraction:', optimal_f.shape)

    truth = abs.PI(init, mdp, np.eye(mdp.S))
    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
Example 9
def generate_vi(mdp, c, lr=0.1):
    init_pi = utils.random_policy(mdp.S,mdp.A)
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount)
    vs = np.stack(utils.solve(ss.value_iteration(mdp, lr), init_v))[:,:,0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
Example 10
def VI(init, M, f):

    # solve
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    V_star = utils.solve(ss.value_iteration(M, 0.01), V_init)[-1]

    # lift
    return np.dot(f.T, V_star)
Example 11
def generate_pg(mdp, c, lr=0.01):
    init_pi = utils.random_policy(mdp.S,mdp.A)
    init_logit = np.log(init_pi)
    logits = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr), init_logit)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount) for logit in logits])[:,:,0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
Example 12
def value_iteration(mdp, pis, lr):
    trajs = []

    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)

        traj = utils.solve(ss.value_iteration(mdp, lr), init_V)
        v_star = traj[-1]
        trajs.append(traj)
    return trajs
Example 13
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0],
                v_star[1, 0],
                c='m',
                alpha=0.5,
                marker='x',
                label='mdp')
    plt.scatter(v_u_star[0, 0],
                v_u_star[1, 0],
                c='g',
                alpha=0.5,
                marker='x',
                label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')
    plt.legend()
    plt.show()
Example 14
def generate_pi(mdp, c):
    init_pi = utils.random_policy(mdp.S,mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])[:,:,0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)
Example 15
def value_iteration(mdp, pis):
    lens, pi_stars = [], []

    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        pi_traj = utils.solve(ss.value_iteration(mdp, 0.01), init_V)
        pi_star = pi_traj[-1]

        pi_stars.append(pi_star)
        lens.append(len(pi_traj))

    return lens, pi_stars
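
For reference, a self-contained sketch of the damped Bellman-optimality update that ss.value_iteration presumably iterates, V <- V + lr * (TV - V). This is an assumption about the helper, not the repo's code.

import numpy as np

def value_iteration_step(P, r, gamma, lr):
    # P[s', s, a] = p(s'|s, a); r: (S, A). A guessed stand-in for ss.value_iteration.
    def update(V):
        Q = r + gamma * np.einsum('ijk,i->jk', P, V)  # Q[s, a]
        return V + lr * (np.max(Q, axis=1) - V)       # damped optimality backup
    return update

S, A = 3, 2
P = np.random.dirichlet(np.ones(S), size=(S, A)).transpose(2, 0, 1)  # P[s', s, a]
r = np.random.randn(S, A)
step = value_iteration_step(P, r, 0.9, 0.1)
V = np.zeros(S)
for _ in range(2000):   # utils.solve presumably loops like this until a fixed point
    V_new = step(V)
    if np.max(np.abs(V_new - V)) < 1e-8:
        break
    V = V_new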
Example 16
def generate_model_iteration():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    init = rnd.standard_normal(
        (mdp.S * mdp.S * mdp.A + mdp.S * mdp.A)
    )  # needs its own init. alternatively could find init that matches value of other inits?!?

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]

    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star,
                               mdp.discount).T for p_logits, r in params
    ])

    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='model iteration')
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)
    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')

    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)
Example 17
def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs
        # with p set to the random dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P,
                      np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate both policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star,
                                          mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r,
                                                 pi_u_star_random,
                                                 mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()
Example 18
def test_sparse_estimation():
    n_states = 5
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = utils.value_functional(mdp.P, mdp.r, det_pis[2],
                               mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    print(a)
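
graph.sparse_coeffs presumably solves a small sparse-coding problem: find coefficients a (one per deterministic policy) with basis @ a ~= v plus an L1 penalty. A guessed JAX sketch under that assumption; the repo's loss and signature may differ.

import jax.numpy as jnp
from jax import grad, jit

def sparse_coeffs(basis, v, lr=0.1, a_init=None, steps=1000, l1=1e-2):
    # basis: (n_states, n_policies) matrix of deterministic-policy value functions.
    # Guessed stand-in for graph.sparse_coeffs, not the repo's implementation.
    a = jnp.zeros(basis.shape[1]) if a_init is None else a_init
    loss = lambda a: jnp.sum((basis @ a - v)**2) + l1 * jnp.sum(jnp.abs(a))
    step = jit(lambda a: a - lr * grad(loss)(a))
    for _ in range(steps):
        a = step(a)
    return a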
Example 19
def thompson(mdp, lr):
    """
    Can we do TD with values from different MDPs?
    """
    V_true = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    # NOTE: pis, mse, mdp_sampler and symmetric_sampler are assumed to be
    # defined in the enclosing module.
    V_guess = vmap(lambda params, pi: utils.value_functional(
        *mdp_sampler(params), pi, mdp.discount),
                   in_axes=(None, 0))  # the lambda takes two args, so two in_axes entries
    dLdp = grad(lambda params: mse(V_true(pis), V_guess(params, pis)))

    @jit
    def update_fn(params, Q):
        m = symmetric_sampler(params)
        Q_ = utils.bellman_optimality_operator(m.P, m.r, Q, m.discount)
        params_tp1 = params - lr * dLdp(
            params
        )  # done based on observations... could use model iteration!?
        Q_tp1 = Q + lr * (Q_ - Q)
        return Q_tp1, params_tp1

    return update_fn
Example 20
def mom_value_iteration(mdp, pis):
    lens, pi_stars = [], []

    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        pi_traj = utils.solve(
            ss.momentum_bundler(ss.value_iteration(mdp, 0.01), 0.9),
            (init_V, np.zeros_like(init_V)))
        pi_star, _ = pi_traj[-1]

        pi_stars.append(pi_star)
        lens.append(len(pi_traj))

    return lens, pi_stars
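
ss.momentum_bundler apparently wraps an update function with heavy-ball momentum and iterates on (x, momentum) pairs, which is what the (init_V, np.zeros_like(init_V)) initialisation above suggests. A guessed sketch for the single-array case (the repo also applies it to lists of parameters, which would need a tree map):

def momentum_bundler(update_fn, decay):
    # Guessed stand-in for ss.momentum_bundler: heavy-ball momentum on x <- update_fn(x).
    def bundled(state):
        x, m = state
        dx = update_fn(x) - x                  # treat the update as a step direction
        m_new = decay * m + (1 - decay) * dx
        return x + m_new, m_new
    return bundled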
Example 21
def mom_param_value_iteration(mdp, pis):
    lens, pi_stars = [], []

    core_init = ss.random_parameterised_matrix(2, 2, 32, 4)

    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        core_init = ss.approximate(init_V, core_init)
        params = utils.solve(
            ss.momentum_bundler(
                ss.parameterised_value_iteration(mdp, 0.01 / len(core_init)),
                0.8), (core_init, [np.zeros_like(c) for c in core_init]))
        pi_star, _ = params[-1]

        pi_stars.append(pi_star)
        lens.append(len(params))

    return lens, pi_stars
Example 22
def test_everything():
    n_states = 5
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    # v = np.random.random((n_states, ))
    v = utils.value_functional(mdp.P, mdp.r, det_pis[2],
                               mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)
    nx.draw(G, pos, node_color=a)
    plt.show()
Example 23
def param_value_iteration(mdp, pis):
    # hypothesis: we are going to see some weirdness in the momentum partitions.
    # oscillations will depend on the shape of the polytope?!?
    lens, pi_stars = [], []

    core_init = ss.random_parameterised_matrix(2, 2, 32, 4)

    for pi in pis:
        init_V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
        core_init = ss.approximate(init_V, core_init)
        params = utils.solve(
            ss.parameterised_value_iteration(mdp, 0.01 / len(core_init)),
            core_init)
        pi_star = params[-1]

        pi_stars.append(pi_star)
        lens.append(len(params))

    return lens, pi_stars
Example 24
def policy_gradient_iteration_logits(mdp, lr):
    # this doesn't seem to behave nicely in larger state spaces!?
    # policy gradient identity: d/dlogits V = E_pi[Q . d/dlogits log pi]
    # dlogpi_dlogit = jacrev(lambda logits: np.log(utils.softmax(logits)+1e-8))
    dHdlogit = grad(lambda logits: utils.entropy(utils.softmax(logits)))
    dVdlogit = grad(lambda logits: np.sum(utils.value_functional(mdp.P, mdp.r, utils.softmax(logits), mdp.discount)))

    @jit
    def update_fn(logits):
        # NOTE this is actually soft A2C.
        # V = utils.value_functional(mdp.P, mdp.r, utils.softmax(logits), mdp.discount)
        # Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
        # A = Q-V
        # g = np.einsum('ijkl,ij->kl', dlogpi_dlogit(logits), A)

        g = dVdlogit(logits)

        return logits + lr * utils.clip_by_norm(g, 500) + 1e-8*dHdlogit(logits)
    return update_fn
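
The dVdlogit line differentiates straight through the closed-form evaluation V = (I - gamma * P_pi^T)^{-1} r_pi. A self-contained sketch of that trick on a random MDP, with illustrative names rather than the repo's API:

import jax.numpy as jnp
from jax import grad, nn, random

S, A, gamma, lr = 3, 2, 0.9, 0.1
k1, k2, k3 = random.split(random.PRNGKey(0), 3)
P = nn.softmax(random.normal(k1, (S, S, A)), axis=0)  # P[s', s, a]
r = random.normal(k2, (S, A))

def total_value(logits):
    # Differentiable closed-form evaluation of sum_s V(s) under pi = softmax(logits).
    pi = nn.softmax(logits, axis=1)
    P_pi = jnp.einsum('ijk,jk->ij', P, pi)
    r_pi = jnp.sum(pi * r, axis=1)
    return jnp.sum(jnp.linalg.solve(jnp.eye(S) - gamma * P_pi.T, r_pi))

logits = random.normal(k3, (S, A))
logits = logits + lr * grad(total_value)(logits)  # one gradient-ascent step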
Example 25
def value_graph_laplacian():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
    Vs = np.stack(values).reshape((n, n_states))
    A = graph.mdp_topology(det_pis)

    W = 1/(np.abs(np.sum(Vs[None, :, :] - Vs[:, None, :], axis=-1)) + 1e-8)
    adj = A*W

    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16,16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # how can you calculate expected eigenvalues!?
    # observation. the underlying complexity of the value topology is linear!?!?
    # how hard is it to estimate the main eigenvector from noisy observations!?
    # that would tell us the complexity!?!?
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        us = []
        for _ in range(50):
            vs = Vs + alpha*np.random.standard_normal(Vs.shape)
            W = 1/(np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
            adj = A*W

            u, v = graph_laplacian_spectra(adj)
            us.append(u)
        us = np.stack(us, axis=0)
        mean = np.mean(us, axis=0)
        var = np.var(us, axis=0)
        plt.bar(range(len(mean)), mean, yerr=np.sqrt(var))
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()
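
graph_laplacian_spectra presumably eigendecomposes the (unnormalised) graph Laplacian L = D - W of the weighted adjacency. A sketch under that assumption; the repo's version may normalise.

import numpy as np

def graph_laplacian_spectra(adj):
    # Assumed behaviour of the helper used above, not the repo's code.
    L = np.diag(adj.sum(axis=1)) - adj
    u, v = np.linalg.eigh(L)   # ascending eigenvalues, eigenvectors in columns
    return u, v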
Example 26
def plot():
    n_states = 2
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')

    plt.title('The value polytope')

    plt.show()
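
For the 2-state, 2-action case, utils.gen_grid_policies(n) presumably sweeps pi(a=0|s) over a grid for each state; a guessed stand-in:

import numpy as np

def gen_grid_policies(n):
    # Hypothetical implementation for the 2x2 case only; the repo's helper may generalise.
    ps = np.linspace(0, 1, n)
    return [np.array([[p1, 1 - p1], [p2, 1 - p2]])
            for p1 in ps for p2 in ps]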
Example 27
def generate_iteration_figures(mdps, pis, iteration_fn, name):
    """
    How many steps to converge to the optima from different starting points.
    """
    n = int(np.sqrt(len(mdps)))  # assumes len(mdps) is a perfect square
    plt.figure(figsize=(16, 16))
    for i, mdp in enumerate(mdps):
        print(i)
        Vs = np.hstack([
            utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
            for pi in pis
        ])
        lens, pi_stars = iteration_fn(mdp, pis)

        plt.subplot(n, n, i + 1)
        fig = plt.scatter(Vs[0, :], Vs[1, :], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iterations/{}.png'.format(name))
Example 28
def value_graph_laplacians():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))
    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze() for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)

        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1)+1e-8)

        # mVs = np.mean(Vs, axis=0)  # n_states
        # W = np.dot((Vs - mVs) , (Vs - mVs).T)
        adj = W * A

        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G) #, iterations=500)
        plt.figure(figsize=(16,16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)
        plt.figure(figsize=(8,8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16,16))
        n = 5
        for j in range(n*n):
            plt.subplot(n,n,j+1)
            nx.draw(G, pos, node_color=u[10*j] * v[10*j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra.png'.format(i))
        plt.close()
Example 29
def find_symmetric_mdp(n_states, n_actions, discount, lr=1e-2):
    """
    Approximately find a mdp with ??? symmetry
    """
    model_init = rnd.standard_normal(n_states * n_states * n_actions +
                                     n_states * n_actions)
    pis = utils.get_deterministic_policies(n_states, n_actions)
    # pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]
    pis = np.stack(pis)
    # print(pis.shape)
    V = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, discount),
             in_axes=(None, None, 0))

    def loss_fn(model_params):
        # policy symmetry
        P, r = ss.parse_model_params(n_states, n_actions, model_params)
        return np.sum(
            np.square(
                V(utils.softmax(P), r, pis) -
                V(utils.softmax(P), r, np.flip(pis, 1))))

    # def loss_fn(model_params):
    #     # value symmetry
    #     P, r = ss.parse_model_params(n_states, n_actions, model_params)
    #     vals = V(utils.softmax(P), r, pis)
    #     n = n_states//2
    #     return np.sum(np.square(vals[:, :n] - vals[:, n:]))

    dldp = grad(loss_fn)
    update_fn = lambda model: model - lr * dldp(model)
    init = (model_init, np.zeros_like(model_init))
    model_params, momentum_var = utils.solve(
        ss.momentum_bundler(update_fn, 0.9), init)[-1]

    P, r = ss.parse_model_params(n_states, n_actions, model_params)
    d0 = rnd.random((n_states, 1))
    return utils.MDP(n_states, n_actions, P, r, discount, d0)
Example 30
def variance(P, r, pi, discount):
    # var = int_{s'} int_a p(s'|s, a) pi(a|s) (r(s, a) + gamma * V(s') - V(s))^2
    V = utils.value_functional(P, r, pi, discount)[:, 0]
    d = (r[None, :, :] + discount * V[:, None, None] - V[None, :, None])**2
    expected_d = np.einsum('ijk,ijk->j', P * pi[None, :, :], d)  # aka variance
    return np.sum(expected_d)
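
The einsum packs the double sum over next states and actions into one expression; a quick self-contained check against an explicit loop, on a hypothetical 2-state example:

import numpy as np

S, A, discount = 2, 2, 0.9
P = np.random.dirichlet(np.ones(S), size=(S, A)).transpose(2, 0, 1)  # P[s', s, a]
r = np.random.randn(S, A)
pi = np.ones((S, A)) / A
V = np.random.randn(S)  # stand-in for utils.value_functional(...)[:, 0]

d = (r[None, :, :] + discount * V[:, None, None] - V[None, :, None])**2
vec = np.einsum('ijk,ijk->j', P * pi[None, :, :], d)

loop = np.zeros(S)
for s in range(S):
    for a in range(A):
        for s_next in range(S):
            loop[s] += P[s_next, s, a] * pi[s, a] * (
                r[s, a] + discount * V[s_next] - V[s])**2

assert np.allclose(vec, loop)  # per-state variance of the TD target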