Ejemplo n.º 1
0
def graph_PI():
    """Save one figure per policy-iteration step, drawn on the policy graph.

    A random sparse MDP (10 states, 2 actions) is built and the graph
    returned by ``graph.mdp_topology`` over all deterministic policies is
    laid out once.  For every iterate returned by ``utils.solve`` except the
    last, the iterate's value is passed to ``graph.sparse_coeffs`` and the
    result is used as the node colouring.  Frames are written to
    ``figs/pi_graphs/<step>.png``.
    """
    n_states, n_actions = 10, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    # The layout is computed once so that all frames share node positions.
    topology = nx.from_numpy_array(graph.mdp_topology(det_pis))
    layout = nx.spring_layout(topology, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    def value_of(policy):
        # Value of `policy` under the sampled MDP, with size-1 axes removed.
        return utils.value_functional(
            mdp.P, mdp.r, policy, mdp.discount).squeeze()

    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    coeffs = graph.sparse_coeffs(basis, value_of(init_pi), lr=0.1)

    trajectory = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(trajectory)))

    # The final iterate is not drawn.
    for step, policy in enumerate(trajectory[:-1]):
        print('Iteration: {}'.format(step))
        # Warm-start from the coefficients found for the previous frame.
        coeffs = graph.sparse_coeffs(
            basis, value_of(policy), lr=0.1, a_init=coeffs)
        plt.figure(figsize=(16, 16))
        nx.draw(topology, layout, node_color=coeffs, node_size=150)
        plt.savefig('figs/pi_graphs/{}.png'.format(step))
        plt.close()
Ejemplo n.º 2
0
def value_graph():
    """Draw the policy graph with edges weighted by closeness in value.

    For a random MDP (10 states, 2 actions) the value of every deterministic
    policy is computed.  The matrix from ``graph.mdp_topology`` is multiplied
    elementwise by ``1 / (|sum_s (V_j(s) - V_i(s))| + 1e-8)``, the result is
    laid out with a spring layout, and nodes are coloured by the sum of each
    policy's values.  The figure is saved under ``figs/value_graphs/``.
    """
    n_states, n_actions = 10, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    # Open question: how does the discount affect these graphs?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [
        utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        for pi in det_pis
    ]
    value_matrix = np.stack(values).reshape((n, n_states))

    # gap[i, j] = |sum over states of (V_j - V_i)|; the epsilon keeps the
    # reciprocal finite when two policies have the same summed value.
    gap = np.abs(np.sum(
        value_matrix[None, :, :] - value_matrix[:, None, :], axis=-1))
    weights = 1 / (gap + 1e-8)
    weighted_adj = graph.mdp_topology(det_pis) * weights

    G = nx.from_numpy_array(weighted_adj)
    layout = nx.spring_layout(G, iterations=200)

    plt.figure(figsize=(16, 16))
    node_colours = [np.sum(v) for v in values]
    nx.draw(G, layout, node_color=node_colours, node_size=150)
    plt.savefig(
        'figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()
Ejemplo n.º 3
0
def graph_PG():
    """Save frames showing policy-gradient iterates on the policy graph.

    A random MDP (6 states, 4 actions, discount 0.9) is built and the graph
    returned by ``graph.mdp_topology`` is laid out once.  The solver is run
    from random initial logits; only the first 20 iterates are kept and the
    last of those is not drawn, so at most 19 frames are written to
    ``figs/pg_graphs/<i>.png``.  Each frame colours the nodes with the
    output of ``graph.sparse_coeffs`` for that iterate's value.

    The frames can be stitched into a video with, for example::

        ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    """
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(
        mdp.P, mdp.r, utils.softmax(init_logits), mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    logit_iterates = utils.solve(
        search_spaces.policy_gradient_iteration_logits(mdp, 0.1), init_logits)
    print("\n{} policies to vis".format(len(logit_iterates)))
    # Only the start of the run is visualised (subsampling the whole run
    # would be an alternative).
    logit_iterates = logit_iterates[0:20]

    for i, logits in enumerate(logit_iterates[:-1]):
        print('Iteration: {}'.format(i))
        # The solver is seeded with logits, so its iterates are presumably
        # logits as well; convert them to a policy exactly as is done for
        # `init_logits` above before evaluating the value.
        pi = utils.softmax(logits)
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        # Warm-start from the coefficients found for the previous frame.
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()
Ejemplo n.º 4
0
def test_topology():
    """Print and display the policy graph for 5 states and 2 actions.

    Prints the matrix returned by ``graph.mdp_topology`` for all
    deterministic policies, then draws the corresponding networkx graph in
    an interactive window.
    """
    n_states, n_actions = 5, 2

    policies = utils.get_deterministic_policies(n_states, n_actions)
    adjacency = graph.mdp_topology(policies)
    print(adjacency)

    nx.draw(nx.from_numpy_array(adjacency))
    plt.show()
Ejemplo n.º 5
0
def test_estimation():
    """Smoke-test ``graph.estimate_coeffs`` on a random target vector.

    Builds a random MDP (5 states, 2 actions), constructs the basis over its
    deterministic policies, and prints the coefficients estimated for a
    uniformly random vector of length ``n_states``.
    """
    n_states = 5
    n_actions = 2

    # The MDP must exist before its sizes are read: previously `mdp.S` and
    # `mdp.A` were used one line before `mdp` was assigned, which raised
    # UnboundLocalError.
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = np.random.random((n_states, ))
    a = graph.estimate_coeffs(basis.T, v)
    print(a)
Ejemplo n.º 6
0
def generate_model_iteration():
    """Plot a model-iteration run against the true value polytope.

    For a random 2-state, 2-action MDP this scatters:

    * blue  - values of a grid of policies under the true MDP,
    * green / 'spring' colormap / green cross - the value of ``pi_star``
      under each successive learned model (first, intermediate, last),
    * red   - values of the same grid of policies under the final model.

    The figure is saved to ``figs/model_iteration_1.png``.  Finally the
    policy found by policy iteration on the learned MDP is printed next to
    the one found on the true MDP.
    """
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    # The model needs its own initialisation.  An alternative would be to
    # find an init whose value matches that of the other inits.
    n_params = mdp.S * mdp.S * mdp.A + mdp.S * mdp.A
    init = rnd.standard_normal(n_params)

    true_vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(true_vs[:, 0], true_vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_update = policy_iteration(mdp)
    pi_init = utils.softmax(rnd.standard_normal((mdp.S, mdp.A)))
    pi_star = utils.solve(pi_update, pi_init)[-1]

    # adversarial pis
    apis = np.stack(utils.get_deterministic_policies(mdp.S, mdp.A))

    update_fn = model_iteration(mdp, lr, apis)
    models = [
        parse_model_params(mdp.S, mdp.A, flat)
        for flat in utils.solve(update_fn, init)
    ]

    # Value of pi_star under every intermediate model, one row per model.
    traj_vs = np.vstack([
        utils.value_functional(
            utils.softmax(logits), reward, pi_star, mdp.discount).T
        for logits, reward in models
    ])

    n = traj_vs.shape[0]
    plt.scatter(traj_vs[0, 0], traj_vs[0, 1], c='g', label='PG')
    plt.scatter(traj_vs[1:-1, 0], traj_vs[1:-1, 1],
                c=range(n - 2), cmap='spring', s=10)
    plt.scatter(traj_vs[-1, 0], traj_vs[-1, 1], c='g', marker='x')

    p_logits, r = models[-1]
    learned_P = utils.softmax(p_logits)
    model_vs = utils.polytope(learned_P, r, mdp.discount, pis)
    plt.scatter(model_vs[:, 0], model_vs[:, 1], c='r', s=10, alpha=0.75)
    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')

    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, learned_P, r, mdp.discount, mdp.d0)
    learned_update = policy_iteration(learned_mdp)
    learned_init = utils.softmax(rnd.standard_normal((mdp.S, mdp.A)))
    pi_star_approx = utils.solve(learned_update, learned_init)[-1]
    print(pi_star_approx, '\n', pi_star)
Ejemplo n.º 7
0
def test_sparse_estimation():
    """Smoke-test ``graph.sparse_coeffs`` on the value of one basis policy.

    Builds a random MDP (5 states, 2 actions) and the basis over its
    deterministic policies, then prints the coefficients returned for the
    value of the deterministic policy at index 2.
    """
    n_states, n_actions = 5, 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    target_pi = det_pis[2]
    target_v = utils.value_functional(
        mdp.P, mdp.r, target_pi, mdp.discount).squeeze()

    print(graph.sparse_coeffs(basis, target_v))
Ejemplo n.º 8
0
def estimation_err():
    """
    Plot how well a model gradient is estimated from a subset of policies.

    The reference gradient is that of the summed squared values over all
    deterministic policies, taken with respect to randomly initialised
    model parameters.  For each sample size ``k`` the same gradient is
    computed over ``k`` policies drawn with ``utils.random_det_policy``;
    the max absolute difference to the reference is averaged over 6 draws
    and plotted against ``k``.

    Starts to get interesting in higher dims?
    """
    n_states = 4
    n_actions = 2
    discount = 0.5

    dpis = utils.get_deterministic_policies(n_states, n_actions)
    params = rnd.standard_normal(
        (n_states * n_states * n_actions + n_states * n_actions))

    def value(P, r, pis):
        # A plain Python loop rather than vmap: jax did not seem to cope
        # with the batch size changing between calls.
        return np.array(
            [utils.value_functional(P, r, pi, discount) for pi in pis])

    def loss_fn(params, pis):
        p_logits, r = parse_model_params(n_states, n_actions, params)
        return np.sum(value(utils.softmax(p_logits), r, pis)**2)

    dVdp = jit(lambda *x: np.array(grad(loss_fn, 0)(*x)))
    det_dVdp = dVdp(params, dpis)

    n_det = n_actions**n_states
    sample_sizes = range(n_states, n_det + 1, n_states // 2)

    k_estim_err = []
    for k in sample_sizes:
        print('\n{} det policies. Testing with {}\n'.format(n_det, k))
        diffs = []
        for _trial in range(6):
            sampled = [
                utils.random_det_policy(n_states, n_actions)
                for _ in range(k)
            ]
            approx_dVdp = dVdp(params, np.stack(sampled))
            diffs.append(np.max(np.abs(det_dVdp - approx_dVdp)))
        k_estim_err.append(numpy.mean(diffs))

    plt.plot(sample_sizes, k_estim_err)
    plt.xlabel('Number of randomly sampled policies')
    plt.ylabel('Max error in gradient estimation')
    plt.show()
Ejemplo n.º 9
0
def test_everything():
    """End-to-end check: topology, basis, sparse coefficients and drawing.

    For a random MDP (5 states, 2 actions) this computes the coefficients
    returned by ``graph.sparse_coeffs`` for the value of the deterministic
    policy at index 2 and shows the policy graph coloured by them.
    """
    n_states, n_actions = 5, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    adjacency = graph.mdp_topology(det_pis)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    target_v = utils.value_functional(
        mdp.P, mdp.r, det_pis[2], mdp.discount).squeeze()
    coeffs = graph.sparse_coeffs(basis, target_v)

    G = nx.from_numpy_array(adjacency)
    layout = nx.spring_layout(G, iterations=200)
    nx.draw(G, layout, node_color=coeffs)
    plt.show()
Ejemplo n.º 10
0
def value_graph_laplacian():
    """Plot the value-weighted policy graph and its spectra under noise.

    First draws the policy graph for a random MDP (8 states, 2 actions)
    with edges weighted by ``1 / (|sum_s (V_j(s) - V_i(s))| + 1e-8)`` and
    saves it under ``figs/value_graphs/``.  Then, for 10 noise scales in
    [0, 1], Gaussian noise is added to the values 50 times; the first
    output of ``graph_laplacian_spectra`` for each noisy graph is collected
    and a bar chart of its mean with standard-deviation error bars is saved
    to ``figs/value_graphs/<i>-lap.png``.
    """
    n_states, n_actions = 8, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    values = [
        utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        for pi in det_pis
    ]
    Vs = np.stack(values).reshape((n, n_states))
    A = graph.mdp_topology(det_pis)

    def inverse_gap(vals):
        # Reciprocal of |sum over states of pairwise value differences|;
        # the epsilon keeps the result finite when the gap is zero.
        gap = np.abs(np.sum(vals[None, :, :] - vals[:, None, :], axis=-1))
        return 1 / (gap + 1e-8)

    G = nx.from_numpy_array(A * inverse_gap(Vs))
    layout = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16, 16))
    nx.draw(G, layout,
            node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig(
        'figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # Open questions:
    # - How can the expected eigenvalues be calculated?
    # - Observation: the underlying complexity of the value topology
    #   appears to be linear.
    # - How hard is it to estimate the main eigenvector from noisy
    #   observations?  That would indicate the complexity.
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        spectra = []
        for _ in range(50):
            noisy_Vs = Vs + alpha * np.random.standard_normal(Vs.shape)
            spectrum, _unused = graph_laplacian_spectra(
                A * inverse_gap(noisy_Vs))
            spectra.append(spectrum)
        spectra = np.stack(spectra, axis=0)
        mean = np.mean(spectra, axis=0)
        std = np.sqrt(np.var(spectra, axis=0))
        plt.bar(range(len(mean)), mean, yerr=std)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()
Ejemplo n.º 11
0
def plot():
    """Show the value polytope of a random 2-state, 2-action MDP.

    Scatters the values of a grid of policies (default colour) and then the
    values of the deterministic policies in red, with the value of state 1
    on the x-axis and the value of state 2 on the y-axis.
    """
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    # Batched evaluation of the value over a stack of policies.
    value = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    grid_pis = np.stack(utils.gen_grid_policies(101), axis=0)
    grid_vs = value(grid_pis)
    plt.scatter(grid_vs[:, 0], grid_vs[:, 1], s=10)

    det_pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    det_vs = value(det_pis)
    plt.scatter(det_vs[:, 0], det_vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')
    plt.title('The value polytope')

    plt.show()
Ejemplo n.º 12
0
def value_graph_laplacians():
    """Save the value graph, its Laplacian spectrum and a grid of modes.

    For one random MDP (8 states, 2 actions) the policy graph is weighted by
    ``exp(-||V_i - V_j||_inf + 1e-8)`` and three figures are written under
    ``figs/value_graphs/``:

    * ``<i>-value_graph-<S>-<A>.png`` - spectral layout, nodes coloured by
      summed value,
    * ``<i>-lap.png`` - bar chart of the first output of
      ``graph_laplacian_spectra``,
    * ``<i>-spectra.png`` - a 5x5 grid of the graph coloured by
      ``u[10*j] * v[10*j]`` for j = 0..24.
    """
    n_states, n_actions = 8, 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))
    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        values = [
            utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
            for pi in det_pis
        ]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)

        # Pairwise sup-norm distance between value vectors.
        sup_dist = np.linalg.norm(
            Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1)
        W = np.exp(-sup_dist + 1e-8)
        # An alternative weighting that was tried is the covariance of the
        # mean-centred values: (Vs - mean) @ (Vs - mean).T
        adj = W * A

        G = nx.from_numpy_array(adj)
        layout = nx.spectral_layout(G)
        summed_values = [np.sum(v) for v in values]
        plt.figure(figsize=(16, 16))
        nx.draw(G, layout, node_color=summed_values, node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(
            i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)
        plt.figure(figsize=(8, 8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16, 16))
        side = 5
        for panel in range(side * side):
            mode = 10 * panel  # every tenth entry of the spectrum
            plt.subplot(side, side, panel + 1)
            nx.draw(G, layout, node_color=u[mode] * v[mode], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra.png'.format(
            i, n_states, n_actions))
        plt.close()
Ejemplo n.º 13
0
def find_symmetric_mdp(n_states, n_actions, discount, lr=1e-2):
    """
    Approximately find an MDP whose values are invariant to flipping policies.

    Model parameters are optimised by gradient descent with momentum (0.9)
    to minimise the summed squared difference between the value of every
    deterministic policy and the value of the same policy flipped along
    axis 1 of the stacked policy array (``np.flip(pis, 1)``).

    Args:
        n_states: number of states of the MDP.
        n_actions: number of actions of the MDP.
        discount: discount factor used when evaluating values.
        lr: step size of the gradient update.

    Returns:
        A ``utils.MDP`` built from the optimised transition model and
        reward, with a uniformly random ``d0`` of shape ``(n_states, 1)``.
    """
    model_init = rnd.standard_normal(n_states * n_states * n_actions +
                                     n_states * n_actions)
    pis = np.stack(utils.get_deterministic_policies(n_states, n_actions))

    # Batch over policies only; the model (P, r) is shared.
    V = vmap(lambda P, r, pi: utils.value_functional(P, r, pi, discount),
             in_axes=(None, None, 0))

    def loss_fn(model_params):
        # policy symmetry
        p_logits, r = ss.parse_model_params(n_states, n_actions, model_params)
        P = utils.softmax(p_logits)
        return np.sum(np.square(V(P, r, pis) - V(P, r, np.flip(pis, 1))))

    # An alternative objective that was considered is value symmetry:
    # penalise the difference between the values of the first and second
    # half of the states, np.square(vals[:, :n] - vals[:, n:]).

    dldp = grad(loss_fn)
    update_fn = lambda model: model - lr * dldp(model)
    init = (model_init, np.zeros_like(model_init))
    # The solver state is (parameters, momentum); only the parameters of the
    # final iterate are needed.
    model_params, _ = utils.solve(
        ss.momentum_bundler(update_fn, 0.9), init)[-1]

    p_logits, r = ss.parse_model_params(n_states, n_actions, model_params)
    d0 = rnd.random((n_states, 1))
    # The loss always passes the parsed transition parameters through
    # softmax, so they are logits.  Return the normalised transition model
    # that was actually optimised rather than the raw logits.
    return utils.MDP(n_states, n_actions, utils.softmax(p_logits), r,
                     discount, d0)