Example 1
def generate_vi(mdp, c, lr=0.1):
    """Run value iteration from a random init and plot the trajectory of the first two state values."""
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount)
    vs = np.stack(utils.solve(ss.value_iteration(mdp, lr), init_v))[:, :, 0]
    n = vs.shape[0]
    # start point (coloured c), intermediate iterates (viridis), final point (magenta x)
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
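These examples lean heavily on utils.solve and utils.value_functional, which are not shown. Below is a minimal sketch of what they are assumed to do (iterate an update function to convergence and return every iterate; evaluate a policy in closed form), under guessed index conventions of P[s_next, s, a] and r[s, a]; the actual utils implementation may differ.

import numpy as np

def solve(update_fn, init, atol=1e-8, max_iters=10000):
    # iterate update_fn from init until the iterates stop changing; return all iterates
    xs = [init]
    for _ in range(max_iters):
        xs.append(update_fn(xs[-1]))
        if np.allclose(xs[-1], xs[-2], atol=atol):
            break
    return xs

def value_functional(P, r, pi, discount):
    # closed-form policy evaluation: V = (I - discount * P_pi^T)^{-1} r_pi
    P_pi = np.einsum('ijk,jk->ij', P, pi)         # P_pi[s_next, s]
    r_pi = np.sum(r * pi, axis=1, keepdims=True)  # expected reward per state, shape (S, 1)
    return np.linalg.solve(np.eye(P.shape[1]) - discount * P_pi.T, r_pi)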
Example 2
def generate_pg(mdp, c, lr=0.01):
    """Run policy-gradient ascent on policy logits and plot the induced value trajectory."""
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_logit = np.log(init_pi)
    logits = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr), init_logit)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount) for logit in logits])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
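ss.policy_gradient_iteration_logits is not shown in this snippet. A hedged sketch of what such an update could look like, assuming exact gradient ascent on the summed state values with respect to the policy logits, and assuming utils is jax-compatible (as the grad/vmap usage in Example 10 suggests):

from jax import grad
import jax.numpy as jnp

def policy_gradient_iteration_logits(mdp, lr):
    # assumed: exact gradient ascent on sum_s V_pi(s) w.r.t. the policy logits
    def value_of_logits(logits):
        pi = utils.softmax(logits)
        return jnp.sum(utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    dVdlogits = grad(value_of_logits)
    def update_fn(logits):
        return logits + lr * dVdlogits(logits)
    return update_fn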
Example 3
def generate_model_cs():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?



    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)

    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star,
                                mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount,
                        mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal(
                              (mdp.S, mdp.A))))[-1]
    print(pi_star)
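parse_model_params and model_iteration are not shown. Below is a sketch of how parse_model_params is assumed to unpack the flat parameter vector, with shapes chosen to match the init above and the later softmax(p_logits) / r usage; the reshape order is a guess.

def parse_model_params(n_states, n_actions, params):
    # split a flat vector into transition logits (S*S*A) and rewards (S*A)
    n_p = n_states * n_states * n_actions
    p_logits = params[:n_p].reshape((n_states, n_states, n_actions))
    r = params[n_p:].reshape((n_states, n_actions))
    return p_logits, r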
Example 4
def k_step_option_similarity():
    n_states, n_actions = 6, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

    pi = utils.random_policy(n_states, n_actions)
    P = multi_step_transition_fn(mdp.P, pi, 3)
    # P[:,-1] = P[:,-2]
    # kl(s1, s2) = sum_s' P(s'|s1) * log( P(s'|s1) / P(s'|s2) ), the KL between k-step futures
    kl = -np.sum(P[:, :, None] * np.log(P[:, None, :] / P[:, :, None]), axis=0)
    print(kl)
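multi_step_transition_fn is assumed to average the transitions over the policy and compose the result k times, yielding the k-step next-state distributions compared by the KL above. A sketch under the same P[s_next, s, a] convention assumed earlier:

import numpy as np

def multi_step_transition_fn(P, pi, k):
    # one-step policy-averaged transitions, P_pi[s_next, s]
    P_pi = np.einsum('ijk,jk->ij', P, pi)
    # k-step transitions; columns still sum to 1
    return np.linalg.matrix_power(P_pi, k)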
Example 5
def generate_pi(mdp, c):
    """Run policy iteration from a random init and plot the values of the visited policies."""
    init_pi = utils.random_policy(mdp.S, mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    # short arrows indicating the direction of each policy-iteration step
    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)
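One illustrative way to use the helpers from Examples 1, 2 and 5 together: overlay the value iteration, policy gradient and policy iteration trajectories on the value polytope of a small 2-state MDP. The discount, colours and number of runs here are arbitrary choices, not taken from the repo.

# illustrative usage; a 2-state MDP is used so the two plotted coordinates
# cover the full value polytope
mdp = utils.build_random_mdp(2, 2, 0.9)
generate_vi(mdp, 'b')
generate_pg(mdp, 'g')
generate_pi(mdp, 'r')
plt.legend()
plt.show()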
Example 6
def emp_est_snr_graph():
    n_states, n_actions = 12, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]

    vs = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)

        # try:
        vs.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))
        # except ValueError as err:
        #     print(err)

    plt.scatter(hs, vs)
    plt.show()
Example 7
def state_action_vis():
    # want to pick policies that maximise exploration.
    # but how to solve for this analytically?! not sure this is going to work...
    # unless? is there a way to analytically set pi = 1/visitation?!
    # if we iterate: estimate visitation under pi, set pi = 1/visitation.
    # does it converge? where does it converge?
    # it shouldn't converge?!?

    mdp = utils.build_random_mdp(12, 2, 0.5)
    pi = utils.random_policy(mdp.S, mdp.A)
    v_sa_sa = state_action_visitation_distribution(mdp, pi)

    # initial state-action distribution: d0_sa(s, a) = d0(s) * pi(a|s)
    d0_sa = np.reshape(np.einsum('jk,jl->jk', pi, mdp.d0), (mdp.S * mdp.A, ))
    # average over initial conditions to get discounted state-action visitation probabilities
    ps = np.einsum('ik,k->i', v_sa_sa, d0_sa)

    plt.imshow(v_sa_sa)
    plt.show()
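state_action_visitation_distribution presumably builds the state-action to state-action transition matrix under pi and inverts the discounted resolvent. A sketch under the assumed P[s_next, s, a] convention (not the repo's actual code):

import numpy as np

def state_action_visitation_distribution(mdp, pi):
    S, A = mdp.S, mdp.A
    # P_sa[(s', a'), (s, a)] = P(s'|s, a) * pi(a'|s')
    P_sa = np.einsum('isa,ib->ibsa', mdp.P, pi).reshape((S * A, S * A))
    # discounted visitation resolvent: (I - discount * P_sa)^{-1}
    return np.linalg.inv(np.eye(S * A) - mdp.discount * P_sa)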
Example 8
def generate_snr_map():
    n_states, n_actions = 2, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    # pis = utils.gen_grid_policies(11)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(512)]
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    mags = [grad_mag(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    uncert = [variance(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]

    snr = [s / n for s, n in zip(mags, uncert)]

    plt.subplot(3, 1, 1)
    plt.title('Magnitude')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=mags)

    plt.subplot(3, 1, 2)
    plt.title('Variance')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=uncert)

    plt.subplot(3, 1, 3)
    plt.title('SNR')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=snr)
    plt.show()
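grad_mag and variance are not shown here; one plausible reading is that grad_mag measures the size of the exact policy gradient at pi (the 'signal') while variance supplies the corresponding noise estimate. A hedged sketch of grad_mag under that assumption, again requiring a jax-compatible utils:

from jax import grad
import jax.numpy as jnp

def grad_mag(P, r, pi, discount):
    # assumed definition: l2 norm of d(sum_s V_pi(s)) / d(logits) at logits = log(pi)
    logits = jnp.log(pi)
    value = lambda l: jnp.sum(utils.value_functional(P, r, utils.softmax(l), discount))
    return jnp.linalg.norm(grad(value)(logits))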
Example 9
# left
P[:, :, 3] = np.array([
    [1, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 0, 0, 0],
])

# rewards. 6 x 4
r = np.array([
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [1, 0, 0, 0],  # rewarded for going up at the finish.
    [1, 0, 0, 0],
])

# initial distribution
d0 = np.array([[0.5, 0.5, 0, 0, 0, 0]])

pi = np.array(utils.random_policy(6, 4))
pi[[0, 2, 4]] = pi[[1, 3, 5]]
V = utils.value_functional(P, r, pi, 0.5)
Q_t = utils.bellman_operator(P, r, V, 0.5)
# print(np.sum(P, axis=-1))
print(Q_t)
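utils.bellman_operator is assumed to apply one Bellman backup to V. A sketch consistent with the column-stochastic P[s_next, s, a] blocks defined above; the real utils version may use a different convention:

import numpy as np

def bellman_operator(P, r, V, discount):
    # Q[s, a] = r[s, a] + discount * sum_s' P[s', s, a] * V[s']
    return r + discount * np.einsum('isa,i->sa', P, np.ravel(V))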
Example 10
    Can we do TD with values from different MDPs?
    """
    V_true = vmap(
        lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    V_guess = vmap(lambda params, pi: utils.value_functional(
        *mdp_sampler(params), pi, mdp.discount),
                   in_axes=(None, None, 0))
    dLdp = grad(lambda params: mse(V_true(pis), V_guess(params, pis)))

    @jit
    def update_fn(params, Q):
        m = symmetric_sampler(params)
        Q_ = utils.bellman_optimality_operator(m.P, m.r, Q, m.discount)
        params_tp1 = params - lr * dLdp(
            params
        )  # done based on observations... could use model iteration!?
        Q_tp1 = Q + lr * (Q_ - Q)
        return Q_tp1, params_tp1

    return update_fn


if __name__ == "__main__":
    # np.random.seed(0)
    n_states, n_actions = 16, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(1000)]
    # pis = utils.get_deterministic_policies(n_states, n_actions)

    onoffpolicy_abstraction(mdp, pis)
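Finally, utils.build_random_mdp appears in every example. A sketch of what it is assumed to return, matching the fields accessed throughout (S, A, P, r, discount, d0) and the utils.MDP constructor used in Example 3; the normalisation and reward scale are guesses.

from collections import namedtuple
import numpy as np

MDP = namedtuple('MDP', ['S', 'A', 'P', 'r', 'discount', 'd0'])

def build_random_mdp(n_states, n_actions, discount):
    P = np.random.random((n_states, n_states, n_actions))
    P = P / P.sum(axis=0, keepdims=True)              # column-stochastic: P[s_next, s, a]
    r = np.random.standard_normal((n_states, n_actions))
    d0 = np.random.random((n_states, 1))
    d0 = d0 / d0.sum()                                # initial state distribution
    return MDP(n_states, n_actions, P, r, discount, d0)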