def generate_vi(mdp, c, lr=0.1):
    """Run value iteration from a random initial value and scatter the value
    trajectory: start in colour c, intermediate steps in viridis, end as a magenta x."""
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount)
    vs = np.stack(utils.solve(ss.value_iteration(mdp, lr), init_v))[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

def generate_pg(mdp, c, lr=0.01):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    init_logit = np.log(init_pi)
    logits = utils.solve(ss.policy_gradient_iteration_logits(mdp, lr), init_logit)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, utils.softmax(logit), mdp.discount)
                   for logit in logits])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

def generate_model_cs():
    """
    Compare fitting a model against deterministic policies versus mixed policies.
    Starts to get interesting in higher dims?
    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis: a sample of deterministic policies
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])
    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
         - utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)

    # adversarial pis: mixed (stochastic) policies
    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])
    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
         - utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)

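# The two passes in generate_model_cs repeat the same fit / evaluate / re-solve
# pattern. A minimal refactor sketch of that pattern, assuming the same helpers
# (model_iteration, parse_model_params, policy_iteration, utils.MDP) are in scope;
# the name fit_and_evaluate is ours, not part of the original.
def fit_and_evaluate(mdp, apis, init, pi_star, lr=0.01):
    # fit a model against the adversarial policies
    params = utils.solve(model_iteration(mdp, lr, apis), init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    # squared error in evaluating pi_star under the learned model
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
         - utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    # re-solve for the optimal policy of the learned model
    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    new_pi_star = utils.solve(policy_iteration(new_mdp),
                              utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    return error, new_pi_star
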
def k_step_option_similarity():
    n_states, n_actions = 6, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pi = utils.random_policy(n_states, n_actions)
    P = multi_step_transition_fn(mdp.P, pi, 3)
    # P[:,-1] = P[:,-2]
    # s(o1, o2) = sum_s' P(s' | s1) * log( P(s' | s2) / P(s' | s1))
    # i.e. kl[s1, s2] = KL( P(. | s1) || P(. | s2) )
    kl = -np.sum(P[:, :, None] * np.log(P[:, None, :] / P[:, :, None]), axis=0)
    print(kl)

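# A small self-contained sanity check of the pairwise-KL broadcast used above, on a
# toy column-stochastic matrix (columns are next-state distributions). The numbers
# are made up for illustration only.
def _kl_sanity_check():
    P_toy = np.array([[0.7, 0.2, 0.5],
                      [0.2, 0.5, 0.3],
                      [0.1, 0.3, 0.2]])
    kl_toy = -np.sum(P_toy[:, :, None] * np.log(P_toy[:, None, :] / P_toy[:, :, None]), axis=0)
    assert np.allclose(np.diag(kl_toy), 0)  # KL(p || p) = 0
    assert np.all(kl_toy >= -1e-12)         # KL is non-negative
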
def generate_pi(mdp, c):
    """Run policy iteration from a random policy, scatter the value trajectory and
    draw small arrows between consecutive iterates."""
    init_pi = utils.random_policy(mdp.S, mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)
                   for pi in pis])[:, :, 0]
    n = vs.shape[0]
    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')
    for i in range(len(vs)-2):
        dv = 0.1*(vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)

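# A minimal usage sketch for the three generators above. The scatter calls index
# vs[:, 0] and vs[:, 1], so this assumes a 2-state MDP (a 2D value polytope); the
# colours and learning rates are arbitrary choices, not from the original.
def compare_solver_trajectories():
    mdp = utils.build_random_mdp(2, 2, 0.5)
    generate_vi(mdp, c='b', lr=0.1)
    generate_pg(mdp, c='g', lr=0.01)
    generate_pi(mdp, c='r')
    plt.legend()
    plt.show()
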
def emp_est_snr_graph():
    n_states, n_actions = 12, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]
    vs = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)
        # try:
        vs.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))
        # except ValueError as err:
        #     print(err)
    plt.scatter(hs, vs)
    plt.show()

def state_action_vis():
    # want to pick policies that maximise exploration.
    # but. how to solve for this analytically?! not sure this is going to work...
    # unless? is there a way to analytically set pi = 1/visitation?!
    # if we iterate: estimate visitation under pi, set pi = 1/visitation.
    # does it converge? where does it converge? it shouldn't converge?!?
    mdp = utils.build_random_mdp(12, 2, 0.5)
    pi = utils.random_policy(mdp.S, mdp.A)
    v_sa_sa = state_action_visitation_distribution(mdp, pi)
    # sum over initial conditions to get the discounted state-action visitation probability
    d0_sa = np.reshape(np.einsum('jk,jl->jk', pi, mdp.d0), (mdp.S * mdp.A, ))
    ps = np.einsum('ik,k->i', v_sa_sa, d0_sa)
    plt.imshow(v_sa_sa)
    plt.show()

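# A hedged sketch of the iteration asked about in the comments above: estimate the
# discounted state-action visitation under pi, set pi proportional to 1/visitation,
# and repeat. The iteration count, epsilon, and per-state renormalisation are our
# assumptions, not part of the original.
def inverse_visitation_iteration(mdp, n_iters=10, eps=1e-8):
    pi = utils.random_policy(mdp.S, mdp.A)
    for _ in range(n_iters):
        v_sa_sa = state_action_visitation_distribution(mdp, pi)
        d0_sa = np.reshape(np.einsum('jk,jl->jk', pi, mdp.d0), (mdp.S * mdp.A, ))
        ps = np.einsum('ik,k->i', v_sa_sa, d0_sa).reshape((mdp.S, mdp.A))
        pi = 1.0 / (ps + eps)
        pi = pi / pi.sum(axis=1, keepdims=True)  # renormalise so each row is a distribution
    return pi
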
def generate_snr_map():
    n_states, n_actions = 2, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    # pis = utils.gen_grid_policies(11)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(512)]
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    mags = [grad_mag(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    uncert = [variance(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    snr = [s / n for s, n in zip(mags, uncert)]

    plt.subplot(3, 1, 1)
    plt.title('Magnitude')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=mags)
    plt.subplot(3, 1, 2)
    plt.title('Variance')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=uncert)
    plt.subplot(3, 1, 3)
    plt.title('SNR')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=snr)
    plt.show()

# left
P[:, :, 3] = np.array([
    [1, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 0, 0, 0],
])

# rewards. 6 x 4
r = np.array([
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [1, 0, 0, 0],  # rewarded for going up at the finish.
    [1, 0, 0, 0],
])

# initial distribution
d0 = np.array([[0.5, 0.5, 0, 0, 0, 0]])

pi = np.array(utils.random_policy(6, 4))
pi[[0, 2, 4]] = pi[[1, 3, 5]]

V = utils.value_functional(P, r, pi, 0.5)
Q_t = utils.bellman_operator(P, r, V, 0.5)
# print(np.sum(P, axis=-1))
print(Q_t)

    Can we do TD with values from different MDPs?
    """
    V_true = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))
    V_guess = vmap(lambda params, pi: utils.value_functional(
        *mdp_sampler(params), pi, mdp.discount), in_axes=(None, 0))
    dLdp = grad(lambda params: mse(V_true(pis), V_guess(params, pis)))

    @jit
    def update_fn(params, Q):
        m = symmetric_sampler(params)
        Q_ = utils.bellman_optimality_operator(m.P, m.r, Q, m.discount)
        # model update is done based on observations... could use model iteration!?
        params_tp1 = params - lr * dLdp(params)
        Q_tp1 = Q + lr * (Q_ - Q)
        return Q_tp1, params_tp1

    return update_fn


if __name__ == "__main__":
    # np.random.seed(0)
    n_states, n_actions = 16, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(1000)]
    # pis = utils.get_deterministic_policies(n_states, n_actions)
    onoffpolicy_abstraction(mdp, pis)