def generate_polytope_densities():
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(41)
    nx = 4
    ny = 5

    plt.figure(figsize=(16, 16))
    for i in range(nx * ny):
        print(i)
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

        # just set all to be the same probability
        p_pi = 0.1
        pVs = [density_value_functional(p_pi, mdp.P, mdp.r, pi, mdp.discount)
               for pi in pis]

        plt.subplot(nx, ny, i + 1)
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=pVs)
        # plt.colorbar()
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.show()

def random_test():
    """
    Explore how the unconstrained dynamics behave in a random setting.
    """
    n_states, n_actions = 3, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    P = mdp.P
    r = mdp.r

    # P(. | s, a) is a distribution over future states
    assert np.isclose(np.sum(P, axis=0), np.ones((n_states, n_actions))).all()

    p, q = mdp_encoder(P, r)
    # print('P', P)
    # print('r', r)
    # print('q', q)
    # print('p', p)

    # r(s, a) = q(s) - KL(P(. | s, a) || p(. | s))
    # TODO how to do with matrices!?
    # kl = - (np.einsum('ijk,ij->jk', P, np.log(p)) - np.einsum('ijk,ijk->jk', P, np.log(P)))
    ce = np.zeros((n_states, n_actions))
    for j in range(n_states):  # states
        for k in range(n_actions):  # actions
            ce[j, k] = CE(P[:, j, k], p[:, j])
    r_approx = q[:, np.newaxis] + ce

    print('r', np.around(r, 3), r.shape)
    print('r_approx', np.around(r_approx, 3), r_approx.shape)
    print('r ~= q - KL(P || p): {}'.format(
        np.isclose(r, r_approx, atol=1e-3).all()))

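# `CE` is not defined in this file. For r_approx = q + ce above to recover
# r = q - KL(P(. | s, a) || p(. | s)), CE(P, p) must return the *negative*
# KL divergence, sum_s' P(s') log(p(s') / P(s')). A minimal sketch under
# that assumption (not necessarily the repo's definition):
def CE(P_sa, p_s, eps=1e-12):
    # negative KL between the controlled and passive next-state distributions
    return np.sum(P_sa * (np.log(p_s + eps) - np.log(P_sa + eps)))
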
def graph_PG():
    # ffmpeg -framerate 10 -start_number 0 -i %d.png -c:v libx264 -r 30 -pix_fmt yuv420p out.mp4
    n_states = 6
    n_actions = 4

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_logits = np.random.standard_normal((n_states, n_actions))
    init_v = utils.value_functional(mdp.P, mdp.r, utils.softmax(init_logits),
                                    mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    print('\nSolving PG')
    pis = utils.solve(search_spaces.policy_gradient_iteration_logits(mdp, 0.1),
                      init_logits)
    print("\n{} policies to vis".format(len(pis)))
    n = len(pis)
    # pis = pis[::n//100]
    pis = pis[0:20]

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)
        plt.figure()
        nx.draw(G, pos, node_color=a)
        # plt.show()
        plt.savefig('figs/pg_graphs/{}.png'.format(i))
        plt.close()

def value_graph():
    # vs = [np.sum(utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()**2) for pi in det_pis]
    # plt.figure(figsize=(16,16))
    # nx.draw(G, pos, node_color=vs, node_size=150)
    # plt.savefig('figs/pi_graphs/val.png')
    # plt.close()
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))

    # how does the discount affect these!?
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
              for pi in det_pis]
    vs = np.stack(values).reshape((n, n_states))

    # weight edges by the inverse distance between the two policies' values
    W = 1 / (np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
    A = graph.mdp_topology(det_pis)
    adj = A * W

    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16, 16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

def generate_model_cs():
    """
    Compare using all deterministic policies versus fewer mixed policies.
    Starts to get interesting in higher dims?
    """
    n_states = 32
    n_actions = 2
    lr = 0.01
    k = 64

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print('pi_star\n', pi_star)

    # adversarial pis
    # apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack([utils.random_det_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)

    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)

    # repeat with random mixed policies
    apis = np.stack([utils.random_policy(mdp.S, mdp.A) for _ in range(k)])

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    p_logits, r = parse_model_params(mdp.S, mdp.A, params[-1])
    error = np.mean(
        (utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount) -
         utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount))**2)
    print('\n', error)

    new_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r, mdp.discount, mdp.d0)
    pi_star = utils.solve(policy_iteration(new_mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star)

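# `parse_model_params` is assumed to split the flat parameter vector used in
# `init` above (S*S*A transition logits followed by S*A rewards) back into
# its two pieces. A minimal sketch under that assumption, not necessarily
# the repo's definition:
def parse_model_params(S, A, params):
    p_logits = params[:S * S * A].reshape((S, S, A))
    r = params[S * S * A:].reshape((S, A))
    return p_logits, r
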
def k_step_option_similarity():
    n_states, n_actions = 6, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pi = utils.random_policy(n_states, n_actions)
    P = multi_step_transition_fn(mdp.P, pi, 3)
    # P[:,-1] = P[:,-2]

    # s(o1, o2) = - sum_s' P(s' | s1) * log( P(s' | s2) / P(s' | s1) )
    kl = -np.sum(P[:, :, None] * np.log(P[:, None, :] / P[:, :, None]), axis=0)
    print(kl)

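# `multi_step_transition_fn` is assumed to collapse the action dimension with
# pi and return the k-step state-transition matrix (P_pi)^k, consistent with
# the 2d indexing P[s', s] above. A minimal sketch under that assumption:
def multi_step_transition_fn(P, pi, k):
    P_pi = np.einsum('ijk,jk->ij', P, pi)  # P_pi[s', s]
    return np.linalg.matrix_power(P_pi, k)
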
def test_estimation():
    n_states = 5
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = np.random.random((n_states, ))
    a = graph.estimate_coeffs(basis.T, v)
    print(a)

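# `graph.estimate_coeffs` is assumed to solve for the coefficients of v in
# the policy-value basis by least squares, i.e. v ~= B @ a. A minimal sketch
# under that assumption:
def estimate_coeffs(B, v):
    a, *_ = np.linalg.lstsq(B, v, rcond=None)
    return a
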
def lmdp_field():
    """
    For each policy:
    - calculate its dynamics, P_pi,
    - estimate the value via the LMDP,
    - plot the difference under the linearised Bellman operator.
    """
    n_states, n_actions = 2, 2
    pis = utils.gen_grid_policies(11)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)

    vs = []
    dvs = []
    for pi in pis:
        u = np.einsum('ijk,jk->ij', mdp.P, pi)
        v = lmdps.linear_value_functional(p, q, u, mdp.discount)
        z = np.exp(v)
        Tz = lmdps.linear_bellman_operator(p, q, z, mdp.discount)
        dv = np.log(Tz) - np.log(z)
        vs.append(v)
        dvs.append(dv)

    dvs = np.vstack(dvs)
    vs = np.vstack(vs)
    normed_dvs = utils.normalize(dvs)

    plt.figure(figsize=(16, 16))
    plt.subplot(1, 2, 1)
    plt.title('Linearised Bellman operator')
    plt.quiver(vs[:, 0], vs[:, 1], normed_dvs[:, 0], normed_dvs[:, 1],
               np.linalg.norm(dvs, axis=1))

    # plot the Bellman optimality operator for comparison
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)
    diff_op = lambda V: utils.bellman_optimality_operator(
        mdp.P, mdp.r, np.expand_dims(V, 1), mdp.discount) - np.expand_dims(V, 1)
    dVs = np.stack([np.max(diff_op(V), axis=1) for V in Vs])
    normed_dVs = utils.normalize(dVs)

    plt.subplot(1, 2, 2)
    plt.title('Bellman operator')
    plt.quiver(Vs[:, 0], Vs[:, 1], normed_dVs[:, 0], normed_dVs[:, 1],
               np.linalg.norm(dVs, axis=1))

    # plt.savefig('figs/LBO_BO.png')
    plt.show()

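# `lmdps.linear_bellman_operator` is not shown in this file. A minimal sketch
# of one common discounted variant (an assumption, in the style of
# Todorov's linearly solvable MDPs, with p[s', s] the passive dynamics):
# (Tz)(s) = exp(q(s)) * sum_s' p(s' | s) * z(s')**discount.
def linear_bellman_operator(p, q, z, discount):
    return np.exp(q) * np.dot(p.T, z**discount)
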
def generate_cvi():
    print('\nRunning complex value iteration')
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    fn = ss.complex_value_iteration(mdp, 0.01)
    Q = rnd.standard_normal((n_states, 1)) + 1j * rnd.standard_normal((n_states, 1))
    results = utils.solve(fn, Q)
    print(results)

def generate_model_iteration():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(7)

    # needs its own init. alternatively could find an init that matches the
    # value of the other inits?!?
    init = rnd.standard_normal((mdp.S * mdp.S * mdp.A + mdp.S * mdp.A))

    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], c='b', s=10, alpha=0.75)

    lr = 0.01
    pi_star = utils.solve(policy_iteration(mdp),
                          utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]

    # adversarial pis
    apis = utils.get_deterministic_policies(mdp.S, mdp.A)
    apis = np.stack(apis)

    update_fn = model_iteration(mdp, lr, apis)
    params = utils.solve(update_fn, init)
    params = [parse_model_params(mdp.S, mdp.A, p) for p in params]
    vs = np.vstack([
        utils.value_functional(utils.softmax(p_logits), r, pi_star, mdp.discount).T
        for p_logits, r in params
    ])
    n = vs.shape[0]

    plt.scatter(vs[0, 0], vs[0, 1], c='g', label='start')
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n - 2), cmap='spring', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='g', marker='x')

    p_logits, r = params[-1]
    vs = utils.polytope(utils.softmax(p_logits), r, mdp.discount, pis)
    plt.scatter(vs[:, 0], vs[:, 1], c='r', s=10, alpha=0.75)

    plt.title('Model iteration')
    plt.xlabel('Value of state 1')
    plt.ylabel('Value of state 2')
    # plt.show()
    plt.savefig('figs/model_iteration_1.png')

    learned_mdp = utils.MDP(mdp.S, mdp.A, utils.softmax(p_logits), r,
                            mdp.discount, mdp.d0)
    pi_star_approx = utils.solve(
        policy_iteration(learned_mdp),
        utils.softmax(rnd.standard_normal((mdp.S, mdp.A))))[-1]
    print(pi_star_approx, '\n', pi_star)

def test_sparse_estimation():
    n_states = 5
    n_actions = 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    det_pis = utils.get_deterministic_policies(mdp.S, mdp.A)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    v = utils.value_functional(mdp.P, mdp.r, det_pis[2], mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)
    print(a)

def state_action_vis():
    # want to pick policies that maximise exploration.
    # but. how to solve for this analytically?! not sure this is going to work...
    # unless? is there a way to analytically set pi = 1/visitation?!
    # if we iterate: estimate visitation under pi, set pi = 1/visitation.
    # does it converge? where does it converge?
    # it shouldn't converge?!?
    mdp = utils.build_random_mdp(12, 2, 0.5)
    pi = utils.random_policy(mdp.S, mdp.A)
    v_sa_sa = state_action_visitation_distribution(mdp, pi)

    # sum over initial conditions to get the discounted state-action
    # visitation probability
    d0_sa = np.reshape(np.einsum('jk,jl->jk', pi, mdp.d0), (mdp.S * mdp.A, ))
    ps = np.einsum('ik,k->i', v_sa_sa, d0_sa)

    plt.imshow(v_sa_sa)
    plt.show()

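# `state_action_visitation_distribution` is assumed to return the discounted
# resolvent (I - gamma * P_sa)^{-1}, where P_sa[(s', a'), (s, a)] =
# P(s' | s, a) * pi(a' | s'). A minimal sketch under that assumption:
def state_action_visitation_distribution(mdp, pi):
    S, A = mdp.S, mdp.A
    # T[s', a', s, a] = P(s' | s, a) * pi(a' | s')
    P_sa = np.einsum('ijk,il->iljk', mdp.P, pi).reshape((S * A, S * A))
    return np.linalg.inv(np.eye(S * A) - mdp.discount * P_sa)
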
def emp_est_snr_graph():
    n_states, n_actions = 12, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(100)]

    vs = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)
        # try:
        vs.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))
        # except ValueError as err:
        #     print(err)

    plt.scatter(hs, vs)
    plt.show()

def test_everything():
    n_states = 5
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    A = graph.mdp_topology(det_pis)
    basis = graph.construct_mdp_basis(det_pis, mdp)

    # v = np.random.random((n_states, ))
    v = utils.value_functional(mdp.P, mdp.r, det_pis[2], mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, v)

    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)
    nx.draw(G, pos, node_color=a)
    plt.show()

def value_graph_laplacian():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    n = len(det_pis)
    print('n pis: {}'.format(n))

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
              for pi in det_pis]
    Vs = np.stack(values).reshape((n, n_states))

    A = graph.mdp_topology(det_pis)
    W = 1 / (np.abs(np.sum(Vs[None, :, :] - Vs[:, None, :], axis=-1)) + 1e-8)
    adj = A * W

    G = nx.from_numpy_array(adj)
    pos = nx.spring_layout(G, iterations=200)
    plt.figure(figsize=(16, 16))
    nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
    plt.savefig('figs/value_graphs/value_graph-{}-{}.png'.format(n_states, n_actions))
    plt.close()

    # how can you calculate expected eigenvalues!?
    # observation: the underlying complexity of the value topology is linear!?!?
    # how hard is it to estimate the main eigenvector from noisy observations!?
    # that would tell us the complexity!?!?
    for i, alpha in enumerate(np.linspace(0, 1, 10)):
        us = []
        for _ in range(50):
            vs = Vs + alpha * np.random.standard_normal(Vs.shape)
            W = 1 / (np.abs(np.sum(vs[None, :, :] - vs[:, None, :], axis=-1)) + 1e-8)
            adj = A * W
            u, v = graph_laplacian_spectra(adj)
            us.append(u)
        us = np.stack(us, axis=0)
        mean = np.mean(us, axis=0)
        var = np.var(us, axis=0)

        plt.bar(range(len(mean)), mean, yerr=np.sqrt(var))
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

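# `graph_laplacian_spectra` is not defined in this file. A minimal sketch
# under the assumption that it eigendecomposes the unnormalised graph
# Laplacian L = D - W and returns eigenvectors indexable as v[k] (matching
# how v is indexed elsewhere in this file):
def graph_laplacian_spectra(adj):
    L = np.diag(np.sum(adj, axis=1)) - adj
    u, v = np.linalg.eigh(L)  # eigenvalues in ascending order
    return u, v.T  # v[k] is the k-th eigenvector
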
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)
    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)
    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve the MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0], v_star[1, 0], c='m', alpha=0.5, marker='x', label='mdp')
    plt.scatter(v_u_star[0, 0], v_u_star[1, 0], c='g', alpha=0.5, marker='x', label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')

    plt.legend()
    plt.show()

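# `lmdps.lmdp_solver` is not shown here. A minimal sketch of one standard
# approach (an assumption, consistent with the linearised Bellman operator
# sketched above): iterate z to a fixed point, then the optimal control is
# u(s' | s) ∝ p(s' | s) * z(s')**discount, with v = log(z).
def lmdp_solver(p, q, discount, tol=1e-8, max_iters=10000):
    q = q.reshape(-1)
    z = np.ones_like(q)
    for _ in range(max_iters):
        z_new = np.exp(q) * np.dot(p.T, z**discount)
        if np.max(np.abs(z_new - z)) < tol:
            break
        z = z_new
    u = p * (z**discount)[:, None]            # tilt the passive dynamics by z
    u = u / np.sum(u, axis=0, keepdims=True)  # renormalise over next states
    return u, np.log(z)
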
def emp_est_snr_map():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(5)
    vals = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    variances = []
    hs = []
    for i, pi in enumerate(pis):
        print('\r{}'.format(i), end='', flush=True)
        variances.append(est_var_R(mdp, pi))
        hs.append(utils.entropy(pi))

    plt.subplot(2, 1, 1)
    plt.scatter(vals[:, 0], vals[:, 1], c=hs)
    plt.subplot(2, 1, 2)
    plt.scatter(vals[:, 0], vals[:, 1], c=variances)
    # plt.subplot(3, 1, 1)
    # plt.scatter(vals[:, 0], vals[:, 0], c=hs)
    plt.show()

def hyperbolic_polytope():
    # hyperbolic discounting as a mixture of exponential discounts,
    # https://arxiv.org/abs/1902.06865
    n_states, n_actions = 2, 2
    N = 21
    pis = utils.gen_grid_policies(N)
    mdp = utils.build_random_mdp(n_states, n_actions, None)

    # average the (normalised) polytopes over a range of discounts
    n = 10
    discounts = np.linspace(0.1, 1 - 1e-4, n)
    Vs = []
    for discount in discounts:
        Vs.append((1 - discount) * utils.polytope(mdp.P, mdp.r, discount, pis))
    h_V = sum(Vs) / n

    plt.subplot(2, 1, 1)
    plt.scatter(h_V[:, 0], h_V[:, 1])

    plt.subplot(2, 1, 2)
    V = (1 - 0.9) * utils.polytope(mdp.P, mdp.r, 0.9, pis)
    plt.scatter(V[:, 0], V[:, 1])

    plt.show()

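# Why averaging over discounts gives a hyperbolic curve: for gamma uniform
# on [0, 1), int_0^1 gamma^t dgamma = 1 / (1 + t), which is exactly the
# hyperbolic discount. A quick numerical check of that identity (the helper
# name is ours, added for illustration):
def check_hyperbolic_identity(t=5, n=100000):
    gammas = np.linspace(0, 1 - 1e-6, n)
    return np.mean(gammas**t), 1 / (1 + t)  # both ~0.1667 for t = 5
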
def plot():
    n_states = 2
    n_actions = 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    value = vmap(lambda pi: utils.value_functional(mdp.P, mdp.r, pi, mdp.discount))

    pis = np.stack(utils.gen_grid_policies(101), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10)

    pis = np.stack(utils.get_deterministic_policies(2, 2), axis=0)
    vs = value(pis)
    plt.scatter(vs[:, 0], vs[:, 1], s=10, c='r')

    plt.xlabel('The value of state 1')
    plt.ylabel('The value of state 2')
    plt.title('The value polytope')
    plt.show()

def value_graph_laplacians():
    n_states = 8
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    N = len(det_pis)
    print('n pis: {}'.format(N))

    for i in range(1):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
        values = [utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
                  for pi in det_pis]
        Vs = np.stack(values).reshape((N, n_states))
        A = graph.mdp_topology(det_pis)

        W = np.exp(-np.linalg.norm(Vs[None, :, :] - Vs[:, None, :], ord=np.inf, axis=-1) + 1e-8)
        # mVs = np.mean(Vs, axis=0)  # n_states
        # W = np.dot((Vs - mVs), (Vs - mVs).T)
        adj = W * A

        G = nx.from_numpy_array(adj)
        pos = nx.spectral_layout(G)  # , iterations=500)

        plt.figure(figsize=(16, 16))
        nx.draw(G, pos, node_color=[np.sum(v) for v in values], node_size=150)
        plt.savefig('figs/value_graphs/{}-value_graph-{}-{}.png'.format(i, n_states, n_actions))
        plt.close()

        u, v = graph_laplacian_spectra(adj)

        plt.figure(figsize=(8, 8))
        plt.bar(range(len(u)), u)
        plt.savefig('figs/value_graphs/{}-lap.png'.format(i))
        plt.close()

        plt.figure(figsize=(16, 16))
        n = 5
        for j in range(n * n):
            plt.subplot(n, n, j + 1)
            nx.draw(G, pos, node_color=u[10 * j] * v[10 * j], node_size=150)
        plt.savefig('figs/value_graphs/{}-spectra.png'.format(i))
        plt.close()

def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve the MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs, with p set to the random dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P,
                      np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate the policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r, pi_u_star_random, mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()

def mdp_lmdp_optimality():
    n_states, n_actions = 2, 2
    n = 5

    plt.figure(figsize=(8, 16))
    plt.title('Optimal control (LMDP) vs optimal policy (MDP)')
    for i in range(n):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)

        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
        P_pi_star = np.einsum('ijk,jk->ij', mdp.P, pi_star)

        plt.subplot(n, 2, 2 * i + 1)
        plt.imshow(u)
        plt.subplot(n, 2, 2 * i + 2)
        plt.imshow(P_pi_star)

    plt.savefig('figs/lmdp_mdp_optimal_dynamics.png')
    plt.show()

def generate_snr_map():
    n_states, n_actions = 2, 3
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    # pis = utils.gen_grid_policies(11)
    pis = [utils.random_policy(n_states, n_actions) for _ in range(512)]
    Vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    mags = [grad_mag(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    uncert = [variance(mdp.P, mdp.r, pi, mdp.discount) for pi in pis]
    snr = [s / n for s, n in zip(mags, uncert)]

    plt.subplot(3, 1, 1)
    plt.title('Magnitude')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=mags)
    plt.subplot(3, 1, 2)
    plt.title('Variance')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=uncert)
    plt.subplot(3, 1, 3)
    plt.title('SNR')
    plt.scatter(Vs[:, 0], Vs[:, 1], c=snr)
    plt.show()

def test_density():
    mdp = utils.build_random_mdp(2, 2, 0.9)
    pi = utils.softmax(rnd.standard_normal((2, 2)), axis=1)
    p_V = density_value_functional(0.1, mdp.P, mdp.r, pi, 0.9)
    print(p_V)

        v_star = traj[-1]
        trajs.append(traj)
    return trajs


class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


if __name__ == '__main__':
    rnd.seed(42)
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = utils.gen_grid_policies(4)
    use_momentum = False

    fname = 'test1.json'
    with open(fname, 'w') as f:
        for lr in np.logspace(-2, -1, 2):
            traj = value_iteration(mdp, pis, lr)
            data = {
                '{}-{}-{}'.format(value_iteration.__name__, lr, use_momentum):
                    [np.array(t).tolist() for t in traj]
            }
            s = json.dumps(data, cls=NumpyEncoder)
            f.write(s + '\n')

    # pool = multiprocessing.Pool(n**2)
    # # couldn't serialise the mdp collection, so just unwrap them here.
    # lens_n_pi_stars = pool.map(iteration_fn, [(mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr) for lr in lrs])
    # for i, lr, results in zip(range(n**2), lrs, lens_n_pi_stars):
    #     lens, pi_star = results
    for i, lr in enumerate(lrs):
        print('\n{}: {}\n'.format(i, lr))
        lens, pi_stars = iteration_fn((mdp.P, mdp.r, mdp.discount, mdp.d0, pis, lr))

        plt.subplot(n, n, i + 1)
        plt.title('Learning rate: {}'.format(lr))
        fig = plt.scatter(Vs[:, 0], Vs[:, 1], c=lens, s=5)
        fig.axes.get_xaxis().set_visible(False)
        fig.axes.get_yaxis().set_visible(False)

    plt.tight_layout()
    plt.savefig('figs/iteration-lrs/0-{}.png'.format(name))


if __name__ == '__main__':
    rnd.seed(41)
    n_states, n_actions = 2, 2
    mdps = [utils.build_random_mdp(n_states, n_actions, 0.5) for _ in range(5)]
    pis = utils.gen_grid_policies(31)

    for i, mdp in enumerate(mdps):
        print('\nMDP {}\n'.format(i))
        generate_iteration_figures(mdp, pis, param_policy_gradient, str(i))

import numpy as np
import numpy.random as rnd

import mdp.utils as utils
from mdp.search_spaces import *


def clip_solver_traj(traj):
    # drop the final step if the solver has already converged
    if np.isclose(traj[-1], traj[-2], 1e-8).all():
        return traj[:-1]
    else:
        return traj


mdp = utils.build_random_mdp(2, 2, 0.5)
init = utils.softmax(rnd.standard_normal((mdp.S, mdp.A)), axis=1)
pi_traj = clip_solver_traj(utils.solve(policy_iteration(mdp), init))
print(pi_traj)