def graph_PI():
    n_states = 10
    n_actions = 2

    det_pis = utils.get_deterministic_policies(n_states, n_actions)
    print('n pis: {}'.format(len(det_pis)))
    mdp = utils.build_random_sparse_mdp(n_states, n_actions, 0.5)

    A = graph.mdp_topology(det_pis)
    G = nx.from_numpy_array(A)
    pos = nx.spring_layout(G, iterations=200)

    basis = graph.construct_mdp_basis(det_pis, mdp)

    init_pi = utils.softmax(np.random.standard_normal((n_states, n_actions)))
    init_v = utils.value_functional(mdp.P, mdp.r, init_pi, mdp.discount).squeeze()
    a = graph.sparse_coeffs(basis, init_v, lr=0.1)

    pis = utils.solve(search_spaces.policy_iteration(mdp), init_pi)
    print("\n{} policies to vis".format(len(pis)))

    for i, pi in enumerate(pis[:-1]):
        print('Iteration: {}'.format(i))
        v = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount).squeeze()
        a = graph.sparse_coeffs(basis, v, lr=0.1, a_init=a)

        plt.figure(figsize=(16, 16))
        nx.draw(G, pos, node_color=a, node_size=150)
        # plt.show()
        plt.savefig('figs/pi_graphs/{}.png'.format(i))
        plt.close()
def generate_pi(mdp, c):
    init_pi = utils.random_policy(mdp.S, mdp.A)
    pis = utils.solve(ss.policy_iteration(mdp), init_pi)
    vs = np.stack([utils.value_functional(mdp.P, mdp.r, pi, mdp.discount) for pi in pis])[:, :, 0]
    n = vs.shape[0]

    plt.scatter(vs[0, 0], vs[0, 1], c=c, s=30, label='{}'.format(n-2))
    plt.scatter(vs[1:-1, 0], vs[1:-1, 1], c=range(n-2), cmap='viridis', s=10)
    plt.scatter(vs[-1, 0], vs[-1, 1], c='m', marker='x')

    for i in range(len(vs)-2):
        dv = 0.1 * (vs[i+1, :] - vs[i, :])
        plt.arrow(vs[i, 0], vs[i, 1], dv[0], dv[1], color=c, alpha=0.5, width=0.005)
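# Possible driver for generate_pi (a sketch, not from the original file): overlay the
# policy-iteration value trajectories of several random initial policies on the value
# polytope of a single small MDP. It only reuses helpers already called elsewhere in
# this repo (utils.build_random_mdp, utils.gen_grid_policies, utils.polytope); the
# colour cycle and figure size are arbitrary choices.
def plot_pi_trajectories(n_runs=5):
    mdp = utils.build_random_mdp(2, 2, 0.9)
    background_pis = utils.gen_grid_policies(7)
    background_vs = utils.polytope(mdp.P, mdp.r, mdp.discount, background_pis)

    plt.figure(figsize=(8, 8))
    plt.scatter(background_vs[:, 0], background_vs[:, 1], s=5, alpha=0.25)  # value polytope
    for c in ['b', 'g', 'r', 'c', 'y'][:n_runs]:
        generate_pi(mdp, c)  # one PI trajectory per colour
    plt.legend()
    plt.show()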
def policy_iteration(mdp, pis):
    # pi_star = utils.solve(ss.policy_iteration(mdp), pis[0])[-1]
    lens, pi_stars = [], []
    for pi in pis:
        pi_traj = clip_solver_traj(utils.solve(ss.policy_iteration(mdp), pi))
        pi_star = pi_traj[-1]
        pi_stars.append(pi_star)
        lens.append(len(pi_traj))
    return lens, pi_stars
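# clip_solver_traj is called above but not defined in this snippet. A minimal sketch of
# the assumed behaviour (the real helper may live elsewhere in the repo): truncate a
# solver trajectory at the first point where the iterates stop changing, so that
# len(pi_traj) counts the distinct policies visited. The tolerance is arbitrary.
def clip_solver_traj(traj, tol=1e-8):
    clipped = [traj[0]]
    for x in traj[1:]:
        if np.max(np.abs(np.asarray(x) - np.asarray(clipped[-1]))) < tol:
            break  # converged: drop the repeated tail
        clipped.append(x)
    return clipped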
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01

    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0)  # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount), mdp.discount)
    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :], axis=-1) - np.max(Q_star[None, :, :], axis=-1))  # |S| x |S|. preserves optimal action's value

    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]

    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same so we probs wont see any difference...')

    print('\nAbstraction:', optimal_f.shape)

    truth = abs.PI(init, mdp, np.eye(mdp.S))
    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
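# Possible usage of onoffpolicy_abstraction (a sketch, not from the original file):
# build a random MDP, sample a few random policies, and print the per-solver errors it
# returns as (mdp_name, solver_name, error) tuples. The sizes here are arbitrary.
def run_abstraction_experiment(n_states=32, n_actions=2, n_pis=8):
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    pis = [utils.random_policy(mdp.S, mdp.A) for _ in range(n_pis)]
    for name, solver, err in onoffpolicy_abstraction(mdp, pis):
        print('{:<24} {:<8} {:.4f}'.format(name, solver, err))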
def compare_mdp_lmdp():
    n_states, n_actions = 2, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.9)

    pis = utils.gen_grid_policies(7)
    vs = utils.polytope(mdp.P, mdp.r, mdp.discount, pis)

    plt.figure(figsize=(16, 16))
    plt.scatter(vs[:, 0], vs[:, 1], s=10, alpha=0.75)

    # solve via LMDPs
    p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
    u, v = lmdps.lmdp_solver(p, q, mdp.discount)
    pi_u_star = lmdps.lmdp_decoder(u, mdp.P)
    pi_p = lmdps.lmdp_decoder(p, mdp.P)

    # solve MDP
    init = np.random.standard_normal((n_states, n_actions))
    pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
    # pi_star = onehot(np.argmax(qs, axis=1), n_actions)

    # evaluate both policies.
    v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
    v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
    v_p = utils.value_functional(mdp.P, mdp.r, pi_p, mdp.discount)

    plt.scatter(v_star[0, 0], v_star[1, 0], c='m', alpha=0.5, marker='x', label='mdp')
    plt.scatter(v_u_star[0, 0], v_u_star[1, 0], c='g', alpha=0.5, marker='x', label='lmdp')
    plt.scatter(v_p[0, 0], v_p[1, 0], c='k', marker='x', alpha=0.5, label='p')

    plt.legend()
    plt.show()
def compare_acc():
    n_states, n_actions = 2, 2

    lmdp = []
    lmdp_rnd = []
    for _ in range(10):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star = lmdps.lmdp_decoder(u, mdp.P)

        # solve MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]

        # solve via LMDPs
        # with p set to the random dynamics
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        p = np.einsum('ijk,jk->ij', mdp.P, np.ones((n_states, n_actions)) / n_actions)
        # q = np.max(mdp.r, axis=1, keepdims=True)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)
        pi_u_star_random = lmdps.lmdp_decoder(u, mdp.P)

        # evaluate both policies.
        v_star = utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount)
        v_u_star = utils.value_functional(mdp.P, mdp.r, pi_u_star, mdp.discount)
        v_u_star_random = utils.value_functional(mdp.P, mdp.r, pi_u_star_random, mdp.discount)

        lmdp.append(np.isclose(v_star, v_u_star, 1e-3).all())
        lmdp_rnd.append(np.isclose(v_star, v_u_star_random, 1e-3).all())

    print([np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.bar(range(2), [np.sum(lmdp), np.sum(lmdp_rnd)])
    plt.show()
def mdp_lmdp_optimality():
    n_states, n_actions = 2, 2
    n = 5

    plt.figure(figsize=(8, 16))
    plt.title('Optimal control (LMDP) vs optimal policy (MDP)')
    for i in range(n):
        mdp = utils.build_random_mdp(n_states, n_actions, 0.5)

        # solve via LMDPs
        p, q = lmdps.mdp_encoder(mdp.P, mdp.r)
        u, v = lmdps.lmdp_solver(p, q, mdp.discount)

        # solve MDP
        init = np.random.standard_normal((n_states, n_actions))
        pi_star = utils.solve(search_spaces.policy_iteration(mdp), init)[-1]
        P_pi_star = np.einsum('ijk,jk->ij', mdp.P, pi_star)

        plt.subplot(n, 2, 2*i + 1)
        plt.imshow(u)
        plt.subplot(n, 2, 2*i + 2)
        plt.imshow(P_pi_star)

    plt.savefig('figs/lmdp_mdp_optimal_dynamics.png')
    plt.show()
def PI(init, M, f):
    pi_star = utils.solve(ss.policy_iteration(M), np.log(init))[-1]
    return utils.value_functional(M.P, M.r, np.dot(f.T, pi_star), M.discount)
def VI(V_init, M, f):  # signature assumed to match the other solvers above
    V_star = utils.solve(ss.value_iteration(M, 0.01), V_init)[-1]
    # lift
    return np.dot(f.T, V_star)

if __name__ == '__main__':
    tol = 0.01
    n_states, n_actions = 512, 2

    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount), mdp.discount)
    similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|

    optimal_idx, optimal_abstracted_mdp, optimal_f = build_state_abstraction(similar_states, mdp, tol)

    truth = PI(init, mdp, np.eye(mdp.S))
    approx = Q(init[optimal_idx], optimal_abstracted_mdp, optimal_f)
    # assumed intent of the truncated print: report the worst-case error of the abstraction
    print(np.max(np.abs(truth - approx)))