def Q(init, M, f):
    # solve: evaluate the initial policy, then run Q-learning to its fixed point
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    Q_init = utils.bellman_operator(M.P, M.r, V_init, M.discount)
    Q_star = utils.solve(ss.q_learning(M, 0.01), Q_init)[-1]
    # lift: f is |abstract S| x |S|, so f.T maps abstract values back to ground states
    return np.dot(f.T, np.max(Q_star, axis=1, keepdims=True))
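# abs.SARSA and abs.VI (used in the solvers list in onoffpolicy_abstraction
# below) follow the same solve-then-lift pattern as Q above. A minimal sketch
# of the VI variant, assuming utils.solve simply iterates the given operator
# to convergence (VI_sketch is a hypothetical name, not part of the repo):
def VI_sketch(init, M, f):
    # solve: iterate the value-iteration backup to its fixed point
    V_init = utils.value_functional(M.P, M.r, init, M.discount)
    T = lambda V: np.max(utils.bellman_operator(M.P, M.r, V, M.discount), axis=1, keepdims=True)
    V_star = utils.solve(T, V_init)[-1]
    # lift back to the ground state space
    return np.dot(f.T, V_star)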
def onoffpolicy_abstraction(mdp, pis):
    tol = 0.01
    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    # ### all policy abstraction
    # # n x |S| x |A|
    # Qs = np.stack([utils.bellman_operator(mdp.P, mdp.r, utils.value_functional(mdp.P, mdp.r, pi, mdp.discount), mdp.discount) for pi in pis], axis=0)
    # similar_states = np.sum(np.sum(np.abs(Qs[:, :, None, :] - Qs[:, None, :, :]), axis=3), axis=0)  # |S| x |S|
    # all_idx, all_abstracted_mdp, all_f = abs.build_state_abstraction(similar_states, mdp)

    ### optimal policy abstraction
    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)
    # similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|. preserves optimal policy's value (for all actions)
    # similar_states = np.abs(np.max(Q_star[:, None, :], axis=-1) - np.max(Q_star[None, :, :], axis=-1))  # |S| x |S|. preserves optimal action's value
    V = utils.value_functional(mdp.P, mdp.r, init, mdp.discount)
    similar_states = np.abs(V[None, :, :] - V[:, None, :])[:, :, 0]
    optimal_idx, optimal_abstracted_mdp, optimal_f = abs.build_state_abstraction(
        similar_states, mdp, tol)

    mdps = [mdp, optimal_abstracted_mdp]
    names = ['ground', 'optimal_abstracted_mdp']
    solvers = [abs.Q, abs.SARSA, abs.VI]
    lifts = [np.eye(mdp.S), optimal_f]
    idxs = [range(mdp.S), optimal_idx]

    # if all_f.shape[0] == optimal_f.shape[0]:
    #     raise ValueError('Abstractions are the same, so we probably will not see any difference...')

    print('\nAbstraction:', optimal_f.shape)
    truth = abs.PI(init, mdp, np.eye(mdp.S))
    results = []
    for n, M, idx, f in zip(names, mdps, idxs, lifts):
        for solve in solvers:
            err = np.max(np.abs(truth - solve(init[idx, :], M, f)))
            results.append((n, solve.__name__, err))
    return results
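# A hedged usage sketch for the experiment above (the sizes are illustrative;
# pis is only consumed by the commented-out all-policy variant):
#
#   mdp = utils.build_random_mdp(32, 2, 0.5)
#   pis = [utils.random_policy(mdp.S, mdp.A) for _ in range(8)]
#   for name, solver, err in onoffpolicy_abstraction(mdp, pis):
#       print('{} / {}: max |truth - approx| = {:.4f}'.format(name, solver, err))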
# left
P[:, :, 3] = np.array([
    [1, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 1],
    [0, 0, 0, 0, 0, 0],
])

# rewards. 6 x 4
r = np.array([
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
    [1, 0, 0, 0],  # rewarded for going up at the finish.
    [1, 0, 0, 0],
])

# initial distribution
d0 = np.array([[0.5, 0.5, 0, 0, 0, 0]])

pi = np.array(utils.random_policy(6, 4))
pi[[0, 2, 4]] = pi[[1, 3, 5]]

V = utils.value_functional(P, r, pi, 0.5)
Q_t = utils.bellman_operator(P, r, V, 0.5)
# print(np.sum(P, axis=-1))
print(Q_t)
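# The policy tying above (pi[[0, 2, 4]] = pi[[1, 3, 5]]) pairs states
# (0, 1), (2, 3), (4, 5). A quick, hypothetical check that the pairing is
# consistent with the printed Bellman values, i.e. that tied states get
# (near-)identical rows of Q_t:
def check_pairs(Q_t, pairs=((0, 1), (2, 3), (4, 5)), tol=1e-6):
    # True iff every pair of tied states has matching action values
    return all(np.max(np.abs(Q_t[i] - Q_t[j])) < tol for i, j in pairs)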
def T(V):
    # V: the current value estimate, used as the similarity measure
    assert V.shape[1] == 1
    return np.max(utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount), axis=1, keepdims=True)
def T(V):
    assert V.shape[1] == 1
    # no max here: returns the full (|S|, |A|) action-value table
    return utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
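# The first T above is the value-iteration backup, TV = max_a (r + gamma * P V),
# a gamma-contraction whose fixed point is V*. A minimal fixed-point loop for
# it (utils.solve plays this role elsewhere in the repo; tol and max_iters are
# illustrative):
def fixed_point(T, V, tol=1e-8, max_iters=100000):
    # iterate V <- T(V) until successive iterates agree to within tol
    for _ in range(max_iters):
        V_next = T(V)
        if np.max(np.abs(V_next - V)) < tol:
            return V_next
        V = V_next
    return V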
def q_learning(mdp, lr):
    pi = lambda Q: utils.onehot(np.argmax(Q, axis=1), Q.shape[1])  # greedy
    # off-policy bootstrap with max_a Q; keepdims keeps the (|S|, 1) shape
    # that bellman_operator expects for its value argument
    T = lambda Q: utils.bellman_operator(mdp.P, mdp.r, np.max(Q, axis=1, keepdims=True), mdp.discount)
    U = lambda Q: Q + lr * (T(Q) - Q)  # damped update towards the backup
    return jit(U)
def sarsa(mdp, lr):
    pi = lambda Q: utils.onehot(np.argmax(Q, axis=1), Q.shape[1])  # greedy
    # on-policy bootstrap: the value of the greedy policy, reshaped to (|S|, 1)
    T = lambda Q: utils.bellman_operator(mdp.P, mdp.r, np.einsum('jk,jk->j', Q, pi(Q))[:, None], mdp.discount)
    U = lambda Q: Q + lr * (T(Q) - Q)
    return jit(U)
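# Both constructors return a smoothed update operator U rather than a sampled
# update: q_learning bootstraps with max_a Q (off-policy), sarsa with the
# value of its own greedy policy (on-policy). Hedged usage, following the
# solve pattern used in Q() above:
#
#   Q_star = utils.solve(q_learning(mdp, 0.01), Q_init)[-1]
#   Q_pi = utils.solve(sarsa(mdp, 0.01), Q_init)[-1]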
def update_fn(cores):
    # evaluate the softmax policy built from the tensor cores
    V = utils.value_functional(mdp.P, mdp.r, utils.softmax(build(cores), axis=1), mdp.discount)
    Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
    A = Q - V  # advantage
    # policy gradient: contract the advantage with the log-policy Jacobian per core
    grads = [np.einsum('ijkl,ij->kl', d, A) for d in dlogpi_dw(cores)]
    # ascend, with gradient-norm clipping and a small entropy bonus
    return [c + lr * utils.clip_by_norm(g, 100) + 1e-6 * dH
            for c, g, dH in zip(cores, grads, dHdw(cores))]
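# A hedged usage sketch: iterate the update to (approximate) convergence and
# read the resulting policy off the cores (this assumes utils.solve accepts
# the list of cores as its iterate, as the other operators here accept arrays):
#
#   cores_star = utils.solve(update_fn, cores_init)[-1]
#   pi_star = utils.softmax(build(cores_star), axis=1)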
def update_fn(pi):
    V = utils.value_functional(mdp.P, mdp.r, pi, mdp.discount)  # exact evaluation
    Q = utils.bellman_operator(mdp.P, mdp.r, V, mdp.discount)
    return utils.onehot(np.argmax(Q, axis=1), mdp.A)  # greedy update
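# This is one sweep of policy iteration: evaluate pi exactly, back up once to
# get Q, improve greedily. Its fixed points are the optimal policies, which is
# how it is used elsewhere in this repo:
#
#   pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]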
    return np.dot(f.T, V_star)


if __name__ == '__main__':
    tol = 0.01
    n_states, n_actions = 512, 2
    mdp = utils.build_random_mdp(n_states, n_actions, 0.5)
    init = np.random.random((mdp.S, mdp.A))
    init = init / np.sum(init, axis=1, keepdims=True)

    pi_star = utils.solve(ss.policy_iteration(mdp), np.log(init))[-1]
    Q_star = utils.bellman_operator(
        mdp.P, mdp.r,
        utils.value_functional(mdp.P, mdp.r, pi_star, mdp.discount),
        mdp.discount)
    similar_states = np.sum(np.abs(Q_star[:, None, :] - Q_star[None, :, :]), axis=-1)  # |S| x |S|

    optimal_idx, optimal_abstracted_mdp, optimal_f = build_state_abstraction(
        similar_states, mdp, tol)
    truth = PI(init, mdp, np.eye(mdp.S))
    approx = Q(init[optimal_idx], optimal_abstracted_mdp, optimal_f)
    print('\n', 'bound >= V* - V', '\n', '{} >= {}'.format(
        2 * tol / (1 - mdp.discount)**2, np.max(np.abs(truth - approx))))
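# The printed bound is the standard approximate state-abstraction guarantee:
# aggregating states whose optimal action values differ by at most tol costs
# at most 2 * tol / (1 - gamma)^2 in value at every state (cf. Abel et al.,
# 2016, "Near Optimal Behavior via Approximate State Abstraction").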