def sample_states(env, q_fn, visitation_probs, n_sample, ent_wt):
    """Sample (s, a, s') transitions according to the state-action visitation distribution."""
    dS, dA = visitation_probs.shape
    # Draw flat (state, action) indices with probability given by the visitation.
    samples = np.random.choice(np.arange(dS * dA), size=n_sample,
                               p=visitation_probs.reshape(dS * dA))
    policy = get_policy(q_fn, ent_wt=ent_wt)
    observations = samples // dA
    actions = samples % dA
    a_logprobs = np.log(policy[observations, actions])

    # Sample a successor state for each (s, a) pair from the tabular transition model.
    observations_next = []
    for i in range(n_sample):
        t_distr = env.tabular_trans_distr(observations[i], actions[i])
        next_state = flat_to_one_hot(np.random.choice(np.arange(len(t_distr)), p=t_distr), ndim=dS)
        observations_next.append(next_state)
    observations_next = np.array(observations_next)

    return {
        'observations': flat_to_one_hot(observations, ndim=dS),
        'actions': flat_to_one_hot(actions, ndim=dA),
        'a_logprobs': a_logprobs,
        'observations_next': observations_next
    }
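# Hedged usage sketch (not from the original code): once env, true_q, and
# true_sa_visits are built as in the script further below, a small batch of
# one-hot transitions could be drawn like this:
#
#   batch = sample_states(env, true_q, true_sa_visits, n_sample=32, ent_wt=1.0)
#   batch['observations'].shape        # (32, dS) one-hot states
#   batch['actions'].shape             # (32, dA) one-hot actions
#   batch['observations_next'].shape   # (32, dS) one-hot successor states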
def direct_method(env, ent_wt, true_sa_visits, discount, dim_obs, reward, transition, state_only):
    # This version of the direct method solves a system of linear equations (as in slide 12):
    # recover a reward with MaxEnt IRL, then evaluate the greedy IRL policy exactly.
    learned_rew, learned_q = tabular_maxent_irl(env, true_sa_visits, lr=0.01, num_itrs=1000,
                                                ent_wt=ent_wt, state_only=state_only,
                                                discount=discount)
    learned_policy = get_policy(learned_q, ent_wt=ent_wt)
    policy_irl = np.argmax(learned_policy, axis=1)
    # Reward vector and transition matrix induced by the (deterministic) IRL policy.
    reward_array_irl = np.array([reward[k, policy_irl[k]] for k in range(dim_obs)])
    transition_matrix = np.array([transition[j, policy_irl[j]] for j in range(dim_obs)])
    # Solve (I - discount * P_pi) V = r_pi for the value function.
    A = np.identity(dim_obs) - discount * transition_matrix
    b = reward_array_irl
    V_irl = np.linalg.solve(A, b)
    return V_irl
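# Illustrative sketch (hypothetical helper, not part of the original code): the
# np.linalg.solve step above is exact tabular policy evaluation. Pulling it out
# makes it easy to also evaluate another deterministic policy under the true
# reward and compare it with V_irl. Assumes reward is S x A and transition is
# S x A x S, as in direct_method.
def evaluate_policy_tabular(policy_det, reward, transition, discount):
    """Solve V = r_pi + discount * P_pi V for a deterministic policy (one action index per state)."""
    n_states = reward.shape[0]
    r_pi = np.array([reward[s, policy_det[s]] for s in range(n_states)])      # (S,)
    P_pi = np.array([transition[s, policy_det[s]] for s in range(n_states)])  # (S, S)
    return np.linalg.solve(np.identity(n_states) - discount * P_pi, r_pi)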
def compute_visitation(env, q_fn, ent_wt=1.0, T=50, discount=1.0):
    pol_probs = get_policy(q_fn, ent_wt=ent_wt)
    dim_obs = env.observation_space.flat_dim
    dim_act = env.action_space.flat_dim
    state_visitation = np.expand_dims(env.initial_state_distribution, axis=1)
    t_matrix = env.transition_matrix  # S x A x S
    sa_visit_t = np.zeros((dim_obs, dim_act, T))
    for i in range(T):
        sa_visit = state_visitation * pol_probs
        sa_visit_t[:, :, i] = sa_visit  # (discount**i) * sa_visit
        # sum-out (SA)S
        new_state_visitation = np.einsum('ij,ijk->k', sa_visit, t_matrix)
        state_visitation = np.expand_dims(new_state_visitation, axis=1)
    return np.sum(sa_visit_t, axis=2) / float(T)
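# Hedged sanity check (sketch, not from the original code): the initial state
# distribution and every row of the transition matrix are normalized, so the
# time-averaged state-action visitation returned above should sum to ~1:
#
#   visits = compute_visitation(env, true_q, ent_wt=1.0, T=50)
#   assert np.isclose(np.sum(visits), 1.0)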
import numpy as np
from simple_env import random_env

np.set_printoptions(suppress=True)

# Environment parameters
env = random_env(16, 4, seed=1, terminate=False, t_sparsity=0.8)
dS = env.spec.observation_space.flat_dim
dU = env.spec.action_space.flat_dim
dO = 8
ent_wt = 1.0
discount = 0.9
obs_matrix = np.random.randn(dS, dO)

# Compute optimal policy for double-checking
true_q = q_iteration(env, K=150, ent_wt=ent_wt, gamma=discount)
true_sa_visits = compute_visitation(env, true_q, ent_wt=ent_wt, T=5, discount=discount)
expert_pol = get_policy(true_q, ent_wt=ent_wt)

# Run MaxEnt IRL (state-only reward)
learned_rew, learned_q = tabular_maxent_irl(env, true_sa_visits, lr=0.01, num_itrs=1000,
                                            ent_wt=ent_wt, state_only=True, discount=discount)
learned_pol = get_policy(learned_q, ent_wt=ent_wt)

# Normalize the reward (with state_only=True, the reward is only recovered up to a constant)
adjusted_rew = learned_rew - np.mean(learned_rew) + np.mean(env.rew_matrix)
diff_rew = np.abs(env.rew_matrix - adjusted_rew)
diff_pol = np.abs(expert_pol - learned_pol)

print('----- Results State Only -----')
print('InfNormRewError', np.max(diff_rew))
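# Hedged follow-up sketch (not from the original code): compare the value of the
# IRL policy, as computed by the linear-system direct method defined above, against
# exact evaluation of the expert's greedy policy under the true reward. Assumes
# env.rew_matrix is the S x A true reward table and env.transition_matrix is
# S x A x S, as used elsewhere in this script.
V_irl = direct_method(env, ent_wt, true_sa_visits, discount, dS,
                      env.rew_matrix, env.transition_matrix, state_only=True)
pol_expert = np.argmax(expert_pol, axis=1)
r_pi = np.array([env.rew_matrix[s, pol_expert[s]] for s in range(dS)])
P_pi = np.array([env.transition_matrix[s, pol_expert[s]] for s in range(dS)])
V_expert = np.linalg.solve(np.identity(dS) - discount * P_pi, r_pi)
print('InfNormValueError', np.max(np.abs(V_expert - V_irl)))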