Example no. 1
def sample_states(env, q_fn, visitation_probs, n_sample, ent_wt):
    # Draw (state, action) pairs according to the given visitation distribution.
    dS, dA = visitation_probs.shape
    samples = np.random.choice(np.arange(dS * dA),
                               size=n_sample,
                               p=visitation_probs.reshape(dS * dA))
    policy = get_policy(q_fn, ent_wt=ent_wt)
    # Recover state and action indices from the flattened sample indices.
    observations = samples // dA
    actions = samples % dA
    a_logprobs = np.log(policy[observations, actions])

    # Sample a successor state for each (s, a) pair from the tabular transition model.
    observations_next = []
    for i in range(n_sample):
        t_distr = env.tabular_trans_distr(observations[i], actions[i])
        next_state = flat_to_one_hot(np.random.choice(np.arange(len(t_distr)),
                                                      p=t_distr),
                                     ndim=dS)
        observations_next.append(next_state)
    observations_next = np.array(observations_next)

    return {
        'observations': flat_to_one_hot(observations, ndim=dS),
        'actions': flat_to_one_hot(actions, ndim=dA),
        'a_logprobs': a_logprobs,
        'observations_next': observations_next
    }
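A minimal usage sketch (not part of the original listing), assuming the helpers from the other examples (random_env, q_iteration, compute_visitation) are in scope and numpy is imported as np; the batch size of 32 is arbitrary, and the shapes in the comments assume flat_to_one_hot vectorizes over an array of indices.

from simple_env import random_env

env = random_env(16, 4, seed=1, terminate=False, t_sparsity=0.8)
true_q = q_iteration(env, K=150, ent_wt=1.0, gamma=0.9)
sa_visits = compute_visitation(env, true_q, ent_wt=1.0, T=5, discount=0.9)
batch = sample_states(env, true_q, sa_visits, n_sample=32, ent_wt=1.0)
print(batch['observations'].shape)   # expected (32, dS): one-hot encoded states
print(batch['a_logprobs'].shape)     # expected (32,): log pi(a|s) of the sampled actions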
Example no. 2
def direct_method(env, ent_wt, true_sa_visits, discount, dim_obs, reward, transition, state_only):
    # This version of the direct method solves a system of linear equations (as in slide 12).
    learned_rew, learned_q = tabular_maxent_irl(env, true_sa_visits, lr=0.01, num_itrs=1000,
                                                ent_wt=ent_wt, state_only=state_only,
                                                discount=discount)
    learned_policy = get_policy(learned_q, ent_wt=ent_wt)
    policy_irl = np.argmax(learned_policy, axis=1)
    # Reward vector and transition matrix induced by the greedy IRL policy.
    reward_array_irl = np.array([reward[k, policy_irl[k]] for k in range(dim_obs)])
    transition_matrix = np.array([transition[j, policy_irl[j]] for j in range(dim_obs)])

    # Policy evaluation: solve (I - discount * P_pi) V = r_pi.
    A = np.identity(dim_obs) - discount * transition_matrix
    b = reward_array_irl
    V_irl = np.linalg.solve(A, b)

    return V_irl
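To make the linear system explicit, here is a self-contained toy illustration of the same policy-evaluation solve, with made-up numbers for a 2-state MDP (not taken from the original):

import numpy as np

discount = 0.9
P_pi = np.array([[0.8, 0.2],   # transition rows under the evaluated policy
                 [0.1, 0.9]])
r_pi = np.array([1.0, 0.0])    # reward collected in each state under that policy
V = np.linalg.solve(np.identity(2) - discount * P_pi, r_pi)
# The solution satisfies the Bellman equation V = r_pi + discount * P_pi @ V.
assert np.allclose(V, r_pi + discount * P_pi @ V)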
Example no. 3
def compute_visitation(env, q_fn, ent_wt=1.0, T=50, discount=1.0):
    pol_probs = get_policy(q_fn, ent_wt=ent_wt)

    dim_obs = env.observation_space.flat_dim
    dim_act = env.action_space.flat_dim
    state_visitation = np.expand_dims(env.initial_state_distribution, axis=1)
    t_matrix = env.transition_matrix  # S x A x S
    sa_visit_t = np.zeros((dim_obs, dim_act, T))

    for i in range(T):
        # State-action visitation at step i (S x A).
        sa_visit = state_visitation * pol_probs
        sa_visit_t[:, :, i] = sa_visit  # (discount**i) * sa_visit
        # Sum out the (state, action) pair against the S x A x S transition matrix.
        new_state_visitation = np.einsum('ij,ijk->k', sa_visit, t_matrix)
        state_visitation = np.expand_dims(new_state_visitation, axis=1)
    return np.sum(sa_visit_t, axis=2) / float(T)
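A quick sanity check (illustrative only, reusing the env and true_q constructed in the script example below and assuming numpy is imported as np): since the discount term is commented out, the function averages T per-step state-action distributions, so the result is itself a distribution over (s, a) pairs.

visits = compute_visitation(env, true_q, ent_wt=1.0, T=5, discount=0.9)
print(visits.shape)                   # (dS, dA)
# Each per-step sa_visit sums to 1 (non-terminating env), so the T-step average does too.
print(np.isclose(visits.sum(), 1.0))  # True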
Example no. 4
    from simple_env import random_env
    np.set_printoptions(suppress=True)

    # Environment parameters
    env = random_env(16, 4, seed=1, terminate=False, t_sparsity=0.8)
    dS = env.spec.observation_space.flat_dim
    dU = env.spec.action_space.flat_dim
    dO = 8
    ent_wt = 1.0
    discount = 0.9
    obs_matrix = np.random.randn(dS, dO)

    # Compute optimal policy for double checking
    true_q = q_iteration(env, K=150, ent_wt=ent_wt, gamma=discount)
    true_sa_visits = compute_visitation(env, true_q, ent_wt=ent_wt, T=5, discount=discount)
    expert_pol = get_policy(true_q, ent_wt=ent_wt)

    # Run MaxEnt IRL State-only
    learned_rew, learned_q = tabular_maxent_irl(env, true_sa_visits, lr=0.01, num_itrs=1000,
                                                ent_wt=ent_wt, state_only=True,
                                                discount=discount)
    learned_pol = get_policy(learned_q, ent_wt=ent_wt)

    # Normalize reward (if state_only=True, reward is accurate up to a constant)
    adjusted_rew = learned_rew - np.mean(learned_rew) + np.mean(env.rew_matrix)

    diff_rew = np.abs(env.rew_matrix - adjusted_rew)
    diff_pol = np.abs(expert_pol - learned_pol)
    print('----- Results State Only -----')
    print('InfNormRewError', np.max(diff_rew))
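The script computes diff_pol but only reports the reward error; a companion policy-error line (an addition for illustration, not in the original) would follow the same pattern:

    print('InfNormPolicyError', np.max(diff_pol))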