import mdp
import utils


def calc_frontier(mdp_env,
                  u_expert,
                  reward_posterior,
                  posterior_probs,
                  lambda_range,
                  alpha,
                  debug=False):
    '''Takes an MDP and runs over a range of lambdas, outputting the expected value and CVaR of the resulting LP solutions.
        mdp_env: the MDP to run on
        u_expert: the baseline expert occupancies to try to beat (set to zeros for the robust formulation)
        reward_posterior: the reward posterior from Bayesian IRL (already burned in and skipped, ready to use in the LP)
        posterior_probs: the probabilities of each element in the posterior (uniform if from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR alpha (risk sensitivity); higher is more risk-sensitive/conservative
    '''

    cvar_exprews = []

    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug,
            lamda)

        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))
    return cvar_exprews
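
# Usage sketch (illustrative, not from the original source): sweep lambda to trace
# out the CVaR vs. expected-return frontier. Assumes mdp_env and the burned-in
# posterior chain r_chain_burned are already set up as in the later examples;
# the lambda_range and alpha values here are placeholders.
import numpy as np

n_samples = r_chain_burned.shape[0]
posterior_probs = np.ones(n_samples) / n_samples  # uniform weights for MCMC samples
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)  # zero baseline (robust formulation)
lambda_range = [0.0, 0.3, 0.5, 0.7, 0.9, 1.0]
alpha = 0.95

frontier = calc_frontier(mdp_env, u_expert, r_chain_burned.transpose(),
                         posterior_probs, lambda_range, alpha)
for lam, (cvar, exp_ret) in zip(lambda_range, frontier):
    print("lambda={}: CVaR={:.3f}, expected return={:.3f}".format(lam, cvar, exp_ret))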
Example #2
def calc_max_ent_u_sa(mdp_env,
                      demos,
                      max_epochs=1000,
                      horizon=None,
                      learning_rate=0.01):
    import mdp
    import utils
    import numpy as np
    from maxent import maxEntIRL  # assumed location of maxEntIRL (the maxent module is imported elsewhere in these examples)

    seed_weights = np.zeros(mdp_env.get_reward_dimensionality())

    # Parameters
    if horizon is None:
        horizon = mdp_env.num_states

    # Main algorithm call
    r_weights, grads, state_features, maxent_pi = maxEntIRL(mdp_env,
                                                            demos,
                                                            seed_weights,
                                                            max_epochs,
                                                            horizon,
                                                            learning_rate,
                                                            norm="l2")

    # Construct reward function from weights and state features
    reward_fxn = []
    for s_i in range(mdp_env.num_states):
        reward_fxn.append(np.dot(r_weights, state_features[s_i]))
    reward_fxn = np.reshape(reward_fxn, (mdp_env.num_rows, mdp_env.num_cols))
    print("learned reward function")
    print(reward_fxn)

    u_s = mdp.get_policy_state_occupancy_frequencies(maxent_pi, mdp_env)
    u_sa = mdp.stoch_policy_to_usa(maxent_pi, mdp_env)
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)

    return u_sa, r_weights, maxent_pi
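
# Usage sketch (illustrative, not from the original source): run MaxEnt IRL on
# state-only demonstrations. Assumes mdp_env and maxent_demos (built below from
# traj_demonstrations) already exist; the parameter values are placeholders.
import numpy as np

maxent_usa, maxent_r_weights, maxent_stoch_pi = calc_max_ent_u_sa(mdp_env,
                                                                  maxent_demos,
                                                                  max_epochs=500,
                                                                  learning_rate=0.01)
print("MaxEnt expected value under the mean reward", np.dot(maxent_usa, mdp_env.r_sa))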
Example #3
print(demonstrations)

state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
pg.get_policy_string_from_trajectory(traj_demonstrations[0], state_feature_list, mdp_env)


    
# In[4]:

#LPAL solution
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
lpal_usa = mdp.solve_lpal_policy(mdp_env, u_expert)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("lpal policy")
utils.print_policy_from_occupancies(lpal_usa, mdp_env)
utils.print_stochastic_policy_action_probs(lpal_usa, mdp_env)
pi_dict = utils.get_stoch_policy_string_dictionary_from_occupancies(lpal_usa, mdp_env)
state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
pg.plot_optimal_policy_stochastic(pi_dict, state_feature_list, mdp_env.num_rows, mdp_env.num_cols)


# In[4]:


import maxent
#just keep states in traj_demos
maxent_demos = []
for d in traj_demonstrations:
    #add only states to demos
    demo = []
    for s,a in d:
        demo.append(s)
    maxent_demos.append(demo)
Example #4

    lamda = 0.9

    posterior = generate_posterior_samples(num_samples)

    #print(generate_reward_sample())

    r_sa = np.mean(posterior, axis=1)
    #print("rsa", r_sa)
    init_distribution = np.ones(num_states)/num_states  #uniform distribution
    mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)
    #print(mdp_env.Ps)
    print("mean MDP reward", r_sa)

    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("mean policy from posterior")
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
    print("MAP/Mean policy from posterior")
    utils.print_policy_from_occupancies(u_sa, mdp_env) 
    print("rewards")
    print(mdp_env.r_sa)
    print("expected value = ", np.dot(u_sa, r_sa))
    stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
    print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
    print("values", mdp.get_state_values(u_sa, mdp_env))
    print('q-values', mdp.get_q_values(u_sa, mdp_env))

    
    
    #print(posterior)
    #print(posterior.shape)
Example #5
lamda = 0.0

n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

print("MDP A")    
print("features")
utils.display_onehot_state_features(mdp_env)

print("------ Robust Solution ---------")
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
robust_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(robust_opt_usa, mdp_env)
utils.print_stochastic_policy_action_probs(robust_opt_usa, mdp_env)
print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))


print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
print('expert u_sa', u_expert)

regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(regret_opt_usa, mdp_env)
    print("mean policy loss", mean_ploss)

    lava_states = []
    for s, f in enumerate(mdp_env_B.state_features):
        if (f == (0, 0, 0, 1)).all():  #hard coded lava feature
            lava_states.append(s)

    print("lava states", lava_states)

    print("initial dist")
    print(mdp_env_B.init_dist)

    print("map_u")
    print(np.sum(map_u_sa))
    utils.print_policy_occupancies_pretty(map_u_sa, mdp_env_B)
    utils.print_stochastic_policy_action_probs(map_u_sa, mdp_env_B)

    print("mean_u")
    print(np.sum(mean_u_sa))
    utils.print_policy_occupancies_pretty(mean_u_sa, mdp_env_B)
    utils.print_stochastic_policy_action_probs(mean_u_sa, mdp_env_B)

    num_states = mdp_env_B.get_num_states()
    map_lava = 0
    for s in lava_states:
        map_lava += np.sum(map_u_sa[s::num_states])

    print("map lava", map_lava)

    num_states = mdp_env_B.get_num_states()
    mean_lava = 0