Example #1
cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
    debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)
print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env_A, u_expert,
                                          r_chain_burned.transpose(),
                                          posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))

print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env_A)
print('expert u_sa', u_expert)

cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
    debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)
print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env_A, u_expert,
                                          r_chain_burned.transpose(),
                                          posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))
print("------ Robust Solution ---------")
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
robust_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(robust_opt_usa, mdp_env)
utils.print_stochastic_policy_action_probs(robust_opt_usa, mdp_env)
print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))


print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
print('expert u_sa', u_expert)

regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
    debug, lamda)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(regret_opt_usa, mdp_env)
#utils.print_stochastic_policy_action_probs(regret_opt_usa, mdp_env)
print("solving for CVaR reward")
regret_reward, q = mdp.solve_minCVaR_reward(
    mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("regret reward weights", regret_reward)
print("regret reward weights", np.dot(q, r_chain_burned))

print("-------- IRD Solution -------")
# alternative baselines considered (unused): mdp_env.state_features[init_demo_state],
# np.zeros(mdp_env.get_reward_dimensionality())
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
ird_w = utils.get_worst_case_feature_weights_binary_ird(r_chain_burned, u_expert, mdp_env)
        mean_r = np.dot(mdp_env.state_features, mean_w)
        print("Mean rewards")
        utils.print_as_grid(mean_r, mdp_env)

    ###Compute policy loss wrt true reward
    mean_ploss = utils.policy_loss(mean_u_sa, mdp_env, opt_u_sa)
    map_ploss = utils.policy_loss(map_u_sa, mdp_env, opt_u_sa)

    print("mean = {}, map = {}".format(mean_ploss, map_ploss))

    ###run CVaR IRL to get policy
    print("optimizing CVAR")
    #running just the robust version for now
    traj_demonstrations = [demonstrations]
    # alternative baseline (unused): np.zeros(mdp_env.num_actions * mdp_env.num_states)
    u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)

    n = w_chain_burned.shape[0]
    posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
    cvar_losses = []
    for lamda in lamdas:
        cvar_u_sa, cvar, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env,
            u_expert,
            w_chain_burned.transpose(),
            posterior_probs,
            alpha,
            False,
            lamda=lamda)
        if debug:
Example #4
                                step_stdev,
                                debug=False,
                                mcmc_norm=mcmc_norm,
                                likelihood=likelihood,
                                prior="non-pos")

map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations,
                                                       num_samples, True)

print(train_mdp.feature_weights)

# discard burn-in samples and thin the MCMC chain to reduce autocorrelation
burn = 200
skip = 10
r_chain_burned = r_chain[burn::skip]

u_expert = utils.u_sa_from_demos(traj_demonstrations, train_mdp)
expert_returns = np.sort(np.dot(r_chain_burned, u_expert))
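# Hedged note (assumption): sorting the expert's per-sample posterior returns suggests a
# tail statistic such as the expert's empirical CVaR, e.g. for a hypothetical confidence
# level alpha:
#     tail = max(1, int(np.ceil((1 - alpha) * len(expert_returns))))
#     expert_cvar = expert_returns[:tail].mean()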

#get the r_sa matrix from the posterior
Rsa = utils.convert_w_to_rsa(r_chain_burned, train_mdp)
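# Assumption: convert_w_to_rsa presumably expands each posterior sample into a full
# state-action reward vector, giving a matrix with one R_sa per sample that can be used
# for vectorized evaluation of candidate policies across the posterior.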

#what does the BROIL policy do?

#create test MDP
num_rows = 60
num_cols = 60
num_features = 6
num_reps = 20

init_seed = 12345
np.random.seed(init_seed)