def calc_frontier(mdp_env,
                  u_expert,
                  reward_posterior,
                  posterior_probs,
                  lambda_range,
                  alpha,
                  debug=False):
    '''Takes an MDP and sweeps over a range of lambda values, returning the CVaR and expected return of each resulting LP solution.
        mdp_env: the MDP to run on
        u_expert: the baseline expert occupancies to try to beat (set to zeros for the robust version)
        reward_posterior: the reward posterior from B-IRL (already burned-in and skipped, ready to pass to the LP)
        posterior_probs: the probability of each element in the posterior (uniform if the samples come from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR risk level; higher alpha is more risk-sensitive/conservative
        returns: a list of (cvar_value, exp_ret) tuples, one per lambda
    '''

    cvar_exprews = []

    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug,
            lamda)

        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))
    return cvar_exprews
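# A minimal usage sketch for calc_frontier (not from the original source): it
# assumes mdp_env, r_chain_burned, and posterior_probs are already set up as in
# the examples below, and the lambda grid here is illustrative.
lambda_range = [0.0, 0.3, 0.5, 0.7, 0.9, 1.0]
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
frontier = calc_frontier(mdp_env, u_expert, r_chain_burned.transpose(),
                         posterior_probs, lambda_range, alpha=0.95)
for cvar_value, exp_ret in frontier:
    print("CVaR = {:.3f}, expected return = {:.3f}".format(cvar_value, exp_ret))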
Example #2
#Now let's see what CVaR optimization does.
alpha = 0.99
debug = False
lamda = 0.0
r_chain_burned = r_chain[burn::skip]
n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

print("MDP A")
print("features")
utils.display_onehot_state_features(mdp_env_A)

print("------ Robust Solution ---------")
u_expert = np.zeros(mdp_env_A.num_actions * mdp_env_A.num_states)  #robust baseline: zero expert occupancies
cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
    debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)
print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env_A, u_expert,
                                          r_chain_burned.transpose(),
                                          posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))

print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env_A)
print('expert u_sa', u_expert)
    #input()
    worst_index = np.argmin(r_chain_burned[:, 1])
    print(r_chain_burned[worst_index])
    print(np.sum(r_chain_burned[:, 1] < -0.82), "out of", len(r_chain_burned))
    #input()

    print("MAP policy")
    utils.print_policy_from_occupancies(map_u, mdp_env)

    #let's try using the optimal policy's occupancies as the expert feature counts to see if the regret method works
    u_expert = u_sa
    alpha = 0.95
    n = r_chain_burned.shape[0]
    posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
    cvar_opt_usa_regret, cvar, exp_ret = mdp.solve_max_cvar_policy(
        mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
        False)
    print("{}-CVaR policy regret optimal u_E".format(alpha))
    utils.print_policy_from_occupancies(cvar_opt_usa_regret, mdp_env)
    cvar_2, exp_ret2 = mdp.solve_cvar_expret_fixed_policy(
        mdp_env,
        cvar_opt_usa_regret,
        u_expert,
        r_chain_burned.transpose(),
        posterior_probs,
        alpha,
        debug=False)
    print(cvar, cvar_2)
    print(exp_ret, exp_ret2)
    input("same?")
            posterior = generate_posterior_samples(num_samples, num_states)

            r_sa = np.mean(posterior, axis=1)
            #print("rsa", r_sa)
            init_distribution = np.ones(num_states) / num_states  #uniform distribution
            mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma,
                                                init_distribution)
            #run CVaR optimization, just the robust version since we don't have demos
            u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)

            # print("solving for CVaR optimal policy")
            #uniform dist since samples from MCMC
            posterior_probs = np.ones(num_samples) / num_samples
            import time
            t = time.time()
            cvar_opt_usa, cvar, exp_ret = mdp.solve_max_cvar_policy(
                mdp_env, u_expert, posterior, posterior_probs, alpha, False,
                lamda)
            run_times[rep, i] = time.time() - t

print(run_times)
print(np.mean(run_times, axis=0))
print(np.std(run_times, axis=0))
import os
if not os.path.exists('./results/stress_test/'):
    os.makedirs('./results/stress_test/')
np.savetxt("./results/stress_test/machine_replace_states.csv",
           run_times,
           delimiter=",")
    ###run CVaR IRL to get policy
    print("optimizing CVAR")
    #using the demonstration occupancies as the expert baseline (regret version);
    #set u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states) for the robust version
    traj_demonstrations = [demonstrations]
    u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)

    n = w_chain_burned.shape[0]
    posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
    cvar_losses = []
    for lamda in lamdas:
        cvar_u_sa, cvar, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env,
            u_expert,
            w_chain_burned.transpose(),
            posterior_probs,
            alpha,
            False,
            lamda=lamda)
        if debug:
            print("CVaR policy")
            utils.print_policy_from_occupancies(cvar_u_sa, mdp_env)

        cvar_ploss = utils.policy_loss(cvar_u_sa, mdp_env, opt_u_sa)
        cvar_losses.append(cvar_ploss)

    cvar_ploss_str = ""
    for loss in cvar_losses:
        cvar_ploss_str += ", {}".format(loss)

    print("cvar = {}".format(cvar_ploss_str))
Example #6
debug = False

n = r_chain_burned.shape[0]
print("num reward hypothesis", n)
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, train_mdp)
print("u expert", u_expert)

run_times = []
for rep in range(num_reps):
    print(rep)

    t = time.time()
    regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
        test_mdp, u_expert, r_chain_burned.transpose(), posterior_probs, alpha,
        debug, lamda)
    #utils.print_stochastic_policy_action_probs(regret_opt_usa, test_mdp)
    elapsed = time.time() - t
    run_times.append(elapsed)
    print(elapsed)
#save run times
#
import os
if not os.path.exists('./results/stress_test/'):
    os.makedirs('./results/stress_test/')
np.savetxt("./results/stress_test/grid_world_60_60.csv",
           run_times,
           delimiter=",")