cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)

print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))

print("------ Regret Solution ---------")
# Regret formulation: baseline the CVaR objective on the expert's empirical
# state-action occupancies estimated from the demonstrations.
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env_A)
print('expert u_sa', u_expert)

cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
    mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env_A)

print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env_A, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))
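# Sanity-check sketch (not part of the original script): the CVaR objective that
# solve_max_cvar_policy optimizes can be approximated empirically from the
# posterior samples. The helper below is hypothetical; it assumes the rows of
# r_samples are state-action reward vectors aligned with the occupancy vector
# u_sa (if r_chain_burned holds feature weights, convert them first, e.g. with
# utils.convert_w_to_rsa), and that alpha close to 1 means "average over the
# worst (1 - alpha) fraction of sampled returns".
def empirical_cvar(u_sa, r_samples, alpha):
    returns = np.sort(np.dot(r_samples, u_sa))               # return under each sampled reward, ascending
    k = max(1, int(np.ceil((1.0 - alpha) * len(returns))))   # size of the worst-case tail
    return np.mean(returns[:k])                               # mean of the alpha-tail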
print("------ Robust Solution ---------") u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states) robust_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda) #utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A) print("Policy for lambda={} and alpha={}".format(lamda, alpha)) utils.print_policy_from_occupancies(robust_opt_usa, mdp_env) utils.print_stochastic_policy_action_probs(robust_opt_usa, mdp_env) print("solving for CVaR reward") cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha) # print("cvar reward weights", cvar_reward) print("cvar reward weights", np.dot(q, r_chain_burned)) print("------ Regret Solution ---------") traj_demonstrations = [demonstrations] u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env) print('expert u_sa', u_expert) regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda) #utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A) print("Policy for lambda={} and alpha={}".format(lamda, alpha)) utils.print_policy_from_occupancies(regret_opt_usa, mdp_env) #utils.print_stochastic_policy_action_probs(regret_opt_usa, mdp_env) print("solving for CVaR reward") regret_reward, q = mdp.solve_minCVaR_reward(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha) # print("cvar reward weights", cvar_reward) print("cvar reward weights", np.dot(q, r_chain_burned)) print("-------- IRD Solution -------") u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)#mdp_env.state_features[init_demo_state]#np.zeros(mdp_env.get_reward_dimensionality())# ird_w = utils.get_worst_case_feature_weights_binary_ird(r_chain_burned, u_expert, mdp_env)
mean_r = np.dot(mdp_env.state_features, mean_w)
print("Mean rewards")
utils.print_as_grid(mean_r, mdp_env)

### Compute policy loss wrt true reward
mean_ploss = utils.policy_loss(mean_u_sa, mdp_env, opt_u_sa)
map_ploss = utils.policy_loss(map_u_sa, mdp_env, opt_u_sa)
print("mean = {}, map = {}".format(mean_ploss, map_ploss))

### Run CVaR IRL to get policy
print("optimizing CVAR")
# running just the robust version for now
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)  #np.zeros(mdp_env.num_actions * mdp_env.num_states)
n = w_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  # uniform dist since samples from MCMC
cvar_losses = []
for lamda in lamdas:
    cvar_u_sa, cvar, exp_ret = mdp.solve_max_cvar_policy(
        mdp_env, u_expert, w_chain_burned.transpose(), posterior_probs, alpha, False, lamda=lamda)
    if debug:
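        # (Hypothetical debug suggestion, not in the original script.) Printing the
        # (exp_ret, cvar) pair returned for each lamda here would show how the
        # optimized policy trades off expected return against CVaR as the
        # risk-sensitivity weight is swept.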
    step_stdev, debug=False, mcmc_norm=mcmc_norm, likelihood=likelihood, prior="non-pos")
map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, num_samples, True)
print(train_mdp.feature_weights)

# MCMC post-processing: drop the first `burn` samples and keep every `skip`-th
# sample thereafter to reduce autocorrelation in the chain.
burn = 200
skip = 10
r_chain_burned = r_chain[burn::skip]

u_expert = utils.u_sa_from_demos(traj_demonstrations, train_mdp)
expert_returns = np.sort(np.dot(r_chain_burned, u_expert))

# get the r_sa matrix from the posterior
Rsa = utils.convert_w_to_rsa(r_chain_burned, train_mdp)

# what does the BROIL policy do?

# create test MDP
num_rows = 60
num_cols = 60
num_features = 6
num_reps = 20
init_seed = 12345
np.random.seed(init_seed)
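# Sketch (assumed, not the repo's actual generator) of how the random test grid
# MDPs set up above might get their state features: each of the
# num_rows * num_cols cells is assigned a one-hot feature vector over
# num_features feature types. The function name and construction are
# illustrative only.
def random_onehot_features(num_states, num_features):
    features = np.zeros((num_states, num_features))
    idx = np.random.randint(num_features, size=num_states)   # random feature id per state
    features[np.arange(num_states), idx] = 1.0
    return features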