def calc_frontier(mdp_env, u_expert, reward_posterior, posterior_probs, lambda_range, alpha, debug=False):
    '''Takes an MDP and runs over a range of lambdas to output the expected value and CVaR of the resulting solutions to the LP.
        mdp_env: the MDP to run on
        u_expert: the baseline expert to try and beat (set to zeros just to be robust)
        reward_posterior: the reward posterior from B-IRL (already burned and skipped and ready to run in the LP)
        posterior_probs: the probabilities of each element in the posterior (uniform if from MCMC)
        lambda_range: a list of lambda values to try
        alpha: the CVaR alpha (risk sensitivity); higher is more risk-sensitive/conservative
    '''
    cvar_exprews = []
    for lamda in lambda_range:
        cvar_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(
            mdp_env, u_expert, reward_posterior, posterior_probs, alpha, debug, lamda)
        print("Policy for lambda={} and alpha={}".format(lamda, alpha))
        utils.print_policy_from_occupancies(cvar_opt_usa, mdp_env)
        print("stochastic policy")
        utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env)
        print("CVaR of policy = {}".format(cvar_value))
        print("Expected return of policy = {}".format(exp_ret))
        cvar_exprews.append((cvar_value, exp_ret))

    return cvar_exprews
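
# Hypothetical usage sketch (not part of the original code): sweep lambda with
# calc_frontier and plot the resulting CVaR vs. expected-return frontier.
# The inputs (mdp_env, u_expert, r_chain_burned, posterior_probs, alpha) are
# assumed to be constructed elsewhere, e.g. from a burned-in B-IRL MCMC chain.
import matplotlib.pyplot as plt

lambda_range = [0.0, 0.25, 0.5, 0.75, 0.9, 0.95, 1.0]
frontier = calc_frontier(mdp_env, u_expert, r_chain_burned.transpose(),
                         posterior_probs, lambda_range, alpha)
cvars, exp_rets = zip(*frontier)
plt.plot(cvars, exp_rets, 'o-')
plt.xlabel("CVaR of return")
plt.ylabel("Expected return")
plt.title("CVaR vs. expected return frontier (alpha={})".format(alpha))
plt.show()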
def calc_max_ent_u_sa(mdp_env, demos, max_epochs=1000, horizon=None, learning_rate=0.01):
    import mdp
    import utils

    seed_weights = np.zeros(mdp_env.get_reward_dimensionality())

    # Parameters
    if horizon is None:
        horizon = mdp_env.num_states

    # Main algorithm call
    r_weights, grads, state_features, maxent_pi = maxEntIRL(mdp_env, demos, seed_weights,
                                                            max_epochs, horizon, learning_rate, norm="l2")

    # Construct reward function from weights and state features
    reward_fxn = []
    for s_i in range(mdp_env.num_states):
        reward_fxn.append(np.dot(r_weights, state_features[s_i]))
    reward_fxn = np.reshape(reward_fxn, (mdp_env.num_rows, mdp_env.num_cols))
    print("learned reward function")
    print(reward_fxn)

    u_s = mdp.get_policy_state_occupancy_frequencies(maxent_pi, mdp_env)
    u_sa = mdp.stoch_policy_to_usa(maxent_pi, mdp_env)
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    utils.print_stochastic_policy_action_probs(u_sa, mdp_env)

    return u_sa, r_weights, maxent_pi
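
# Hypothetical usage sketch (not part of the original code): run MaxEnt IRL on
# state-only demonstrations. `mdp_env` and `maxent_demos` (a list of state
# sequences) are assumed to be constructed elsewhere in the notebook.
maxent_u_sa, maxent_r_weights, maxent_policy = calc_max_ent_u_sa(mdp_env, maxent_demos, max_epochs=500)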
print(demonstrations)
state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
pg.get_policy_string_from_trajectory(traj_demonstrations[0], state_feature_list, mdp_env)


# In[4]:


#LPAL solution
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
lpal_usa = mdp.solve_lpal_policy(mdp_env, u_expert)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("lpal policy")
utils.print_policy_from_occupancies(lpal_usa, mdp_env)
utils.print_stochastic_policy_action_probs(lpal_usa, mdp_env)

pi_dict = utils.get_stoch_policy_string_dictionary_from_occupancies(lpal_usa, mdp_env)
state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
pg.plot_optimal_policy_stochastic(pi_dict, state_feature_list, mdp_env.num_rows, mdp_env.num_cols)


# In[4]:


import maxent

#just keep states in traj_demos
maxent_demos = []
for d in traj_demonstrations:
    #add only states to demos
    demo = []
    for s, a in d:
lamda = 0.9
posterior = generate_posterior_samples(num_samples)
#print(generate_reward_sample())
r_sa = np.mean(posterior, axis=1)
#print("rsa", r_sa)
init_distribution = np.ones(num_states)/num_states  #uniform distribution
mdp_env = mdp.MachineReplacementMDP(num_states, r_sa, gamma, init_distribution)
#print(mdp_env.Ps)
print("mean MDP reward", r_sa)

u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
print("mean policy from posterior")
utils.print_stochastic_policy_action_probs(u_sa, mdp_env)
print("MAP/Mean policy from posterior")
utils.print_policy_from_occupancies(u_sa, mdp_env)
print("rewards")
print(mdp_env.r_sa)
print("expected value = ", np.dot(u_sa, r_sa))

stoch_pi = utils.get_optimal_policy_from_usa(u_sa, mdp_env)
print("expected return", mdp.get_policy_expected_return(stoch_pi, mdp_env))
print("values", mdp.get_state_values(u_sa, mdp_env))
print('q-values', mdp.get_q_values(u_sa, mdp_env))

#print(posterior)
#print(posterior.shape)
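
# Hypothetical sketch (not part of the original code): empirically estimate the
# value-at-risk and CVaR of the mean-reward policy's return over the posterior
# samples.  Assumes `posterior` has shape (len(r_sa), num_samples) as implied by
# np.mean(posterior, axis=1) above, that `alpha` is defined earlier, and that
# CVaR is taken over the worst (1 - alpha) tail of the return distribution.
sample_returns = np.dot(u_sa, posterior)            # expected return under each sampled reward
var_est = np.quantile(sample_returns, 1 - alpha)    # empirical value at risk
cvar_est = sample_returns[sample_returns <= var_est].mean()  # mean of the worst tail
print("empirical VaR = {}, empirical CVaR = {}".format(var_est, cvar_est))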
lamda = 0.0
n = r_chain_burned.shape[0]
posterior_probs = np.ones(n) / n  #uniform dist since samples from MCMC

print("MDP A")
print("features")
utils.display_onehot_state_features(mdp_env)

print("------ Robust Solution ---------")
u_expert = np.zeros(mdp_env.num_actions * mdp_env.num_states)
robust_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(robust_opt_usa, mdp_env)
utils.print_stochastic_policy_action_probs(robust_opt_usa, mdp_env)

print("solving for CVaR reward")
cvar_reward, q = mdp.solve_minCVaR_reward(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha)
# print("cvar reward weights", cvar_reward)
print("cvar reward weights", np.dot(q, r_chain_burned))

print("------ Regret Solution ---------")
traj_demonstrations = [demonstrations]
u_expert = utils.u_sa_from_demos(traj_demonstrations, mdp_env)
print('expert u_sa', u_expert)
regret_opt_usa, cvar_value, exp_ret = mdp.solve_max_cvar_policy(mdp_env, u_expert, r_chain_burned.transpose(), posterior_probs, alpha, debug, lamda)
#utils.print_stochastic_policy_action_probs(cvar_opt_usa, mdp_env_A)
print("Policy for lambda={} and alpha={}".format(lamda, alpha))
utils.print_policy_from_occupancies(regret_opt_usa, mdp_env)
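
# Hypothetical visualization sketch (not part of the original code): render the
# robust and regret policies with the same plotting helpers used for the LPAL
# policy, assuming mdp_env here is a gridworld with num_rows/num_cols and that
# pg is imported in this notebook.
state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
for name, usa in [("robust", robust_opt_usa), ("regret", regret_opt_usa)]:
    print("plotting {} policy".format(name))
    pi_dict = utils.get_stoch_policy_string_dictionary_from_occupancies(usa, mdp_env)
    pg.plot_optimal_policy_stochastic(pi_dict, state_feature_list, mdp_env.num_rows, mdp_env.num_cols)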
print("mean policy loss", mean_ploss) lava_states = [] for s, f in enumerate(mdp_env_B.state_features): if (f == (0, 0, 0, 1)).all(): #hard coded lava feature lava_states.append(s) print("lava states", lava_states) print("initial dist") print(mdp_env_B.init_dist) print("map_u") print(np.sum(map_u_sa)) utils.print_policy_occupancies_pretty(map_u_sa, mdp_env_B) utils.print_stochastic_policy_action_probs(map_u_sa, mdp_env_B) print("mean_u") print(np.sum(mean_u_sa)) utils.print_policy_occupancies_pretty(mean_u_sa, mdp_env_B) utils.print_stochastic_policy_action_probs(mean_u_sa, mdp_env_B) num_states = mdp_env_B.get_num_states() map_lava = 0 for s in lava_states: map_lava += np.sum(map_u_sa[s::num_states]) print("map lava", map_lava) num_states = mdp_env_B.get_num_states() mean_lava = 0