def debug_mdp(world):
    import time

    print("rewards")
    world.print_rewards()

    print("values")
    t0 = time.time()
    V = mdp.value_iteration(world)
    t1 = time.time()
    world.print_map(V)
    print(t1 - t0)

    print("values inplace")
    t0 = time.time()
    V = mdp.value_iteration_inplace(world)
    t1 = time.time()
    world.print_map(V)
    print(t1 - t0)

    Q = mdp.compute_q_values(world, V)
    print("Q-values")
    print(Q)

    print("optimal policy")
    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    print(opt_policy)
    print("optimal policy (arrows)")
    world.print_map(world.to_arrows(opt_policy))
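# The two value-iteration variants timed above differ only in update order: the
# first keeps a separate table of old values per sweep, while the in-place
# variant overwrites V during the sweep (Gauss-Seidel style), which often
# converges in fewer sweeps. A minimal sketch of the in-place variant, assuming
# a generic interface (states, actions(s), transitions(s, a) ->
# [(prob, next_state), ...], reward(s), gamma); the repo's mdp.value_iteration
# may differ in its exact signature and stopping rule.
def _value_iteration_inplace_sketch(states, actions, transitions, reward, gamma, eps=1e-4):
    V = {s: 0.0 for s in states}
    while True:
        delta = 0.0
        for s in states:
            acts = actions(s)
            if not acts:  # terminal state: no successor backup
                v_new = reward(s)
            else:
                v_new = reward(s) + gamma * max(
                    sum(p * V[s2] for p, s2 in transitions(s, a)) for a in acts)
            delta = max(delta, abs(v_new - V[s]))
            V[s] = v_new  # overwritten immediately; later states in this sweep see it
        if delta < eps:
            return V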
def __init__(self, mdp_world, critical_threshold, precision=0.0001, debug=False):
    self.mdp_world = mdp_world
    self.entropy_threshold = critical_threshold
    self.precision = precision
    self.debug = debug
    self.q_values = mdp.compute_q_values(mdp_world)
    self.optimal_policy = mdp.find_optimal_policy(mdp_world, Q=self.q_values)

    #find critical states
    if debug:
        print("finding critical states")
    self.critical_state_actions = []
    for s in self.mdp_world.states:
        if debug:
            print(s)
        #calculate entropy of the optimal policy (assumes it is stochastic optimal:
        #uniform over all optimal actions; the zero-padded entries contribute nothing)
        num_optimal_actions = len(self.optimal_policy[s])
        action_probs = np.zeros(len(self.mdp_world.actions(s)))
        for i in range(num_optimal_actions):
            action_probs[i] = 1.0 / num_optimal_actions
        entropy = utils.entropy(action_probs)
        if debug:
            print(s, entropy)
        best_action = utils.argmax(self.mdp_world.actions(s),
                                   lambda a: self.q_values[s, a])
        #low entropy means few optimal actions, i.e. the action choice matters here
        if entropy < self.entropy_threshold:
            self.critical_state_actions.append((s, best_action))
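# utils.entropy above is assumed to compute Shannon entropy of the action
# distribution, treating 0 * log(0) as 0 so the zero-padded entries are
# harmless. A minimal sketch of that assumption (the repo's version may use a
# different log base):
def _entropy_sketch(probs):
    import numpy as np
    p = np.asarray(probs, dtype=float)
    p = p[p > 0]  # drop zeros: they contribute nothing to the entropy
    return float(-np.sum(p * np.log(p)))

# e.g. a state with a single optimal action has entropy 0.0, so it is flagged
# critical for any positive threshold; four equally optimal actions give
# entropy log(4) ~= 1.39.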
def __init__(self, mdp_world, precision, debug=False, remove_redundancy_lp=True):
    self.mdp_world = mdp_world
    self.precision = precision
    self.debug = debug
    self.q_values = mdp.compute_q_values(mdp_world, eps=precision)
    self.optimal_policy = mdp.find_optimal_policy(mdp_world, Q=self.q_values,
                                                  epsilon=precision)
    teacher = machine_teaching.StateActionRankingTeacher(
        mdp_world, debug=self.debug,
        remove_redundancy_lp=remove_redundancy_lp, epsilon=precision)
    tests, _ = teacher.get_optimal_value_alignment_tests(use_suboptimal_rankings=False)
    #for now let's just select the first question for each halfspace
    self.test = [questions[0] for questions in tests]
def debug_demonstrations():
    world = create_random_10x10_3feature()
    print("rewards")
    world.print_rewards()
    print("features")
    utils.display_onehot_state_features(world)

    Q = mdp.compute_q_values(world)
    #print("Q-values")
    #print(Q)
    print("optimal policy")
    opt_policy = mdp.find_optimal_policy(world, Q=Q)
    #print(opt_policy)
    print("optimal policy (arrows)")
    world.print_map(world.to_arrows(opt_policy))
    print(world.terminals)

    print("demo 1")
    demoA = utils.optimal_rollout_from_Qvals((1, 1), 3, Q, world, 0.0001)
    for (s, a) in demoA:
        print("({},{})".format(s, world.to_arrow(a)))
    print(mdp.calculate_trajectory_feature_counts(demoA, world))
    print()

    print("demo 2")
    demoB = utils.sa_optimal_rollout_from_Qvals((1, 1), (0, 1), 3, Q, world, 0.0001)
    for (s, a) in demoB:
        print("({},{})".format(s, world.to_arrow(a)))
    print(mdp.calculate_trajectory_feature_counts(demoB, world))

    tpair = TrajPair(demoA, demoB, world, 0.0001)
    print(world.weights)
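# mdp.calculate_trajectory_feature_counts above is assumed to accumulate the
# discounted feature counts of a rollout, sum_t gamma^t * phi(s_t), which is
# what makes the two demonstrations comparable reward-wise. A minimal sketch
# under that assumption (phi is passed as a callable here; the repo's version
# reads the features from the world object):
def _trajectory_feature_counts_sketch(trajectory, phi, gamma):
    import numpy as np
    # trajectory: list of (state, action) pairs
    return sum(gamma**t * np.asarray(phi(s), dtype=float)
               for t, (s, _a) in enumerate(trajectory))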
#print("state features\n",state_features) state_features = mdp_gen.categorical_to_one_hot_features( state_features, num_features) print('one hot features', state_features) world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma) mdp_family.append(world) #plot for visualization all_opts = [] all_features = [] for i, mdp_env in enumerate(mdp_family): V = mdp.value_iteration(mdp_env, epsilon=precision) Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision) opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision) print(opt_policy) print(mdp_env.features) all_opts.append(opt_policy) all_features.append(mdp_env.features) #input() filename = "./data_analysis/figs/twoXtwo/firstthree.png" mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename) filename = "./data_analysis/figs/twoXtwo/lastthree.png" mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1,
np.random.seed(seed)
random.seed(seed)

#First let's generate a random MDP
state_features = eutils.create_random_features_row_col_m(num_rows, num_cols,
                                                         num_features)
#print("state features\n", state_features)
true_weights = random_weights(num_features)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials,
                                        terminals, gamma)
V = mdp.value_iteration(true_world, epsilon=precision)
#expected return of the optimal policy, averaged over the initial states
true_exp_return = np.mean([V[s] for s in true_world.initials])
Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt, epsilon=precision)

if debug:
    print("true weights: ", true_weights)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    true_world.print_map(V)
    print("mdp features")
    utils.display_onehot_state_features(true_world)
    print("optimal policy")
    true_world.print_map(true_world.to_arrows(opt_policy))
import random

seed = 1222
np.random.seed(seed)
random.seed(seed)

world = gw.create_safety_lava_world_nowalls()
print("rewards")
world.print_rewards()

V = mdp.value_iteration(world)
Q = mdp.compute_q_values(world, V)
print("values")
world.print_map(V)

opt_policy = mdp.find_optimal_policy(world, Q=Q)
print("optimal policy")
world.print_map(world.to_arrows(opt_policy))
print(opt_policy)

#feature colors for the lava world plot
lava_colors = [
    'black', 'tab:green', 'white', 'tab:red', 'tab:blue',
    'tab:gray', 'tab:green', 'tab:purple', 'tab:orange', 'tab:cyan'
]
mdp_plot.plot_optimal_policy_vav(opt_policy, world.features, walls=True,
                                 show=False, arrow_color='k',
                                 feature_colors=lava_colors)
def get_machine_teaching_mdps(self):
    constraint_set = self.family_halfspaces
    candidate_mdps = self.mdp_family
    candidate_halfspaces = self.mdp_halfspaces

    #create boolean bookkeeping to see what has been covered in the set
    covered = [False for _ in constraint_set]

    #for each candidate MDP, check how many uncovered set elements it covers
    #and greedily take the one that adds the most new covers
    total_covered = 0
    opt_mdps = []
    while total_covered < len(constraint_set):
        if self.debug:
            print("set cover iteration")
        constraints_to_add = None
        best_mdp = None
        max_count = 0
        for i, mdp_env in enumerate(candidate_mdps):
            # if self.debug:
            #     print("-" * 20)
            #     print("MDP", i)
            #     V = mdp.value_iteration(mdp_env, epsilon=self.precision)
            #     Qopt = mdp.compute_q_values(mdp_env, V=V, eps=self.precision)
            #     opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=self.precision)
            #     print("rewards")
            #     mdp_env.print_rewards()
            #     print("value function")
            #     mdp_env.print_map(V)
            #     print("mdp features")
            #     utils.display_onehot_state_features(mdp_env)
            #     print("optimal policy")
            #     mdp_env.print_map(mdp_env.to_arrows(opt_policy))
            #     print("halfspace")
            #     print(candidate_halfspaces[i])

            #get the halfspaces induced by an optimal policy in this MDP
            constraints_new = candidate_halfspaces[i]
            count = self.count_new_covers(constraints_new, constraint_set, covered)
            #if self.debug: print("covered", count)
            if count > max_count:
                max_count = count
                constraints_to_add = constraints_new
                best_mdp = mdp_env
                if self.debug:
                    print()
                    print("best mdp so far")
                    print("-" * 20)
                    print("MDP", i)
                    V = mdp.value_iteration(mdp_env, epsilon=self.precision)
                    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=self.precision)
                    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt,
                                                         epsilon=self.precision)
                    print("rewards")
                    mdp_env.print_rewards()
                    print("value function")
                    mdp_env.print_map(V)
                    print("mdp features")
                    utils.display_onehot_state_features(mdp_env)
                    print("optimal policy")
                    mdp_env.print_map(mdp_env.to_arrows(opt_policy))
                    print("halfspace")
                    print(constraints_to_add)
                    print("covered", count)

        #update covered flags and add the best MDP to the teaching set
        opt_mdps.append(best_mdp)
        covered = self.update_covered_constraints(constraints_to_add,
                                                  constraint_set, covered)
        total_covered += max_count
        #TODO: optimize by removing MDPs from the candidates once added to opt_mdps
    return opt_mdps
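# The greedy cover above leans on two helpers that are not shown here. A
# minimal sketch of what they are assumed to do; the repo's versions likely
# compare halfspace normal vectors up to self.precision rather than exactly
# (the _sketch method names are hypothetical):
def count_new_covers_sketch(self, constraints_new, constraint_set, covered):
    #number of not-yet-covered constraints matched by constraints_new
    return sum(1 for j, c in enumerate(constraint_set)
               if not covered[j]
               and any(np.array_equal(c, cn) for cn in constraints_new))

def update_covered_constraints_sketch(self, constraints_to_add, constraint_set, covered):
    #flag every constraint matched by the newly added halfspaces
    for j, c in enumerate(constraint_set):
        if not covered[j] and any(np.array_equal(c, cn) for cn in constraints_to_add):
            covered[j] = True
    return covered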
eval_policies = []
eval_Qvalues = []
eval_weights = []
eval_halfspaces = []
all_halfspaces = []
num_eval_policies = 0
for i in range(num_eval_policies_tries):
    rand_world = copy.deepcopy(world)
    #print("trying", i)
    #change the reward weights
    eval_weight_vector = random_weights(num_features)
    rand_world.weights = eval_weight_vector
    #find the optimal policy under this MDP
    Qval = mdp.compute_q_values(rand_world, eps=precision)
    eval_policy = mdp.find_optimal_policy(rand_world, Q=Qval, epsilon=precision)
    #only save if not equal to a previously found eval policy
    if eval_policy not in eval_policies:
        if debug:
            print("found distinct eval policy")
            print("weights", eval_weight_vector)
            rand_world.print_map(rand_world.to_arrows(eval_policy))
        eval_policies.append(eval_policy)
        eval_Qvalues.append(Qval)
        eval_weights.append(eval_weight_vector)
        teacher = machine_teaching.StateActionRankingTeacher(rand_world, debug=False,
                                                             epsilon=precision)
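# random_weights above (also used when generating the true reward) is assumed
# to sample a direction uniformly from the unit sphere by normalizing a
# Gaussian draw; the repo's version may normalize differently (e.g. L1 norm or
# sign constraints). A minimal sketch under that assumption:
def _random_weights_sketch(num_features):
    import numpy as np
    w = np.random.randn(num_features)
    return w / np.linalg.norm(w)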