Example No. 1
    print("--"*10)
    state_features = mdp_grid
    terminals = mdp_gen.get_terminals_from_grid(term_grid)
    #print("state features\n",state_features)
    state_features = mdp_gen.categorical_to_one_hot_features(state_features, num_features)
    print('one hot features', state_features)

    world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
    mdp_family.append(world)
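
# For reference, a minimal self-contained sketch of what a categorical-to-one-hot feature
# conversion like mdp_gen.categorical_to_one_hot_features above typically does; the
# (rows, cols, num_features) output layout is an assumption, not necessarily the repo's format.
import numpy as np

def categorical_to_one_hot(grid, num_features):
    """Map an (R, C) grid of integer feature ids to an (R, C, num_features) one-hot array."""
    grid = np.asarray(grid)
    one_hot = np.zeros(grid.shape + (num_features,))
    rows, cols = np.indices(grid.shape)
    one_hot[rows, cols, grid] = 1.0
    return one_hot

# e.g. categorical_to_one_hot([[0, 2], [1, 1]], 3)[0, 1] -> array([0., 0., 1.])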

# solve each MDP in the family and plot the optimal policies for visualization
all_opts = []
all_features = []
for mdp_env in mdp_family:
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q=Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)
    #input()
filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1, 3, filename=filename)
#plt.show()

family_teacher = machine_teaching.MdpFamilyTeacher(mdp_family, precision, debug)
mdp_set_cover = family_teacher.get_machine_teaching_mdps()
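
# mdp_set_cover is not used above; here is a minimal sketch of how the returned teaching MDPs
# could be inspected, assuming get_machine_teaching_mdps() returns a subset of mdp_family
# (a list of MDP objects). The output path below is illustrative, not from the original script.
print("machine teaching set cover size:", len(mdp_set_cover))
teaching_opts = []
teaching_features = []
for teaching_mdp in mdp_set_cover:
    V = mdp.value_iteration(teaching_mdp, epsilon=precision)
    Qopt = mdp.compute_q_values(teaching_mdp, V=V, eps=precision)
    teaching_opts.append(mdp.find_optimal_policy(teaching_mdp, Q=Qopt, epsilon=precision))
    teaching_features.append(teaching_mdp.features)
mdp_plot.plot_optimal_policy_vav_grid(teaching_opts, teaching_features, 1, len(teaching_opts),
                                      filename="./data_analysis/figs/twoXtwo/teaching_set.png")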

Example No. 2

state_features = eutils.create_random_features_row_col_m(
    num_rows, num_cols, num_features)
#print("state features\n", state_features)
true_weights = random_weights(num_features)
print("true weights: ", true_weights)
true_world = mdp.LinearFeatureGridWorld(state_features, true_weights,
                                        initials, terminals, gamma)
print("rewards")
true_world.print_rewards()
print("value function")
V = mdp.value_iteration(true_world)
true_world.print_map(V)
print("mdp features")
utils.display_onehot_state_features(true_world)
# find the optimal policy under this MDP
Qopt = mdp.compute_q_values(true_world, V=V)
opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
print("optimal policy")
true_world.print_map(true_world.to_arrows(opt_policy))
#input()
# now find a bunch of other optimal policies for the same MDP but with different weight vectors
# TODO: is there a better way to create these eval policies? Can we solve for all of them
# efficiently, or should they all stay close to the true reward (e.g. weights sampled from a
# Gaussian centered on the true weights)?
world = copy.deepcopy(true_world)
eval_policies = []
eval_Qvalues = []
eval_weights = []
num_eval_policies = 0
for i in range(num_eval_policies_tries):
    #print("trying", i)
    # change the reward weights
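    # a sketch of one way this loop body could be filled in, following the TODO above:
    # perturb the true weights with Gaussian noise (the 0.5 scale is a guess), re-solve,
    # and keep any policy that differs from the optimal one. This is not the original
    # implementation; it assumes numpy is imported as np and builds a fresh world via the
    # documented constructor rather than mutating the deep-copied `world`.
    eval_w = true_weights + np.random.normal(0.0, 0.5, size=num_features)
    eval_w = eval_w / np.linalg.norm(eval_w)  # keep weights unit-norm (assumption)
    eval_world = mdp.LinearFeatureGridWorld(state_features, eval_w, initials, terminals, gamma)
    V_eval = mdp.value_iteration(eval_world)
    Q_eval = mdp.compute_q_values(eval_world, V=V_eval)
    eval_policy = mdp.find_optimal_policy(eval_world, Q=Q_eval)
    if eval_policy != opt_policy:
        eval_policies.append(eval_policy)
        eval_Qvalues.append(Q_eval)
        eval_weights.append(eval_w)
        num_eval_policies += 1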

Example No. 3

# excerpt from inside an experiment loop: the random seed depends on init_seed and the
# current repetition index r_iter
gamma = 0.9
seed = init_seed + r_iter
print("seed", seed)
np.random.seed(seed)
random.seed(seed)

# first, generate a random MDP
state_features = eutils.create_random_features_row_col_m(
    num_rows, num_cols, num_features)
#print("state features\n", state_features)
true_weights = random_weights(num_features)
true_world = mdp.LinearFeatureGridWorld(state_features,
                                        true_weights, initials,
                                        terminals, gamma)
V = mdp.value_iteration(true_world, epsilon=precision)
Qopt = mdp.compute_q_values(true_world, V=V, eps=precision)
opt_policy = mdp.find_optimal_policy(true_world,
                                     Q=Qopt,
                                     epsilon=precision)

if debug:
    print("true weights: ", true_weights)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    true_world.print_map(V)
    print("mdp features")
    utils.display_onehot_state_features(true_world)
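
# For reference, a generic sketch of the value iteration these examples rely on;
# mdp.value_iteration is the repo's own implementation, and the transition/reward interfaces
# and stopping rule here are assumptions made only to illustrate the algorithm.
def value_iteration_sketch(states, actions, transition, reward, gamma, epsilon=1e-4):
    """transition(s, a) -> list of (prob, next_state) pairs; reward(s) -> float."""
    V = {s: 0.0 for s in states}
    while True:
        delta = 0.0
        for s in states:
            best = max(
                reward(s) + gamma * sum(p * V[s2] for p, s2 in transition(s, a))
                for a in actions
            )
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta <= epsilon:
            return V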