    #First let's generate a random MDP
    state_features = eutils.create_random_features_row_col_m(
        num_rows, num_cols, num_features)
    #print("state features\n",state_features)
    true_weights = random_weights(num_features)
    print("true weights: ", true_weights)
    true_world = mdp.LinearFeatureGridWorld(state_features, true_weights,
                                            initials, terminals, gamma)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    V = mdp.value_iteration(true_world)
    true_world.print_map(V)
    print("mdp features")
    utils.display_onehot_state_features(true_world)
    #find the optimal policy under this MDP
    Qopt = mdp.compute_q_values(true_world, V=V)
    opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
    print("optimal policy")
    true_world.print_map(true_world.to_arrows(opt_policy))
    #input()
    #now find a bunch of other optimal policies for the same MDP but with different weight vectors.
    #TODO: I wonder if there is a better way to create these eval policies?
    # Can we efficiently solve for all of them, or should they all be close? (e.g. rewards sampled from a gaussian centered on the true reward?)
    world = copy.deepcopy(true_world)
    eval_policies = []
    eval_Qvalues = []
    eval_weights = []
    num_eval_policies = 0
    for i in range(num_eval_policies_tries):
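        # NOTE: the body of this loop is cut off on the source page. What follows is a
        # hypothetical sketch (not the original code) of one way to generate the eval
        # policies using only the constructor and solver calls shown above: sample fresh
        # weights (or, per the TODO, a gaussian perturbation of true_weights), re-solve
        # the MDP, and keep policies not seen before (assumes policies compare with ==).
        sampled_weights = random_weights(num_features)
        eval_world = mdp.LinearFeatureGridWorld(state_features, sampled_weights,
                                                initials, terminals, gamma)
        V_eval = mdp.value_iteration(eval_world)
        Q_eval = mdp.compute_q_values(eval_world, V=V_eval)
        eval_policy = mdp.find_optimal_policy(eval_world, Q=Q_eval)
        if eval_policy not in eval_policies:
            eval_policies.append(eval_policy)
            eval_Qvalues.append(Q_eval)
            eval_weights.append(sampled_weights)
            num_eval_policies += 1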
Example No. 2
#let's try out BIRL on a simpler version and see what happens

#first let's give a demo in the A version that doesn't have lava

mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)

print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)
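
# solve_mdp_lp above presumably returns state-action occupancies u(s, a) (hence the
# variable name u_sa_A and print_policy_from_occupancies). For reference, here is a
# self-contained sketch, not the repo's implementation, of the standard
# occupancy-measure LP such a solver typically sets up:
#   max_u sum_{s,a} u(s,a) r(s)
#   s.t.  sum_a u(s,a) - gamma * sum_{s',a'} P(s|s',a') u(s',a') = p0(s),   u >= 0
import numpy as np
from scipy.optimize import linprog

def solve_occupancy_lp(P, r, p0, gamma):
    """P: (A, S, S) transition probabilities, r: (S,) state rewards,
    p0: (S,) initial state distribution. Returns occupancies u with shape (S, A)."""
    A, S = P.shape[0], P.shape[1]
    c = -np.repeat(r, A)                 # linprog minimizes, so negate the objective
    A_eq = np.zeros((S, S * A))
    for s in range(S):
        for a in range(A):
            A_eq[s, s * A + a] += 1.0                   # flow out of state s
            A_eq[:, s * A + a] -= gamma * P[a, s, :]    # discounted flow into successors
    res = linprog(c, A_eq=A_eq, b_eq=p0, bounds=(0, None), method="highs")
    return res.x.reshape(S, A)

# An optimal deterministic policy can then be read off as argmax_a u(s, a) per state,
# which is presumably what print_policy_from_occupancies does.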

#generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
#generate demo for my simplified world

demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)
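
# rollout_from_usa(start_state, horizon, u_sa, mdp_env) presumably samples a trajectory
# from the stochastic policy induced by the occupancies, pi(a|s) proportional to u(s, a).
# A sketch of that idea as a hypothetical standalone helper (not the repo's
# implementation; reuses numpy imported above):
def rollout_from_occupancies(u, P, start_state, horizon, terminals=(), rng=None):
    """u: (S, A) occupancies, P: (A, S, S) transition probabilities.
    Returns a list of (state, action) pairs."""
    rng = rng if rng is not None else np.random.default_rng()
    S, A = u.shape
    traj, s = [], start_state
    for _ in range(horizon):
        mass = u[s].sum()
        probs = u[s] / mass if mass > 0 else np.ones(A) / A   # uniform fallback off the support
        a = rng.choice(A, p=probs)
        traj.append((s, a))
        if s in terminals:
            break
        s = rng.choice(S, p=P[a, s])                          # sample the next state
    return traj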

#Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.

beta = 100.0
step_stdev = 0.1
burn = 100
skip = 5
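
# The BIRL call itself is cut off on the source page. As a point of reference, here is a
# generic, self-contained sketch (not the repo's API) of the random-walk Metropolis loop
# these hyperparameters usually parameterize: beta is the Boltzmann rationality of the
# demonstrator's likelihood, step_stdev the proposal width, and burn/skip the burn-in and
# thinning of the chain. log_likelihood is a placeholder the caller supplies, e.g. a
# softmax-over-Q likelihood at inverse temperature beta evaluated on `demonstrations`.
def mcmc_birl(log_likelihood, num_features, num_samples=2000,
              step_stdev=0.1, burn=100, skip=5, seed=0):
    rng = np.random.default_rng(seed)
    w = rng.standard_normal(num_features)
    w /= np.linalg.norm(w)          # keep weights on the unit sphere; reward scale is absorbed by beta
    cur_ll = log_likelihood(w)
    chain = []
    for _ in range(num_samples):
        prop = w + step_stdev * rng.standard_normal(num_features)
        prop /= np.linalg.norm(prop)
        prop_ll = log_likelihood(prop)
        if np.log(rng.random()) < prop_ll - cur_ll:   # Metropolis acceptance test
            w, cur_ll = prop, prop_ll
        chain.append(w.copy())
    return np.array(chain)[burn::skip]                # drop burn-in, then thin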
Example No. 3
import numpy as np

train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows,
                                                 num_cols,
                                                 num_features,
                                                 unseen_feature=False)
train_mdp.set_reward_fn(np.array([-.1, -.6, -.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")

print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)
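
# The per-state rewards printed above presumably come from the dot product of each
# state's one-hot feature vector with the weight vector passed to set_reward_fn.
# A tiny standalone illustration with hypothetical shapes:
#   phi: (num_states, num_features) one-hot features, w: (num_features,) weights
phi_example = np.eye(6)[[0, 1, 4, 5]]                # one-hot features for 4 hypothetical states
w_example = np.array([-.1, -.6, -.1, -0.6, -2, 0])   # the weights used above
r_example = phi_example @ w_example                  # per-state rewards: -0.1, -0.6, -2.0, 0.0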

init_demo_states = [0, 9, 90, 99]  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    for s in init_demo_states:
        #s = init_demo_state #mdp_env.init_states[0] # only one initial state
        demo = utils.rollout_from_usa(s, demo_horizon, opt_sa_train, train_mdp)
        print("demo", d, demo)