if __name__ == "__main__":
    seed = 1234
    np.random.seed(seed)
    scipy.random.seed(seed)
    random.seed(seed)
    #mdp_env = mdp_worlds.two_state_chain()
    #demonstrations = [(1,0), (0,0)]

    # mdp_env = mdp_worlds.machine_teaching_toy_featurized()
    # demonstrations = [(2,3),(5,0),(4,0),(3,2)]

    mdp_env = mdp_worlds.lava_ambiguous_aaai18()
    u_sa = mdp.solve_mdp_lp(mdp_env)
    #generate demo from state 5 to terminal
    demonstrations = utils.rollout_from_usa(5, 10, u_sa, mdp_env)
    print(demonstrations)

    traj_demonstrations = [demonstrations]

    beta = 10.0
    step_stdev = 0.1
    birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)

    num_samples = 200
    burn = 50
    skip = 2
    map_w, map_u, r_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples)
    print("map_weights", map_w)
    map_r = np.dot(mdp_env.state_features, map_w)
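    # burn and skip are defined above but not applied in the snippet shown here. A
    # minimal sketch (an assumption, not part of the source) of discarding burn-in
    # samples and thinning the returned chains, assuming each chain holds one
    # posterior sample per row:
    r_samples = r_chain[burn::skip]  # drop the first `burn` samples, keep every `skip`-th
    u_samples = u_chain[burn::skip]
    print("posterior mean reward sample", np.mean(r_samples, axis=0))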
Example No. 2
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)

print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

#generate demo for Dylan's NeurIPS world
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
#generate demo for my simplified world

demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

#Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.

beta = 100.0
step_stdev = 0.1
burn = 100
skip = 5
num_samples = 2000
sample_norm = None
birl = bayesian_irl.BayesianIRL(mdp_env_A,
                                beta,
                                step_stdev,
                                debug=False,
                                mcmc_norm=sample_norm)  # keyword assumed here, following the mcmc_norm usage in the next example

Example No. 3
for i in range(num_trials):
    print("=" * 10)
    print("iteration", i)
    print("=" * 10)

    seed = init_seed + i * 13
    np.random.seed(seed)
    random.seed(seed)

    mdp_env = mdp_worlds.random_gridworld(num_rows, num_cols, num_features)
    opt_u_sa = mdp.solve_mdp_lp(mdp_env, debug=debug)
    true_r_sa = mdp_env.r_sa
    true_w = mdp_env.feature_weights

    # generate demonstration from the bottom-left corner
    demonstrations = utils.rollout_from_usa(demo_state, horizon, opt_u_sa,
                                            mdp_env)
    print("demonstration")
    print(demonstrations)

    ###Run Bayesian IRL to get posterior
    print("running B-IRL")
    birl = bayesian_irl.BayesianIRL(mdp_env,
                                    beta,
                                    step_stdev,
                                    debug=False,
                                    mcmc_norm=mcmc_norm)
    map_w, map_u_sa, w_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples, False)
    print("Birl complete")

    if debug:
        pass  # debug block body not shown in this excerpt
Example No. 4
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state #mdp_env.init_states[0] # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
    for s_a in demo:
        demo_set.add(s_a)
demonstrations = list(demo_set)
print("demonstration")
print(demonstrations)

state_feature_list = [tuple(fs) for fs in mdp_env.state_features]
pg.get_policy_string_from_trajectory(traj_demonstrations[0], state_feature_list, mdp_env)


    
Example No. 5
    print("Training MDP with No Lava")
    print("===========================")

    print("Optimal Policy")
    utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
    print("reward")
    utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
    print("features")
    utils.display_onehot_state_features(mdp_env_A)

    #generate demonstration from top left corner
    traj_demonstrations = []
    demo_set = set()
    for s in demo_states:  #range(mdp_env_A.get_num_states()):
        if mdp_env_A.init_dist[s] > 0:
            demo = utils.rollout_from_usa(s, demo_horizon, u_sa_A, mdp_env_A)
            traj_demonstrations.append(demo)
            for s_a in demo:
                demo_set.add(s_a)
    demonstrations = list(demo_set)
    print("demonstration")
    print(demonstrations)

    #CVaR stuff needs expected feature counts from a list of trajectories
    #traj_demonstrations = [demonstrations]
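    # Sketch of the expected feature counts mentioned above (illustration only; this
    # helper and the discount factor gamma are hypothetical, not part of the source).
    # Each trajectory is assumed to be a list of (state, action) pairs.
    def empirical_feature_counts(trajs, env, gamma=0.95):
        counts = np.zeros(len(env.state_features[0]))
        for traj in trajs:
            for t, (s, a) in enumerate(traj):
                counts += gamma ** t * np.asarray(env.state_features[s])
        return counts / len(trajs)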

    #Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
    birl = bayesian_irl.BayesianIRL(mdp_env_A,
                                    beta,
                                    step_stdev,
                                    debug=False,
Example No. 6
    print("Training MDP with No Lava")
    print("===========================")

    print("Optimal Policy")
    utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
    print("reward")
    utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
    print("features")
    utils.display_onehot_state_features(mdp_env_A)

    #generate demonstration from top left corner
    traj_demonstrations = []
    demo_set = set()
    for s in demo_states:  #range(mdp_env_A.get_num_states()):
        if mdp_env_A.init_dist[s] > 0:
            demo = utils.rollout_from_usa(s, 100, u_sa_A, mdp_env_A)
            traj_demonstrations.append(demo)
            for s_a in demo:
                demo_set.add(s_a)
    demonstrations = list(demo_set)
    print("demonstration")
    print(demonstrations)

    #CVaR stuff needs expected feature counts from a list of trajectories
    #traj_demonstrations = [demonstrations]

    #Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
    birl = bayesian_irl.BayesianIRL(mdp_env_A, beta, step_stdev, debug=False)

    map_w, map_u, r_chain, u_chain = birl.sample_posterior(
        demonstrations, num_samples, False)
Example No. 7
print("features")
utils.display_onehot_state_features(train_mdp)

import numpy as np

np.random.randint(60)

init_demo_states = [0, 9, 90, 99]  #mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    for s in init_demo_states:
        #s = init_demo_state #mdp_env.init_states[0] # only one initial state
        demo = utils.rollout_from_usa(s, demo_horizon, opt_sa_train, train_mdp)
        print("demo", d, demo)
        traj_demonstrations.append(demo)
        for s_a in demo:
            demo_set.add(s_a)
demonstrations = list(demo_set)
print("demonstration")
print(demonstrations)

state_feature_list = [tuple(fs) for fs in train_mdp.state_features]

#Now let's run Bayesian IRL on this demo in this mdp with a placeholder feature to see what happens.
beta = 10.0
step_stdev = 0.05

num_samples = 1000
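
# Sketch of a presumable next step (an assumption, mirroring the earlier examples):
# construct the sampler and draw the posterior with the hyperparameters defined above.
birl = bayesian_irl.BayesianIRL(train_mdp, beta, step_stdev, debug=False)
map_w, map_u, w_chain, u_chain = birl.sample_posterior(demonstrations, num_samples)
print("map_weights", map_w)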