np.random.seed(seed)
random.seed(seed)

# Let's try out BIRL on a simpler version and see what happens.
# First, give a demo in the A version that doesn't have lava.
mdp_env_A = mdp_worlds.lava_ird_simplified_a()
# mdp_env_A = mdp_worlds.lava_ambiguous_ird_fig2a()
u_sa_A = mdp.solve_mdp_lp(mdp_env_A)
print("mdp A")
print("Policy")
utils.print_policy_from_occupancies(u_sa_A, mdp_env_A)
print("reward")
utils.print_as_grid(mdp_env_A.r_s, mdp_env_A)
print("features")
utils.display_onehot_state_features(mdp_env_A)

# Generate demo for Dylan's NeurIPS world.
# demonstrations = utils.rollout_from_usa(51, 15, u_sa_A, mdp_env_A)
# Generate demo for my simplified world.
demonstrations = utils.rollout_from_usa(10, 100, u_sa_A, mdp_env_A)
print("demonstration")
print(demonstrations)

# Now let's run Bayesian IRL on this demo in this MDP with a placeholder
# feature to see what happens.
beta = 100.0
step_stdev = 0.1
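# Aside: a minimal sketch of the Boltzmann-rational likelihood that BIRL
# typically uses, P(a | s, R) proportional to exp(beta * Q*(s, a)), so a larger
# beta models a more nearly-optimal demonstrator. The q_values array and the
# (s, a) pair format are illustrative assumptions, not necessarily this
# repo's API.
from scipy.special import logsumexp

def demo_log_likelihood(demonstrations, q_values, beta):
    """Log-likelihood of (state, action) demos under a softmax-in-Q model.

    q_values: shape (num_states, num_actions), computed for a candidate reward.
    """
    ll = 0.0
    for s, a in demonstrations:
        ll += beta * q_values[s, a] - logsumexp(beta * q_values[s])
    return ll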
print(demonstrations)
traj_demonstrations = [demonstrations]

beta = 10.0
step_stdev = 0.1
birl = bayesian_irl.BayesianIRL(mdp_env, beta, step_stdev, debug=False)

num_samples = 200
burn = 50
skip = 2
map_w, map_u, r_chain, u_chain = birl.sample_posterior(demonstrations, num_samples)
print("map_weights", map_w)
map_r = np.dot(mdp_env.state_features, map_w)
utils.print_as_grid(map_r, mdp_env)
# print("Map policy")
# utils.print_policy_from_occupancies(map_u, mdp_env)

# print("chain")
# for r in r_chain:
#     print(r)

worst_index = np.argmin(r_chain[:, 1])
print(r_chain[worst_index])
print(np.sum(r_chain[:, 1] < -0.82), "out of", len(r_chain))

r_chain_burned = r_chain[burn::skip]
# print("chain after burn and skip")
# for r in r_chain_burned:
#     print(r)
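# Aside: once the chain is burned in and thinned, posterior summaries are
# plain numpy reductions. A sketch, assuming r_chain_burned has shape
# (num_kept_samples, num_states) as the column indexing above suggests:
mean_r = np.mean(r_chain_burned, axis=0)           # posterior mean reward per state
low_r = np.percentile(r_chain_burned, 5, axis=0)   # pessimistic 5th-percentile reward
print("posterior mean reward")
utils.print_as_grid(mean_r, mdp_env)
print("5th-percentile reward")
utils.print_as_grid(low_r, mdp_env)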
    weights = utils.sample_l2_ball(num_features)
    print("weights", weights)

    gamma = 0.99
    # Let's look at all starting states for now.
    init_dist = np.ones(num_states) / num_states
    # init_states = [10]
    # for si in init_states:
    #     init_dist[si] = 1.0 / len(init_states)

    # No terminal states.
    term_states = []
    mdp_env = mdp.FeaturizedGridMDP(num_rows, num_cols, state_features,
                                    weights, gamma, init_dist, term_states)
    return mdp_env


if __name__ == "__main__":
    # mdp_env = machine_teaching_toy_featurized()
    # mdp_env = lava_ambiguous_aaai18()
    mdp_env = random_gridworld_corner_terminal(6, 6, 5)

    print("features")
    utils.display_onehot_state_features(mdp_env)

    u_sa = mdp.solve_mdp_lp(mdp_env, debug=True)
    print("optimal policy")
    utils.print_policy_from_occupancies(u_sa, mdp_env)
    print("optimal values")
    v = mdp.get_state_values(u_sa, mdp_env)
    utils.print_as_grid(v, mdp_env)
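# Aside: `utils.sample_l2_ball` above presumably draws random feature weights
# of unit L2 norm. One common implementation (an assumption, not necessarily
# the repo's) normalizes a Gaussian draw, which is uniform on the unit sphere:
def sample_l2_ball_sketch(num_features):
    w = np.random.randn(num_features)  # isotropic Gaussian is rotation-invariant,
    return w / np.linalg.norm(w)       # so the normalized draw is uniform on the sphere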
step_stdev = 0.2
burn = 50
skip = 5
num_samples = 200
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state  # mdp_env.init_states[0]  # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
    for s_a in demo:
        demo_set.add(s_a)
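# Aside: a sketch of what `utils.rollout_from_usa` plausibly does: act
# greedily with respect to the state-action occupancies, then sample the next
# state. The action-major flattening u_sa[a * num_states + s] and a
# transitions array of shape (num_actions, num_states, num_states) are
# illustrative assumptions, not the repo's confirmed layout.
def rollout_sketch(s0, horizon, u_sa, transitions):
    num_actions, num_states, _ = transitions.shape
    s, traj = s0, []
    for _ in range(horizon):
        a = int(np.argmax(u_sa[s::num_states]))  # occupancies of all actions at s
        traj.append((s, a))
        s = np.random.choice(num_states, p=transitions[a, s])
    return traj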
step_stdev = 0.2
burn = 500
skip = 5
num_samples = 2000
mcmc_norm = "l2"
likelihood = "birl"

mdp_env = mdp_worlds.lava_ambiguous_corridor()
opt_sa = mdp.solve_mdp_lp(mdp_env)
print("Cliff world")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa, mdp_env)
print("reward")
utils.print_as_grid(mdp_env.r_s, mdp_env)
print("features")
utils.display_onehot_state_features(mdp_env)

init_demo_state = 1  # mdp_env.num_cols * (mdp_env.num_rows - 1)
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    s = init_demo_state  # mdp_env.init_states[0]  # only one initial state
    demo = utils.rollout_from_usa(s, demo_horizon, opt_sa, mdp_env)
    print("demo", d, demo)
    traj_demonstrations.append(demo)
    for s_a in demo:
        demo_set.add(s_a)
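# Aside: with the demos collected, a standard feature-based IRL quantity is
# the empirical feature count. A sketch, assuming mdp_env.state_features is a
# (num_states, num_features) array and each demo element is an (s, a) pair,
# as the rollouts above suggest:
feature_counts = np.zeros(mdp_env.state_features.shape[1])
for demo in traj_demonstrations:
    for s, _ in demo:
        feature_counts += mdp_env.state_features[s]
print("empirical feature counts over demos", feature_counts)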
num_features = 6
train_mdp = mdp_worlds.negative_sideeffects_goal(num_rows, num_cols, num_features, unseen_feature=False)
train_mdp.set_reward_fn(np.array([-0.1, -0.6, -0.1, -0.6, -2, 0]))
opt_sa_train = mdp.solve_mdp_lp(train_mdp)
print("===========================")
print("Training MDP with No Lava")
print("===========================")
print("Optimal Policy")
utils.print_policy_from_occupancies(opt_sa_train, train_mdp)
print("reward")
utils.print_as_grid(train_mdp.r_s, train_mdp)
print("features")
utils.display_onehot_state_features(train_mdp)

# Demos now start from several initial states rather than a single one.
init_demo_states = [0, 9, 90, 99]
traj_demonstrations = []
demo_set = set()
for d in range(num_demos):
    # np.random.seed(init_seed + d)
    # random.seed(init_seed + d)
    for s in init_demo_states:
        # s = init_demo_state  # mdp_env.init_states[0]  # only one initial state