# Shared imports for the example variants below. The irl.* module paths follow
# the in-function imports used in the last variant; find_policy, optimal_value
# and value are assumed to live in the package's value_iteration module.
import numpy as np
import matplotlib.pyplot as plt

import irl.maxent as maxent
import irl.deep_maxent as deep_maxent
import irl.mdp.objectworld as objectworld
from irl.value_iteration import find_policy, optimal_value, value


def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions,
        e.g., () is no neural network (linear maximum entropy) and (3, 4) is
        two hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,
                        ow.n_actions, discount, ow.transition_probability,
                        trajectories, epochs, learning_rate, l1=l1, l2=l2)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
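
# Illustrative invocation of the deep maximum entropy variant above, assuming
# this variant is saved as its own script. The argument values (grid size,
# discount, network structure, etc.) are assumptions chosen for a quick run,
# not values taken from the source.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
         n_trajectories=20, epochs=50, learning_rate=0.01, structure=(3, 3))
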
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])

    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)

    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
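
# Minimal sketch of how this (debug-print) variant could be invoked, assuming
# it is saved as its own script; the argument values are illustrative, not
# taken from the source.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
         n_trajectories=20, epochs=200, learning_rate=0.01)
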
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location (x, y) to generate a trajectory from. tuple.
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ow.plot_grid()
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    print("Policy = ", policy.shape)
    # print("policy - {}".format(policy))

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print("trajectories = ", trajectories.shape)
    # for t in trajectories:
    #     ow.plot_grid("trajectory_{}.png".format(t), t)
    # for t in trajectories:
    #     for s, a, r in t:
    #         print(ow.int_to_point(s), ow.actions[a], r)
    #     print("---------")

    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, r, ow.discount,
                                   stochastic=False)
    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))
    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
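
# Illustrative invocation of the variant above, assuming it is saved as its
# own script; the values, including the (0, 0) start state, are assumptions
# chosen for demonstration.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, start_state=(0, 0))
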
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent",
         mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location (x, y) to generate trajectories from. tuple.
    wind: Probability of taking a random action. float.
    algo: IRL algorithm to run. Currently supports "maxent" and "deep_maxent".
    mdp: MDP to use, either "objectworld" or "gridworld".
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    # normalize is assumed to be a project helper that rescales rewards
    # before value iteration; it is not defined in this file.
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
    #                                                epochs, wind),
    #              value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    # find_svf is needed for both algorithms, so import maxent unconditionally.
    import irl.maxent as maxent

    ground_svf = maxent.find_svf(ow.n_states, trajectories)
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxent":
        import irl.deep_maxent as deep_maxent

        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(
    #     algo, n_trajectories, epochs, wind), value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
        policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)
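
# Illustrative invocation of the combined maxent/deep_maxent variant above,
# assuming it is saved as its own script; all argument values here are
# assumptions. With mdp="gridworld", n_objects and n_colours are ignored by
# the environment constructor.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, start_state=(0, 0),
         wind=0.1, algo="maxent", mdp="gridworld")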