def main(grid_size, discount, n_trajectories, epochs, learning_rate,
         trajectory_length, trust, expert_type, random_start):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the ground-truth reward next to the recovered reward.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    trajectory_length: Length of each sampled trajectory. int.
    trust: Probability that the chosen action is executed as intended;
        the gridworld's wind is 1 - trust. float.
    expert_type: Expert specification forwarded to gridworld.Gridworld
        (NOTE(review): semantics defined in the gridworld module — confirm).
    random_start: Whether sampled trajectories start in random states. bool.
    """
    # Wind is the chance the environment overrides the chosen action.
    wind = 1 - trust

    gw = gridworld.Gridworld(grid_size, wind, discount, expert_type)
    trajectories = gw.generate_trajectories(n_trajectories,
                                            trajectory_length,
                                            gw.optimal_policy,
                                            random_start=random_start)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Recover the reward with maximum-entropy IRL.
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)
    # print() function form works under both Python 2 and Python 3
    # (the original used the Python-2-only `print r...` statement).
    print(r.reshape((grid_size, grid_size)))

    # Side-by-side heatmaps of ground-truth vs. recovered reward.
    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
Q -= Q.max(axis=1).reshape((n_states, 1)) # For numerical stability. Q = np.exp(Q)/np.exp(Q).sum(axis=1).reshape((n_states, 1)) return Q def _policy(s): return max(range(n_actions), key=lambda a: sum(transition_probabilities[s, a, k] * (reward[k] + discount * v[k]) for k in range(n_states))) policy = np.array([_policy(s) for s in range(n_states)]) return policy if __name__ == '__main__': # Quick unit test using gridworld. import mdp.gridworld as gridworld gw = gridworld.Gridworld(3, 0.3, 0.9) v = value([gw.optimal_policy_deterministic(s) for s in range(gw.n_states)], gw.n_states, gw.transition_probability, [gw.reward(s) for s in range(gw.n_states)], gw.discount) assert np.isclose(v, [5.7194282, 6.46706692, 6.42589811, 6.46706692, 7.47058224, 7.96505174, 6.42589811, 7.96505174, 8.19268666], 1).all() opt_v = optimal_value(gw.n_states, gw.n_actions, gw.transition_probability, [gw.reward(s) for s in range(gw.n_states)], gw.discount) assert np.isclose(v, opt_v).all()