import pickle as pkl

# Project-specific modules; exact import paths are assumed here and may need
# adjusting to the repository layout.
import maxent
import deep_maxent
from env import Env


def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Saves the recovered reward function to maxent_reward.pkl and returns it.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    # wind = 0.3
    trajectory_length = 268

    # gw = gridworld.Gridworld(grid_size, wind, discount)
    env = Env()
    trajectories = env.generate_trajectories(n_trajectories,
                                             trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()
    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)

    pkl.dump(r, open("maxent_reward.pkl", 'wb'))
    return r
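
# A minimal sketch (not part of the original pipeline) of how the reward saved
# above could be visualised, assuming a square gridworld and matplotlib being
# available. `plot_reward` and its default path are hypothetical.
def plot_reward(path="maxent_reward.pkl"):
    import numpy as np
    import matplotlib.pyplot as plt

    with open(path, 'rb') as f:
        r = np.asarray(pkl.load(f)).flatten()
    grid_size = int(round(len(r) ** 0.5))  # assumes n_states == grid_size ** 2
    plt.imshow(r.reshape((grid_size, grid_size)), cmap='viridis')
    plt.colorbar(label='recovered reward')
    plt.title('MaxEnt IRL reward')
    plt.show()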
def main(discount, n_objects, n_colours, n_trajectories, epochs, learning_rate,
         structure):
    # n_objects and n_colours are chosen arbitrarily.
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP.

    Saves the recovered reward function to deep_maxent_reward.pkl.

    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions,
        e.g., () is no neural network (linear maximum entropy) and (3, 4) is
        two hidden layers with dimensions 3 and 4.
    """

    trajectory_length = 268
    l1 = l2 = 0

    env = Env(n_objects, n_colours)

    # ground_r = np.array([env.reward_deep_maxent(s) for s in range(env.n_states)])
    # policy = find_policy(env.n_states, env.n_actions, env.transition_probability,
    #                      ground_r, discount, stochastic=False)
    # trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
    policy = env.get_policy()
    trajectories = env.generate_trajectories(2, trajectory_length,
                                             lambda s: policy[s])
    # feature_matrix = env.feature_matrix_deep_maxent(discrete=False)
    feature_matrix = env.feature_matrix()
    r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,
                        env.n_actions, discount, env.transition_probability,
                        trajectories, epochs, learning_rate, l1=l1, l2=l2)

    pkl.dump(r, open('deep_maxent_reward.pkl', 'wb'))
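
# A minimal usage sketch. The hyperparameter values below are placeholders
# chosen for illustration, not values taken from the original experiments.
if __name__ == '__main__':
    # Linear MaxEnt IRL on the gridworld environment.
    train(discount=0.9, n_trajectories=20, epochs=200, learning_rate=0.01)

    # Deep MaxEnt IRL on the objectworld environment; structure=(3, 3) means
    # two hidden layers of width 3.
    main(discount=0.9, n_objects=15, n_colours=2, n_trajectories=20,
         epochs=200, learning_rate=0.01, structure=(3, 3))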