def expected_value_difference(n_states, n_actions, transition_probability,
                              reward, discount, p_start_state, optimal_value,
                              true_reward):
    """
    Calculate the expected value difference, which is a proxy to how good a
    recovered reward function is.

    n_states: Number of states. int.
    n_actions: Number of actions. int.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    reward: Reward vector mapping state int to reward. Shape (N,).
    discount: Discount factor. float.
    p_start_state: Probability vector with the ith component as the probability
        that the ith state is the start state. Shape (N,).
    optimal_value: Value vector for the ground reward with optimal policy. The
        ith component is the value of the ith state. Shape (N,).
    true_reward: True reward vector. Shape (N,).
    -> Expected value difference. float.
    """

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, reward,
                                         discount)
    value = value_iteration.value(policy.argmax(axis=1), n_states,
                                  transition_probability, true_reward,
                                  discount)

    evd = optimal_value.dot(p_start_state) - value.dot(p_start_state)
    return evd
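# Illustrative sketch (not part of the original module): one way
# expected_value_difference might be called to score a recovered reward on a
# gridworld. It relies on the same module-level names the functions in this
# collection already use (numpy as np, gridworld, value_iteration); the
# function name and `recovered_r` argument are hypothetical.
def example_evd_on_gridworld(recovered_r, grid_size=5, wind=0.3, discount=0.9):
    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    # Uniform start-state distribution, purely for illustration.
    p_start_state = np.ones(gw.n_states) / gw.n_states
    optimal_value = value_iteration.optimal_value(gw.n_states, gw.n_actions,
                                                  gw.transition_probability,
                                                  ground_r, discount)
    return expected_value_difference(gw.n_states, gw.n_actions,
                                     gw.transition_probability, recovered_r,
                                     discount, p_start_state, optimal_value,
                                     ground_r)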
def main(grid_size, discount, L):
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Alternative: recover R from a hand-specified deterministic optimal policy.
    # policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]

    # Find the optimal policy via reinforcement learning (value iteration).
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)

    # Need a value function for each basis function.
    feature_matrix = gw.feature_matrix()
    values = []
    for dim in range(feature_matrix.shape[1]):
        reward = feature_matrix[:, dim]
        values.append(value(policy, gw.n_states, gw.transition_probability,
                            reward, gw.discount))
    values = np.array(values).T

    rl1, rl2, rl1l2 = linear_irl.large_irl(values, gw.transition_probability,
                                           feature_matrix, gw.n_states,
                                           gw.n_actions, policy, L)
    return ground_r, rl1, rl2, rl1l2
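# Illustrative driver (assumed, not from the original script): run the
# large-state linear-programming IRL example above and compare the three
# recovered rewards against the ground truth. All parameter values are
# placeholders.
if __name__ == '__main__':
    grid_size = 10
    ground_r, rl1, rl2, rl1l2 = main(grid_size, discount=0.9, L=5)
    for i, (title, r) in enumerate([("Groundtruth reward", ground_r),
                                    ("L1 recovery", rl1),
                                    ("L2 recovery", rl2),
                                    ("L1+L2 recovery", rl1l2)]):
        plt.subplot(1, 4, i + 1)
        plt.pcolor(r.reshape((grid_size, grid_size)))
        plt.title(title)
    plt.show()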
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):
    """
    Run deep maximum entropy inverse reinforcement learning on the objectworld
    MDP. Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    structure: Neural network structure. Tuple of hidden layer dimensions,
        e.g., () is no neural network (linear maximum entropy) and (3, 4) is
        two hidden layers with dimensions 3 and 4.
    """

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = deep_maxent.irl((feature_matrix.shape[1],) + structure, feature_matrix,
                        ow.n_actions, discount, ow.transition_probability,
                        trajectories, epochs, learning_rate, l1=l1, l2=l2)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])

    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)

    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))

    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
def find_expected_svf(n_states, r, n_actions, discount,
                      transition_probability, trajectories):
    """
    Find the expected state visitation frequencies (SVF) using algorithm 1 from
    Ziebart et al. 2008.

    n_states: Number of states N. int.
    r: Reward. NumPy array with shape (N,).
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions are
        ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    -> Expected state visitation frequencies vector with shape (N,).
    """

    n_trajectories = trajectories.shape[0]  # Number of trajectories.
    trajectory_length = trajectories.shape[1]  # Trajectory length.

    # policy = find_policy(n_states, r, n_actions, discount,
    #                      transition_probability)
    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)
    # policy[i, j] is the probability of taking action j in state i.

    start_state_count = np.zeros(n_states)
    for trajectory in trajectories:
        start_state_count[trajectory[0, 0]] += 1
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    for t in range(1, trajectory_length):
        expected_svf[:, t] = 0
        for i, j, k in product(range(n_states), range(n_actions),
                               range(n_states)):
            expected_svf[k, t] += (expected_svf[i, t-1] *
                                   policy[i, j] *  # Stochastic policy.
                                   transition_probability[i, j, k])

    return expected_svf.sum(axis=1)
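# Optional vectorised variant (a sketch, not part of the original module): the
# triple loop above is equivalent to a single tensor contraction per time step,
# which is much faster for larger state spaces. Same inputs and output as
# find_expected_svf; the function name is hypothetical.
def find_expected_svf_vectorised(n_states, r, n_actions, discount,
                                 transition_probability, trajectories):
    n_trajectories = trajectories.shape[0]
    trajectory_length = trajectories.shape[1]

    policy = value_iteration.find_policy(n_states, n_actions,
                                         transition_probability, r, discount)

    start_state_count = np.bincount(trajectories[:, 0, 0], minlength=n_states)
    p_start_state = start_state_count / n_trajectories

    expected_svf = np.tile(p_start_state, (trajectory_length, 1)).T
    for t in range(1, trajectory_length):
        # svf[k, t] = sum_{i, a} svf[i, t-1] * policy[i, a] * P[i, a, k]
        expected_svf[:, t] = np.einsum('i,ia,iak->k', expected_svf[:, t - 1],
                                       policy, transition_probability)
    return expected_svf.sum(axis=1)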
def main(grid_size, discount, L, trust):
    # L: L1 regularisation coefficient.
    wind = 1 - trust  # Probability that the expert takes a random action.
    trajectory_length = 3 * grid_size  # Maximum trajectory length.

    gw = gridworld.Gridworld(grid_size, wind, discount)
    # Ground-truth reward function.
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Alternatives: a hand-specified stochastic (non-deterministic) policy,
    # which works less well, or a hand-specified deterministic policy, which
    # works well.
    # policy = [gw.optimal_policy_stochastic(s) for s in range(gw.n_states)]
    # policy = [gw.optimal_policy_deterministic(s) for s in range(gw.n_states)]

    # Find the optimal policy via reinforcement learning (value iteration).
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)

    # Rmax = 1; the L1 coefficient L is variable.
    rl1, rl2, rl1l2 = linear_irl.irl(gw.n_states, gw.n_actions,
                                     gw.transition_probability, policy,
                                     gw.discount, 1, L)
    return ground_r, rl1, rl2, rl1l2
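# Illustrative sweep (assumed, not from the original script): run the linear
# programming IRL example above for a few expert-reliability levels and report
# a crude L1 error between the normalised recovered reward and the ground
# truth, mirroring the error metric used in the preprocessing experiment later
# in this collection. All parameter values are placeholders.
if __name__ == '__main__':
    for trust in (0.5, 0.7, 0.9):
        ground_r, rl1, rl2, rl1l2 = main(grid_size=5, discount=0.9, L=1.0,
                                         trust=trust)
        # Normalise before comparing; guard against a degenerate all-zero recovery.
        rl1_scaled = rl1 / max(abs(rl1).max(), 1e-12)
        print("trust={}: L1 error of L1-regularised recovery = {:.3f}".format(
            trust, abs(rl1_scaled - ground_r).sum()))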
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
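# Illustrative driver (assumed, not the authors' settings): run the maximum
# entropy objectworld example above with small placeholder parameters.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_objects=15, n_colours=2,
         n_trajectories=20, epochs=50, learning_rate=0.01)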
def test_ow_once(grid_size, n_objects, n_colours, discrete, l1, l2, n_samples,
                 epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on an objectworld of size grid_size with
    n_samples paths.

    grid_size: Grid size. int.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    discrete: Whether the features should be discrete. bool.
    l1: L1 regularisation. float.
    l2: L2 regularisation. float.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a 3-layer
        neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3 * grid_size

    # Make the objectworld and associated data.
    ow = Objectworld(grid_size, n_objects, n_colours, wind, discount)
    feature_matrix = ow.feature_matrix(discrete)
    ground_reward = np.array([ow.reward(i) for i in range(ow.n_states)])
    optimal_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                 ow.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = ow.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=ow.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(ow.n_states, ow.n_actions,
                                              ow.transition_probability,
                                              ground_reward, ow.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), feature_matrix,
                                    ow.n_actions, ow.discount,
                                    ow.transition_probability, trajectories,
                                    epochs, learning_rate, l1=l1, l2=l2)
    maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                ow.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, ow.n_states,
                                     ow.transition_probability, ground_reward,
                                     ow.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_learning_rate = 0.005  # For the 32 x 32 experiments.
    deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                                         feature_matrix, ow.n_actions,
                                         ow.discount,
                                         ow.transition_probability,
                                         trajectories, epochs,
                                         deep_learning_rate, l1=l1, l2=l2)
    deep_maxent_policy = value_iteration.find_policy(ow.n_states, ow.n_actions,
                                                     ow.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, ow.n_states,
                                          ow.transition_probability,
                                          ground_reward, ow.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    plt.subplot(3, 3, 1)
    plt.pcolor(ground_reward.reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 2)
    plt.pcolor(maxent_reward.reshape((grid_size, grid_size)))
    plt.title("MaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 3)
    plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 4)
    plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("Optimal policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 5)
    plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("MaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 6)
    plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), vmin=0,
               vmax=3)
    plt.title("DeepMaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 7)
    plt.pcolor(optimal_V.reshape((grid_size, grid_size)))
    plt.title("Optimal value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 8)
    plt.pcolor(maxent_V.reshape((grid_size, grid_size)))
    plt.title("MaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 9)
    plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)

    plt.savefig("{}_{}_{}_{}_{}_{}_{}_{}_{}_objectworld_{}.png".format(
        grid_size, n_objects, n_colours, discrete, n_samples, epochs,
        structure, l1, l2, np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
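# Illustrative driver (assumed): run the objectworld comparison once and report
# the expected value differences for MaxEnt and DeepMaxEnt. The parameter
# values are placeholders, not the settings used for any published experiments.
if __name__ == '__main__':
    maxent_EVD, deep_maxent_EVD = test_ow_once(grid_size=10, n_objects=15,
                                               n_colours=2, discrete=False,
                                               l1=0, l2=0, n_samples=32,
                                               epochs=100, structure=(3, 3))
    print("MaxEnt EVD: {}".format(maxent_EVD))
    print("DeepMaxEnt EVD: {}".format(deep_maxent_EVD))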
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) start location used when generating a trajectory from
        the recovered policy.
    """

    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ow.plot_grid()
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    print("Policy = ", policy.shape)
    # print("policy - {}".format(policy))

    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print("trajectories = ", trajectories.shape)
    # for t in trajectories:
    #     ow.plot_grid("trajectory_{}.png".format(t), t)
    # for t in trajectories:
    #     for s, a, r in t:
    #         print(ow.int_to_point(s), ow.actions[a], r)
    #     print("---------")

    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, r, ow.discount,
                                   stochastic=False)
    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))

    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """

    # Simulated noise: the probability that the expert errs and takes a
    # suboptimal action.
    wind = 0.1
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Find the optimal policy with reinforcement learning and use it as the
    # expert policy to generate demonstration trajectories.
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            policy, random_start=True)

    # Plot the expert trajectories before preprocessing.
    paths = []
    for i in trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories before preprocessing')

    # Preprocess the expert trajectories.
    new_trajectories = pre_treated(gw.n_states, gw.n_actions, trajectories)

    # Plot the expert trajectories after preprocessing.
    paths = []
    for i in new_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories after preprocessing')

    feature_matrix = gw.feature_matrix()
    # Convert to the (state, action, reward) format expected by maxent.irl.
    trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                    for trajectory in trajectories]
    r1, R1 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(trajectories),
                        epochs, learning_rate)
    r1 = r1 / max(r1)
    loss1 = []
    for r in R1:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss1.append(loss)

    # Convert to the (state, action, reward) format expected by maxent.irl.
    new_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                        for trajectory in new_trajectories]
    feature_matrix = gw.feature_matrix()
    r2, R2 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(new_trajectories),
                        epochs, learning_rate)
    r2 = r2 / max(r2)
    loss2 = []
    for r in R2:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss2.append(loss)

    # Supervised learning: estimate the expert policy from the preprocessed
    # trajectories.
    policy_sl = supervised_learning(new_trajectories, policy)
    equal = 0
    for i in range(len(policy)):
        if policy_sl[i] == policy[i]:
            equal += 1 / len(policy)
    print("Accuracy of the policy learned by supervised learning: {}%".format(
        100 * equal))

    # Generate trajectories from the supervised-learning policy.
    sl_trajectories = gw.generate_trajectories(n_trajectories,
                                               trajectory_length, policy_sl,
                                               random_start=True)
    # Preprocess the supervised-learning trajectories.
    new_sl_trajectories = pre_treated(gw.n_states, gw.n_actions,
                                      sl_trajectories)

    # Plot the trajectories produced by the supervised-learning policy.
    paths = []
    for i in new_sl_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths,
              'Expert trajectories estimated by the supervised-learning policy')

    new_sl_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                           for trajectory in new_sl_trajectories]

    # Mix the preprocessed expert trajectories with those supervised-learning
    # trajectories that end in the same state as some expert trajectory.
    # Copy so the appends below do not also grow new_trajectories itself.
    mix_trajectories = list(new_trajectories)
    for trajectory in new_sl_trajectories:
        for i in new_trajectories:
            if trajectory[-1] == i[-1]:
                mix_trajectories.append(trajectory)
                break

    feature_matrix = gw.feature_matrix()
    r3, R3 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(mix_trajectories),
                        epochs, learning_rate)
    r3 = r3 / max(r3)
    loss3 = []
    for r in R3:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss3.append(loss)

    # # 2D reward maps.
    # plt.subplot(1, 3, 1)
    # plt.pcolor(r1.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered without preprocessing")
    # plt.subplot(1, 3, 2)
    # plt.pcolor(r2.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered with preprocessing")
    # plt.subplot(1, 3, 3)
    # plt.pcolor(r3.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("R recovered with preprocessing and supervised learning")
    # plt.show()

    # 3D bar charts. X and Y must have the same number of entries.
    X = range(gw.grid_size)
    Y = range(gw.grid_size)
    Z1 = r1
    Z2 = r2
    Z3 = r3
    # meshgrid turns X and Y into grids of squared length: if both originally
    # have length 4, then after meshgrid and ravel both have length 16, because
    # there are 16 grid points.
    xx, yy = np.meshgrid(X, Y)  # Grid the coordinates.
    X, Y = xx.ravel(), yy.ravel()  # Flatten the matrices.

    # Bar properties.
    height = np.zeros_like(Z1)  # All zeros, same shape as Z: the base of each bar.
    width = depth = 1  # Bar width and depth.
    # Colour array with the same length as Z.
    c = ['y'] * len(Z1)

    # Draw the bars. bar3d takes (x, y, z, dx, dy, dz) where z is the bar
    # bottom and dz its height; passing the reward as z would leave only thin
    # slabs at the bar tops, so the reward goes in the dz slot and the zero
    # array in the z slot.
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # 3D axes.
    ax.bar3d(X, Y, height, width, depth, Z1, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered without preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # 3D axes.
    ax.bar3d(X, Y, height, width, depth, Z2, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered with preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # 3D axes.
    ax.bar3d(X, Y, height, width, depth, Z3, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("R recovered with preprocessing and supervised learning")
    plt.show()

    # Plot the error curves.
    plt.plot(range(epochs), loss1, color='r', label='without preprocessing')
    plt.plot(range(epochs), loss2, color='g', label='with preprocessing')
    plt.plot(range(epochs), loss3, color='b',
             label='preprocessing + supervised learning')
    plt.legend(loc=1)  # Legend location; 1 is the upper right.
    plt.xlabel('epochs')
    plt.ylabel('Error')
    plt.title('grid_size=10,discount=0.9')
    plt.plot()
    plt.show()
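# Illustrative driver (assumed): run the preprocessing/supervised-learning
# experiment above. grid_size=10 and discount=0.9 match the final plot title;
# the remaining values are placeholders.
if __name__ == '__main__':
    main(grid_size=10, discount=0.9, n_trajectories=20, epochs=200,
         learning_rate=0.01)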
def test_gw_once(grid_size, feature_map, n_samples, epochs, structure):
    """
    Test MaxEnt and DeepMaxEnt on a gw of size grid_size with the feature map
    feature_map with n_samples paths.

    grid_size: Grid size. int.
    feature_map: Which feature map to use. String in {ident, coord, proxi}.
    n_samples: Number of paths to sample.
    epochs: Number of epochs to run MaxEnt with.
    structure: Neural network structure tuple, e.g. (3, 3) would be a 3-layer
        neural network with assumed inputs.
    -> Expected value difference for MaxEnt, DeepMaxEnt
    """

    # Basic gist of what we're doing here: Get the reward function using our
    # different IRL methods, use those to get a policy, evaluate that policy
    # using the true reward, and then return the difference in expected values.

    # Setup parameters.
    wind = 0.3
    discount = 0.9
    learning_rate = 0.01
    trajectory_length = 3 * grid_size

    # Make the gridworld and associated data.
    gw = Gridworld(grid_size, wind, discount)
    feature_matrix = gw.feature_matrix(feature_map)
    ground_reward = np.array([gw.reward(i) for i in range(gw.n_states)])
    optimal_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                 gw.transition_probability,
                                                 ground_reward,
                                                 discount).argmax(axis=1)
    trajectories = gw.generate_trajectories(n_samples, trajectory_length,
                                            optimal_policy.take)
    p_start_state = (np.bincount(trajectories[:, 0, 0], minlength=gw.n_states) /
                     trajectories.shape[0])

    # True value.
    optimal_V = value_iteration.optimal_value(gw.n_states, gw.n_actions,
                                              gw.transition_probability,
                                              ground_reward, gw.discount)

    # MaxEnt reward; policy; value.
    maxent_reward = deep_maxent.irl((feature_matrix.shape[1],), feature_matrix,
                                    gw.n_actions, gw.discount,
                                    gw.transition_probability, trajectories,
                                    epochs, learning_rate)
    maxent_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                gw.transition_probability,
                                                maxent_reward,
                                                discount).argmax(axis=1)
    maxent_V = value_iteration.value(maxent_policy, gw.n_states,
                                     gw.transition_probability, ground_reward,
                                     gw.discount)
    maxent_EVD = optimal_V.dot(p_start_state) - maxent_V.dot(p_start_state)

    # DeepMaxEnt reward; policy; value.
    deep_maxent_reward = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                                         feature_matrix, gw.n_actions,
                                         gw.discount,
                                         gw.transition_probability,
                                         trajectories, epochs, learning_rate)
    deep_maxent_policy = value_iteration.find_policy(gw.n_states, gw.n_actions,
                                                     gw.transition_probability,
                                                     deep_maxent_reward,
                                                     discount).argmax(axis=1)
    deep_maxent_V = value_iteration.value(deep_maxent_policy, gw.n_states,
                                          gw.transition_probability,
                                          ground_reward, gw.discount)
    deep_maxent_EVD = (optimal_V.dot(p_start_state) -
                       deep_maxent_V.dot(p_start_state))

    plt.subplot(3, 3, 1)
    plt.pcolor(ground_reward.reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 2)
    plt.pcolor(maxent_reward.reshape((grid_size, grid_size)))
    plt.title("MaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 3)
    plt.pcolor(deep_maxent_reward.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt reward")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 4)
    plt.pcolor(optimal_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("Optimal policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 5)
    plt.pcolor(maxent_policy.reshape((grid_size, grid_size)), vmin=0, vmax=3)
    plt.title("MaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 6)
    plt.pcolor(deep_maxent_policy.reshape((grid_size, grid_size)), vmin=0,
               vmax=3)
    plt.title("DeepMaxEnt policy")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 7)
    plt.pcolor(optimal_V.reshape((grid_size, grid_size)))
    plt.title("Optimal value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 8)
    plt.pcolor(maxent_V.reshape((grid_size, grid_size)))
    plt.title("MaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)
    plt.subplot(3, 3, 9)
    plt.pcolor(deep_maxent_V.reshape((grid_size, grid_size)))
    plt.title("DeepMaxEnt value")
    plt.tick_params(labeltop=False, labelbottom=False, labelleft=False,
                    bottom=False, top=False, left=False, right=False,
                    labelright=False)

    plt.savefig("{}_{}_{}_{}_{}_gridworld_{}.png".format(
        grid_size, feature_map, n_samples, epochs, structure,
        np.random.randint(10000000)))

    return maxent_EVD, deep_maxent_EVD
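# Illustrative driver (assumed): average the gridworld comparison over a few
# random runs with the identity feature map, since single runs are noisy.
# All parameter values are placeholders.
if __name__ == '__main__':
    results = [test_gw_once(5, "ident", n_samples=32, epochs=100,
                            structure=(3, 3)) for _ in range(5)]
    maxent_EVDs, deep_maxent_EVDs = zip(*results)
    print("Mean MaxEnt EVD: {}".format(np.mean(maxent_EVDs)))
    print("Mean DeepMaxEnt EVD: {}".format(np.mean(deep_maxent_EVDs)))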
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent", mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld or gridworld MDP.
    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location to generate a trajectory from.
    wind: Probability of a random action. float.
    algo: IRL algorithm to run (currently maxent and deep_maxnet are supported).
    mdp: MDP to use (gridworld or objectworld).
    """

    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo,
    #              n_trajectories, epochs, wind), value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    # find_svf is needed for both algorithms, so import maxent unconditionally.
    import irl.maxent as maxent
    ground_svf = maxent.find_svf(ow.n_states, trajectories)

    r = []
    if algo == "maxent":
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1], ) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(algo,
    #              n_trajectories, epochs, wind), value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind),
        policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(
        algo, n_trajectories, epochs, wind), format="png", dpi=150)