def training(pos):
    id_trajectory = load.load_trajectory(1000)
    graph_trajectories = tools.choose_trajectory(1000, id_trajectory)
    _graph = load.load_graph_traj(graph_trajectories)
    sample_trajectories = tools.choose_trajectory(100, id_trajectory)

    gw = gridworld.Gridworld(_graph, 0.9)
    feature_matrix = gw.feature_matrix(_graph)
    alpha = maxent.irl(_graph, feature_matrix, sample_trajectories, 1, 0.05)

    path = "D:/Ubicomp/alpha" + str(pos) + ".txt"
    print(path)
    numpy.savetxt(path, alpha)

    # Reset and release the graph before returning.
    _graph = graph.Graph([], {}, False, False)
    del _graph

    return alpha
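# Usage sketch (not part of the original script): the loop bound below is an
# assumption for illustration; alpha vectors saved by training() can later be
# reloaded with numpy.loadtxt.
if __name__ == '__main__':
    for pos in range(5):  # hypothetical number of positions
        training(pos)
    alpha0 = numpy.loadtxt("D:/Ubicomp/alpha0.txt")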
def train(discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    # wind = 0.3
    trajectory_length = 268

    # gw = gridworld.Gridworld(grid_size, wind, discount)
    env = Env()
    trajectories = env.generate_trajectories(n_trajectories, trajectory_length,
                                             env.optimal_policy_deterministic)
    feature_matrix = env.feature_matrix()
    r = maxent.irl(feature_matrix, env.n_actions, discount,
                   env.transition_probability, trajectories, epochs,
                   learning_rate, "alpha_%d.pkl", "alpha_205.pkl", 205)
    pkl.dump(r, open("maxent_reward.pkl", 'wb'))
    return r
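# Usage sketch (illustrative values, not from the original script): discount,
# number of demonstrations, epochs and learning rate are assumptions. train()
# also writes the recovered reward to maxent_reward.pkl as a side effect.
if __name__ == '__main__':
    recovered = train(0.9, 200, 100, 0.01)
    print(recovered)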
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    r = maxent.irl(feature_matrix, gw.n_actions, discount,
                   gw.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
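# Usage sketch: a small gridworld run. The argument values (5x5 grid, low
# discount, 20 demonstrations, 200 epochs, learning rate 0.01) are assumptions
# chosen for illustration only.
if __name__ == '__main__':
    main(5, 0.01, 20, 200, 0.01)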
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    print("ow.n_states", ow.n_states)
    print("ow.n_actions", ow.n_actions)
    print("ow.transition_probability", ow.transition_probability,
          len(ow.transition_probability), len(ow.transition_probability[0]),
          len(ow.transition_probability[0][0]))
    print("ground_r", ground_r, len(ground_r))
    print("ow.discount", ow.discount)

    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print(trajectories)
    feature_matrix = ow.feature_matrix(discrete=False)
    print("feature_matrix", feature_matrix, len(feature_matrix),
          len(feature_matrix[0]))
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    # trajectories = gw.generate_trajectories(n_trajectories, trajectory_length, gw.optimal_policy)
    trajectories = gw.my_generate_trajectories(n_trajectories,
                                               trajectory_length,
                                               gw.optimal_policy)
    feature_matrix = gw.feature_matrix()
    # feature_matrix = gw.feature_matrix_goalVsOther()
    # feature_matrix = gw.feature_matrix_goalVsOtherTwo()
    # feature_matrix = gw.feature_matrix_goalVsOtherThree()

    # Ground truth given by us, as we know which states are good vs. bad.
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])
    # Reward recovered by the IRL algorithm.
    recovered_reward = maxent.irl(feature_matrix, gw.n_actions, discount,
                                  gw.transition_probability, trajectories,
                                  epochs, learning_rate)

    # Standardise the recovered reward.
    scaler = StandardScaler()
    standardised_reward = scaler.fit_transform(recovered_reward.reshape(-1, 1))
    # print(recovered_reward)
    # print(standardised_reward)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(standardised_reward.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
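# A minimal illustration of the standardisation step above (the demo array is
# an assumption): the scale of the recovered reward is not directly comparable
# to the ground truth, so z-scoring it makes the heatmaps easier to compare.
import numpy as np
from sklearn.preprocessing import StandardScaler

demo_reward = np.array([0.2, 0.5, 1.3, -0.4]).reshape(-1, 1)
z_scored = StandardScaler().fit_transform(demo_reward)
print(z_scored.mean(), z_scored.std())  # approximately 0 and 1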
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.show()
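# Usage sketch (illustrative values, not from the original script): a small
# objectworld run; every argument value below is an assumption.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 50, 0.01)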
def main(discount, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the Lindaworld MDP.

    Learns the reward function, saves the recovered policy and visualises the
    reward on the topological map in rviz.

    discount: MDP discount factor. float.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    # wind = 0.3
    # trajectory_length = 8

    lw = lindaworld.Lindaworld()
    # ground_r = np.array([lw.reward(s) for s in range(lw.n_states)])
    # policy = find_policy(lw.n_states, lw.n_actions, lw.transition_probability,
    #                      ground_r, lw.discount, stochastic=False)
    # trajectories = lw.generate_trajectories(n_trajectories,
    #                                         trajectory_length,
    #                                         lambda s: policy[s])
    feature_matrix = lw.feature_matrix(discrete=False)
    reward = maxent.irl(feature_matrix, lw.n_actions, discount,
                        lw.transition_probability, lw.trajectories, epochs,
                        learning_rate)
    policy = maxent.find_policy(lw.n_states, reward, lw.n_actions, discount,
                                lw.transition_probability)

    # Save the policy.
    print("Saving policy file...")
    sys.stdout.flush()
    policy_file = open("linda_policy.pnpo", "w")
    for i, a_prob in enumerate(policy):
        state = lw.state_list[i]
        actions = np.array(lw.action_list)[np.where(a_prob == np.amax(a_prob))]
        # print(set(actions.tolist()))
        action = random.choice(list(set(actions.tolist()) - set(["NO_ACTION"])))
        policy_file.write(state + "\t" + action + "\n")
    policy_file.close()
    print("DONE")

    # for i, ap in enumerate(policy):
    #     print(lw.state_list[i], np.array(lw.action_list)[np.where(ap == np.amax(ap))])
    # for i, state in enumerate(lw.state_list):
    #     print(reward[i], state)

    rospy.init_node("reward_visualizer")

    # Visualise the reward with rviz markers.
    # Initialise the marker publisher.
    rew_markers_publisher = rospy.Publisher("reward_visualizer", MarkerArray,
                                            latch=True, queue_size=10)
    # Take the current topological map.
    top_map = rospy.wait_for_message("/topological_map", TopologicalMap,
                                     timeout=10)
    max_reward = max(reward)
    min_reward = min(reward)
    map_v = "/map"
    marker_array = MarkerArray()
    for index, node in enumerate(top_map.nodes):
        # Get the corresponding state indices for this node.
        currentstate_index = None
        closeststate_index = None
        current_marker_id = None
        closest_marker_id = None
        for i, state in enumerate(lw.state_list):
            if "CurrentNode_" + node.name in state.split(" "):
                currentstate_index = i
                current_marker_id = int(node.name.replace("WayPoint", ""))
                print("current>>>> ", state)
            if "ClosestNode_" + node.name in state.split(" "):
                closeststate_index = i
                closest_marker_id = int(node.name.replace("WayPoint", "")) * 100
                print("closest>>>> ", state)

        # Current state marker.
        if current_marker_id is not None:
            current_box_marker = Marker()
            if currentstate_index is not None:
                # Get heatmap colour.
                r, g, b = rgb(min_reward, max_reward,
                              reward[currentstate_index])
                current_box_marker.text = str(reward[currentstate_index])
            else:
                current_box_marker.text = "0"
                r = g = b = 0.0
            current_box_marker.header.frame_id = map_v
            current_box_marker.type = Marker.CYLINDER
            current_box_marker.action = Marker.ADD
            current_box_marker.id = current_marker_id
            current_box_marker.scale.x = 0.5
            current_box_marker.scale.y = 0.5
            current_box_marker.scale.z = 0.1
            current_box_marker.pose = node.pose
            current_box_marker.color.r = r
            current_box_marker.color.g = g
            current_box_marker.color.b = b
            current_box_marker.color.a = 1.0
            marker_array.markers.append(current_box_marker)

        # Closest state marker.
        if closest_marker_id is not None:
            closest_box_marker = Marker()
            if closeststate_index is not None:
                # Get heatmap colour.
                r, g, b = rgb(min_reward, max_reward,
                              reward[closeststate_index])
                closest_box_marker.text = str(reward[closeststate_index])
                # print(reward[closeststate_index])
            else:
                closest_box_marker.text = "0"
                r = g = b = 0.0
            closest_box_marker.header.frame_id = map_v
            closest_box_marker.type = Marker.CYLINDER
            closest_box_marker.action = Marker.ADD
            closest_box_marker.id = closest_marker_id
            closest_box_marker.scale.x = 2
            closest_box_marker.scale.y = 2
            closest_box_marker.scale.z = 0.01
            closest_box_marker.pose = node.pose
            closest_box_marker.color.r = r
            closest_box_marker.color.g = g
            closest_box_marker.color.b = b
            closest_box_marker.color.a = 0.7
            marker_array.markers.append(closest_box_marker)

    # print(marker_array)
    rew_markers_publisher.publish(marker_array)
    rospy.spin()
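# The rgb() helper called above is not defined in this file. A minimal sketch,
# assuming a simple blue-to-red linear heatmap over [minimum, maximum] with
# colour channels in [0, 1] (the actual helper may differ):
def rgb(minimum, maximum, value):
    """Map value in [minimum, maximum] to an (r, g, b) tuple of floats in [0, 1]."""
    span = float(maximum - minimum) or 1.0   # avoid division by zero
    ratio = (value - minimum) / span         # 0.0 -> blue, 1.0 -> red
    r = ratio
    b = 1.0 - ratio
    g = 1.0 - abs(2.0 * ratio - 1.0)         # green peaks mid-range
    return r, g, b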
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state):
    """
    Run maximum entropy inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: (x, y) start state for the newly generated trajectory. tuple.
    """
    sx, sy = start_state
    wind = 0.3
    trajectory_length = 8

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    ow.plot_grid()
    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    print("Policy = ", policy.shape)
    # print("policy - {}".format(policy))
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s])
    print("trajectories = ", trajectories.shape)
    # for t in trajectories:
    #     ow.plot_grid("trajectory_{}.png".format(t), t)
    # for t in trajectories:
    #     for s, a, r in t:
    #         print(ow.int_to_point(s), ow.actions[a], r)
    #     print("---------")
    feature_matrix = ow.feature_matrix(discrete=False)
    r = maxent.irl(feature_matrix, ow.n_actions, discount,
                   ow.transition_probability, trajectories, epochs,
                   learning_rate)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, r, ow.discount,
                                   stochastic=False)
    new_trajectory = ow.generate_trajectories(1, trajectory_length,
                                              lambda s: recovered_policy[s],
                                              False, (sx, sy))
    print("new trajectory")
    for t in new_trajectory:
        ow.plot_grid("new_trajectory.png", t)
        for s, a, rw in t:
            print(ow.int_to_point(s), ow.actions[a], rw)
        print("---------")

    plt.subplot(1, 2, 1)
    plt.pcolor(ground_r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth reward")
    plt.subplot(1, 2, 2)
    plt.pcolor(r.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Recovered reward")
    plt.savefig("reward.png", format="png", dpi=150)
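# Usage sketch (illustrative values, not from the original script): grid size,
# object/colour counts, trajectory count, epochs, learning rate and the (0, 0)
# start state are all assumptions.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 50, 0.01, (0, 0))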
def main(grid_size, discount, n_trajectories, epochs, learning_rate):
    """
    Run maximum entropy inverse reinforcement learning on the gridworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    """
    wind = 0.1  # Noise: the probability that the expert takes a non-optimal action.
    trajectory_length = 3 * grid_size

    gw = gridworld.Gridworld(grid_size, wind, discount)
    ground_r = np.array([gw.reward(s) for s in range(gw.n_states)])

    # Solve the MDP for the optimal policy; it stands in for the expert policy
    # that generates the demonstration trajectories.
    policy = find_policy(gw.n_states, gw.n_actions, gw.transition_probability,
                         ground_r, discount)
    trajectories = gw.generate_trajectories(n_trajectories, trajectory_length,
                                            policy, random_start=True)

    # Plot the trajectories before preprocessing.
    paths = []
    for i in trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories before preprocessing')

    # Preprocess the expert trajectories.
    new_trajectories = pre_treated(gw.n_states, gw.n_actions, trajectories)

    # Plot the trajectories after preprocessing.
    paths = []
    for i in new_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths, 'Expert trajectories after preprocessing')

    feature_matrix = gw.feature_matrix()
    # Convert to the (state, action, reward) format expected by maxent.irl.
    trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                    for trajectory in trajectories]
    r1, R1 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(trajectories),
                        epochs, learning_rate)
    r1 = r1 / max(r1)
    loss1 = []
    for r in R1:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss1.append(loss)

    # Convert to the (state, action, reward) format expected by maxent.irl.
    new_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                        for trajectory in new_trajectories]
    feature_matrix = gw.feature_matrix()
    r2, R2 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(new_trajectories),
                        epochs, learning_rate)
    r2 = r2 / max(r2)
    loss2 = []
    for r in R2:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss2.append(loss)

    # Supervised learning of a policy from the preprocessed trajectories.
    policy_sl = supervised_learning(new_trajectories, policy)
    equal = 0
    for i in range(len(policy)):
        if policy_sl[i] == policy[i]:
            equal += 1 / len(policy)
    print("Accuracy of the policy learned by supervised learning: {}%".format(100 * equal))

    # Generate trajectories from the supervised-learning policy.
    sl_trajectories = gw.generate_trajectories(n_trajectories,
                                               trajectory_length, policy_sl,
                                               random_start=True)
    # Preprocess the supervised-learning trajectories.
    new_sl_trajectories = pre_treated(gw.n_states, gw.n_actions,
                                      sl_trajectories)

    # Plot the trajectories estimated by the supervised-learning policy.
    paths = []
    for i in new_sl_trajectories:
        path = [j[0] for j in i]
        paths.append(path)
    draw_path(gw.grid_size, paths,
              'Expert trajectories estimated by the supervised-learning policy')

    new_sl_trajectories = [[(s, a, r) for (s, a, r, _) in trajectory]
                           for trajectory in new_sl_trajectories]
    mix_trajectories = new_trajectories
    for trajectory in new_sl_trajectories:
        for i in new_trajectories:
            if trajectory[-1] == i[-1]:
                mix_trajectories.append(trajectory)
                break
    feature_matrix = gw.feature_matrix()
    r3, R3 = maxent.irl(feature_matrix, gw.n_actions, discount,
                        gw.transition_probability, np.array(mix_trajectories),
                        epochs, learning_rate)
    r3 = r3 / max(r3)
    loss3 = []
    for r in R3:
        r = r / max(r)
        loss = abs(r - ground_r).sum()
        loss3.append(loss)

    # # 2D plots
    # plt.subplot(1, 3, 1)
    # plt.pcolor(r1.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered without preprocessing")
    # plt.subplot(1, 3, 2)
    # plt.pcolor(r2.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered with preprocessing")
    # plt.subplot(1, 3, 3)
    # plt.pcolor(r3.reshape((grid_size, grid_size)))
    # plt.colorbar()
    # plt.title("Reward recovered with preprocessing and supervised learning")
    # plt.show()

    # 3D bar plots. X and Y must have the same number of elements.
    X = range(gw.grid_size)
    Y = range(gw.grid_size)
    Z1 = r1
    Z2 = r2
    Z3 = r3
    # meshgrid/ravel expand the axes into the full set of grid points
    # (e.g. two axes of length 4 become 16 flattened coordinates).
    xx, yy = np.meshgrid(X, Y)      # Grid coordinates.
    X, Y = xx.ravel(), yy.ravel()   # Flatten to 1-D.

    # Bar properties.
    height = np.zeros_like(Z1)  # All-zero array, same shape as Z: the bars' base.
    width = depth = 1           # Bar width and depth.
    # Colour array, same length as Z.
    c = ['y'] * len(Z1)

    # Note the argument order: the natural order would be X, Y, Z, width,
    # depth, height, but that draws only thin slabs at the bar tops, so Z and
    # height are swapped to get full bars.
    fig = plt.figure()
    ax = fig.gca(projection='3d')  # 3-D axes.
    ax.bar3d(X, Y, height, width, depth, Z1, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered without preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.gca(projection='3d')  # 3-D axes.
    ax.bar3d(X, Y, height, width, depth, Z2, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered with preprocessing")
    plt.show()

    fig = plt.figure()
    ax = fig.gca(projection='3d')  # 3-D axes.
    ax.bar3d(X, Y, height, width, depth, Z3, color=c, shade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('reward_value')
    plt.title("Reward recovered with preprocessing and supervised learning")
    plt.show()

    # Error curves.
    plt.plot(range(epochs), loss1, color='r', label='without preprocessing')
    plt.plot(range(epochs), loss2, color='g', label='with preprocessing')
    plt.plot(range(epochs), loss3, color='b',
             label='preprocessing + supervised learning')
    plt.legend(loc=1)  # Legend position; 1 = upper right.
    plt.xlabel('epochs')
    plt.ylabel('Error')
    plt.title('grid_size=10, discount=0.9')
    plt.plot()
    plt.show()
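# Usage sketch (values partly taken from the plot title above, the rest
# assumed): grid_size=10 and discount=0.9 match the error-curve title, while
# the trajectory count, epochs and learning rate are illustrative only.
if __name__ == '__main__':
    main(10, 0.9, 20, 200, 0.01)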
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, start_state, wind=0.0, algo="maxent", mdp="gridworld"):
    """
    Run inverse reinforcement learning on the objectworld MDP.

    Plots the reward function.

    grid_size: Grid size. int.
    discount: MDP discount factor. float.
    n_objects: Number of objects. int.
    n_colours: Number of colours. int.
    n_trajectories: Number of sampled trajectories. int.
    epochs: Gradient descent iterations. int.
    learning_rate: Gradient descent learning rate. float.
    start_state: Start location to generate trajectories from.
    algo: IRL algorithm to run (currently maxent and deep_maxent are supported).
    """
    sx, sy = start_state
    trajectory_length = 8

    if mdp == "objectworld":
        import irl.mdp.objectworld as objectworld
        ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                     discount)
    elif mdp == "gridworld":
        import irl.mdp.gridworld as gridworld
        ow = gridworld.Gridworld(grid_size, wind, discount)

    ground_r = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy = find_policy(ow.n_states, ow.n_actions, ow.transition_probability,
                         ground_r, ow.discount, stochastic=False)
    optimal_v = optimal_value(ow.n_states, ow.n_actions,
                              ow.transition_probability, normalize(ground_r),
                              ow.discount)
    trajectories = ow.generate_trajectories(n_trajectories, trajectory_length,
                                            lambda s: policy[s],
                                            random_start=True)
    feature_matrix = ow.feature_matrix()

    print("trajectories = ", trajectories.shape)
    print("epochs = ", epochs)
    print("feature_matrix.shape = ", feature_matrix.shape)
    print("policy.shape = ", policy.shape)

    # ow.plot_grid("value_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
    #                                                epochs, wind),
    #              value=optimal_v)
    ow.plot_grid("policy_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                    epochs, wind),
                 policy=policy, value=optimal_v)

    r = []
    ground_svf = []
    if algo == "maxent":
        import irl.maxent as maxent
        ground_svf = maxent.find_svf(ow.n_states, trajectories)
        r = maxent.irl(feature_matrix, ow.n_actions, discount,
                       ow.transition_probability, trajectories, epochs,
                       learning_rate)
    elif algo == "deep_maxnet":
        import irl.deep_maxent as deep_maxent
        l1 = l2 = 0
        structure = (3, 3)
        r = deep_maxent.irl((feature_matrix.shape[1],) + structure,
                            feature_matrix, ow.n_actions, discount,
                            ow.transition_probability, trajectories, epochs,
                            learning_rate, l1=l1, l2=l2)

    recovered_policy = find_policy(ow.n_states, ow.n_actions,
                                   ow.transition_probability, normalize(r),
                                   ow.discount, stochastic=False)
    recovered_v = value(recovered_policy, ow.n_states,
                        ow.transition_probability, normalize(r), ow.discount)
    new_trajectory = ow.generate_trajectories(n_trajectories,
                                              trajectory_length,
                                              lambda s: recovered_policy[s],
                                              True, (sx, sy))
    recovered_svf = maxent.find_svf(ow.n_states, new_trajectory)

    # ow.plot_grid("recovered_value_{}_t{}_e{}_w{}.png".format(algo,
    #                                                          n_trajectories,
    #                                                          epochs, wind),
    #              value=recovered_v)
    ow.plot_grid("recovered_policy_{}_t{}_e{}_w{}.png".format(algo,
                                                              n_trajectories,
                                                              epochs, wind),
                 policy=recovered_policy, value=recovered_v)

    # print("new trajectory")
    # for t in new_trajectory:
    #     for s, a, rw in t:
    #         print(ow.int_to_point(s), ow.actions[a], rw)
    #     print("---------")

    y, x = np.mgrid[-0.5:grid_size + 0.5, -0.5:grid_size + 0.5]

    plt.subplot(111)
    plt.pcolor(x, y, ground_svf.reshape((grid_size, grid_size)))
    plt.colorbar()
    plt.title("Groundtruth SVF")
    plt.savefig("ground_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                       epochs, wind),
                format="png", dpi=150)

    plt.pcolor(x, y, recovered_svf.reshape((grid_size, grid_size)))
    plt.title("Recovered SVF")
    plt.savefig("recovered_svf_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                          epochs, wind),
                format="png", dpi=150)

    plt.pcolor(x, y, normalize(ground_r).reshape((grid_size, grid_size)))
    plt.title("Groundtruth reward")
    plt.savefig("ground_reward_{}_t{}_e{}_w{}.png".format(algo, n_trajectories,
                                                          epochs, wind),
                format="png", dpi=150)

    plt.pcolor(x, y, normalize(r).reshape((grid_size, grid_size)))
    plt.title("Recovered reward")
    plt.savefig("recovered_reward_{}_t{}_e{}_w{}.png".format(algo,
                                                             n_trajectories,
                                                             epochs, wind),
                format="png", dpi=150)
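# Usage sketch (all argument values are assumptions chosen for illustration):
# run the configurable experiment on an objectworld with the MaxEnt solver.
if __name__ == '__main__':
    main(10, 0.9, 15, 2, 20, 200, 0.01, (0, 0),
         wind=0.1, algo="maxent", mdp="objectworld")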