def main():
    # init the gridworld
    rmap_gt = np.zeros(N_STATES)
    rmap_gt[N_STATES - 5] = R_MAX
    rmap_gt[10] = R_MAX
    gw = gridworld1d.GridWorld1D(rmap_gt, {}, ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rmap_gt, GAMMA, error=0.01, deterministic=True)

    # gradient rewards: reuse the optimal values as a smoothed reward map and re-solve
    rmap_gt = values_gt
    gw = gridworld1d.GridWorld1D(rmap_gt, {}, ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rmap_gt, GAMMA, error=0.01, deterministic=True)

    # np.random.seed(1)
    trajs = gw.generate_demonstrations(policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    # feat_map = np.eye(N_STATES)
    feat_map = np.array([feat(s) for s in range(N_STATES)])

    test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map)
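feat is referenced above but not defined in this section. A hypothetical sketch of a feature function for the 1-D chain, combining a one-hot state indicator with a normalized position; the real helper may differ:

# Hypothetical feature function for the 1-D gridworld; not part of the
# original source. Maps a state index to a small feature vector.
def feat(s):
    # one-hot indicator plus normalized position, as an illustrative choice
    onehot = np.zeros(N_STATES)
    onehot[s] = 1.0
    return np.append(onehot, s / float(N_STATES - 1))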
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # two alternative features; due to the linear nature of the model,
    # they might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.show()
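generate_demonstrations is called throughout this section but never included. A minimal sketch under stated assumptions: it samples rollouts of the given policy directly from the transition matrix, uses the Step namedtuple ('cur_state action next_state reward done') defined in the argparse variant later in this section, and leaves the reward/done fields as placeholders. The actual helper may instead use the GridWorld step API:

# Sketch of generate_demonstrations (assumed behavior, not the original).
def generate_demonstrations(gw, policy, n_trajs=100, len_traj=20, rand_start=False, start_pos=0):
    P_a = gw.get_transition_mat()
    n_states = P_a.shape[0]
    trajs = []
    for _ in range(n_trajs):
        s = np.random.randint(n_states) if rand_start else start_pos
        episode = []
        for _ in range(len_traj):
            a = int(policy[s])
            # sample the successor state from the transition distribution
            s_next = np.random.choice(n_states, p=P_a[s, :, a])
            episode.append(Step(cur_state=s, action=a, next_state=s_next,
                                reward=0.0, done=False))
            s = s_next
        trajs.append(episode)
    return trajs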
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    feat_map_torch = torch.tensor(feat_map, dtype=torch.float)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map_torch, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    # rewards = rewards.detach().numpy()
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'), 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.show()
def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
    """
    Maximum Entropy Inverse Reinforcement Learning (MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    n_states, _, n_actions = np.shape(P_a)

    # init parameters
    theta = np.random.uniform(size=(feat_map.shape[1],))

    # calc feature expectations
    feat_exp = np.zeros([feat_map.shape[1]])
    for episode in trajs:
        for step in episode:
            feat_exp += feat_map[step.cur_state, :]
    feat_exp = feat_exp / len(trajs)

    # training
    for iteration in range(n_iters):
        if iteration % max(n_iters // 20, 1) == 0:
            print('iteration: {}/{}'.format(iteration, n_iters))

        # compute reward function
        rewards = np.dot(feat_map, theta)

        # compute policy
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)

        # compute state visitation frequencies
        svf = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)

        # compute gradients
        grad = feat_exp - feat_map.T.dot(svf)

        # update params
        theta += lr * grad

    rewards = np.dot(feat_map, theta)
    # return sigmoid(normalize(rewards))
    return normalize(rewards)
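compute_state_visition_freq is called by every variant in this section but never shown (the spelling here follows the call sites). A minimal sketch of the standard forward pass over state-visitation mass from Ziebart et al.'s MaxEnt IRL, assuming each demonstration is a list of Step tuples of equal length:

# Sketch of the expected state-visitation-frequency computation.
def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
    N_STATES, _, N_ACTIONS = np.shape(P_a)
    T = len(trajs[0])
    mu = np.zeros([N_STATES, T])

    # initial state distribution, estimated from the demonstrations
    for traj in trajs:
        mu[traj[0].cur_state, 0] += 1
    mu[:, 0] = mu[:, 0] / len(trajs)

    # propagate visitation mass forward through the policy and the dynamics
    for t in range(T - 1):
        for s in range(N_STATES):
            if deterministic:
                mu[s, t + 1] = sum(mu[pre, t] * P_a[pre, s, int(policy[pre])]
                                   for pre in range(N_STATES))
            else:
                mu[s, t + 1] = sum(mu[pre, t] * P_a[pre, s, a] * policy[pre, a]
                                   for pre in range(N_STATES)
                                   for a in range(N_ACTIONS))
    # total expected visits per state over the horizon
    return np.sum(mu, axis=1)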
def assert_vi(P_a):
    # relies on rewards, gamma, deterministic, values, policy, P_a_t,
    # N_STATES and N_ACTIONS from the enclosing scope
    assert_values, assert_policy = value_iteration.value_iteration(
        P_a, rewards, gamma, error=0.000001, deterministic=deterministic)
    assert_values_old, assert_policy_old = value_iteration.value_iteration_old(
        P_a, rewards, gamma, error=0.000001, deterministic=deterministic)
    if len(P_a) == 3:
        assert_values2 = value_iteration.optimal_value(N_STATES, N_ACTIONS, P_a_t, rewards, gamma, threshold=0.000001)
        assert (np.abs(assert_values - assert_values2) < 0.0001).all()
    assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
    assert (np.abs(values - assert_values) < 0.0001).all()
    assert (np.abs(values - assert_values_old) < 0.0001).all()
    # print(assert_policy)
    # print(assert_policy_old)
    # print(policy)
    # print(values)
    # print(assert_values)
    # print(rewards)
    assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
    assert (np.abs(policy - assert_policy) < 0.0001).all()
    assert (np.abs(policy - assert_policy_old) < 0.0001).all()
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
    """
    Deep Maximum Entropy Inverse Reinforcement Learning (Deep MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    # tf.set_random_seed(1)
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model; a timestamped name keeps the TF variable scope unique
    import datetime
    nn_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
    nn_r = DeepIRLFC(feat_map.shape[1], lr, 3, 3, name=nn_name)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES)

    # training
    for iteration in range(n_iters):
        if iteration % max(n_iters // 10, 1) == 0:
            print('iteration: {}'.format(iteration))

        # compute the reward matrix
        rewards = nn_r.get_rewards(feat_map)

        # compute policy
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

        # compute gradients on rewards
        grad_r = mu_D - mu_exp

        # apply gradients to the neural network
        grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)

    rewards = nn_r.get_rewards(feat_map)
    nn_r.finished()
    # return normalize(rewards, -1, 1)
    return rewards
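demo_svf is likewise not shown in this section. A minimal sketch, assuming it is the empirical state-visitation count of the demonstrations normalized by the number of trajectories:

# Sketch of the demonstration state-visitation frequency (assumed behavior).
def demo_svf(trajs, n_states):
    p = np.zeros(n_states)
    for traj in trajs:
        for step in traj:
            p[step.cur_state] += 1
    # average visit count per trajectory, one entry per state
    p = p / len(trajs)
    return p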
def __init__(self, gamma=0.9, act_rand=0.3, r_max=1, h=10, w=10, n_trajs=100, l_traj=20,
             rand_start=True, learning_rate=0.02, n_iters=20, save_dir="./exps",
             exp_name="gw_" + str(int(time.time())), n_exp=20, feat_map=None,
             gpu_fraction=0.2, terminal=True):
    self._gamma, self._act_rand, self._r_max, self._h, self._w, self._n_trajs, self._l_traj, self._rand_start, \
        self._learning_rate, self._n_iters, self._save_dir, self._exp_name, self._n_exp = \
        gamma, act_rand, r_max, h, w, n_trajs, l_traj, rand_start, learning_rate, n_iters, save_dir, exp_name, n_exp
    self._exp_result_path = save_dir + "/" + exp_name
    if not os.path.exists(self._exp_result_path):
        os.makedirs(self._exp_result_path)
    else:
        logging.warning(self._exp_result_path + " already exists")
        exit()

    rmap_gt = np.zeros([h, w])
    rmap_gt[h - 1, w - 1] = rmap_gt[0, w - 1] = rmap_gt[h - 1, 0] = r_max
    if terminal:
        self._gw = gridworld.GridWorld(rmap_gt, {(h - 1, w - 1), (0, w - 1), (h - 1, 0)}, 1 - act_rand)
    else:
        self._gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)

    self._rewards_gt = np.reshape(rmap_gt, h * w, order='F')
    self._P_a = self._gw.get_transition_mat()

    ts = time.time()
    self._values_gt, self._policy_gt = value_iteration.value_iteration(
        self._P_a, self._rewards_gt, gamma, error=0.01, deterministic=True)
    te = time.time()
    print("value iteration time of ground truth: ", te - ts)

    ts = time.time()
    self.save_plt("gt", (3 * w, h), self._rewards_gt, self._values_gt, self._policy_gt)
    te = time.time()
    print("saving plt time: ", te - ts)

    self._demo_trajs = self.generate_demonstrations()
    self._feat_map = np.eye(h * w) if feat_map is None else feat_map
    self._gpu_fraction = gpu_fraction
def main():
    for seed in range(1):
        N_STATES = H * W

        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        # goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX

        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # two alternative features; due to the linear nature of the model,
        # they might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)

        np.random.seed(0)
        # trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
        trajs = mod.exp1_case2()

        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
        # np.savetxt('results/rewards.txt', rewards)
        # values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map', block=False)
        plt.plot()
        # now = datetime.datetime.now()
        # figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
def test_once(self, exp_id):
    tf.reset_default_graph()

    print('Deep Max Ent IRL training ..')
    ts = time.time()
    rewards = deep_maxent_irl(self._feat_map, self._P_a, GAMMA, self._demo_trajs,
                              self._learning_rate, self._n_iters, self._gpu_fraction)
    te = time.time()
    print('IRL time: ', te - ts)

    ts = time.time()
    values, policy = value_iteration.value_iteration(self._P_a, rewards, self._gamma,
                                                     error=0.01, deterministic=True)
    te = time.time()
    print('value iteration time of recovered: ', te - ts)

    # plots
    ts = time.time()
    self.save_plt(exp_id, (3 * self._w, self._h), rewards, values, policy)
    te = time.time()
    print('saving plt time: ', te - ts)
def test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map):
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)

    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2, N_ITERS * 2)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 8))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(to_plot(rmap_gt), 'Rewards Map - Ground Truth', block=False, text=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(to_plot(rewards_lpirl), 'Reward Map - LP', block=False, text=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(to_plot(rewards_maxent), 'Reward Map - Maxent', block=False, text=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(to_plot(rewards), 'Reward Map - Deep Maxent', block=False, text=False)
    plt.show()
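to_plot is not defined here either. Since this variant runs on the 1-D gridworld, a plausible one-liner lifts the flat state vector to a single-row 2-D array so heatmap2d can render it; this is an assumption, not the original helper:

# Hypothetical plotting helper for the 1-D gridworld results.
def to_plot(v):
    # reshape the flat state vector into a single row for heatmap2d
    return np.reshape(v, (1, -1))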
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
    """
    Deep Maximum Entropy Inverse Reinforcement Learning (Deep MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    net = IRLNet(lr)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES)
    mu_D = torch.tensor(mu_D, dtype=torch.float)

    # init optimizer
    optimizer = optim.SGD(net.parameters(), lr=net.lr, weight_decay=1e-5)

    # training
    for iteration in range(n_iters):
        if iteration % max(n_iters // 10, 1) == 0:
            print('Training Step: {}'.format(iteration))

        # compute the reward matrix
        rewards = net(feat_map)
        rewards_np = rewards.detach().numpy()

        # compute policy
        _, policy = value_iteration.value_iteration(P_a, rewards_np, gamma, error=0.01, deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)
        mu_exp = torch.tensor(mu_exp, dtype=torch.float)

        # compute gradients on rewards; negated because the optimizer descends
        # while the MaxEnt objective is maximized
        grad = -(mu_D - mu_exp)

        # clear gradient buffer
        optimizer.zero_grad()

        # backpropagate the reward gradient into the network
        rewards.backward(grad)

        # gradient clipping to prevent exploding gradients -> NaN in multiplication
        nn.utils.clip_grad_norm_(net.parameters(), 100.0)
        optimizer.step()

    # final forward pass and return
    rewards = net(feat_map).detach().numpy()
    return normalize(rewards)
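IRLNet itself is not included in this section. Below is a minimal sketch of a plausible fully connected reward network; the layer sizes are illustrative, and since the call site above passes only lr, the original presumably fixes the input width elsewhere (here it is an explicit constructor argument):

# Hypothetical reward network: one feature vector in, one scalar reward out.
class IRLNet(nn.Module):
    def __init__(self, lr, n_input=100, n_hidden=64):
        super(IRLNet, self).__init__()
        self.lr = lr
        self.fc1 = nn.Linear(n_input, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.out = nn.Linear(n_hidden, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # squeeze to a flat vector with one reward per state
        return self.out(x).squeeze(-1)

With the identity feature map, the input width equals N_STATES, e.g. net = IRLNet(lr, n_input=feat_map.shape[1]).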
def deep_maxent_irl(self, feat_map, P_a, gamma, trajs, lr, n_iters, rewards_gt, policy_gt,
                    mapSize, H, W, r_weight, p_weight, proposed):
    """
    Deep Maximum Entropy Inverse Reinforcement Learning (Deep MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    self.finetune(allow=True)
    self.lr = lr
    self.rewards_gt = rewards_gt
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model input: the feature map as a 1x1 image for the conv net
    feat_map = feat_map.view(1, 1, mapSize, mapSize)
    feat_map = Variable(feat_map)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES, r_weight, p_weight, proposed)

    self.networks.train()

    # training
    for iteration in range(n_iters):
        self.networks.zero_grad()

        # compute the reward matrix
        rewards = self.networks(feat_map)

        # compute policy
        rewards = rewards.view(mapSize, 1)
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma, H * W, error=0.01,
                                                    deterministic=True, npy=False)

        # if iteration % (n_iters / 10) == 0:
        #     temp_rewards = normalize(rewards.data.numpy())
        #     temp_rewards_gt = rewards_gt.data.numpy()
        #     temp_evd = compute_expected_value_difference(policy, temp_rewards_gt, policy_gt, gamma, H, W)
        #     reward_diff = np.abs(np.reshape(temp_rewards_gt, (H * W, 1)) - temp_rewards)
        #     print('iteration: {},'.format(iteration), 'Sum of diff : {},'.format(np.sum(reward_diff)),
        #           '# of exceed 0.3 : {}'.format(len(np.extract(reward_diff > 0.3, reward_diff))))

        # dyna algorithm
        # if iteration % (n_iters / 10) == 0 and iteration / (n_iters / 10) > 30:
        #     # get new trajs
        #     trajs_new = generate_demonstrations(gw, policy, n_trajs=20, len_traj=20, rand_start=True)
        #     trajs += trajs_new
        #     # update gt_svf
        #     mu_D = demo_svf(trajs, N_STATES)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

        # compute gradients on rewards
        rewards = rewards.view(mapSize)

        # set gradient
        torch.autograd.backward([rewards], [-(mu_D - mu_exp) * self.lr])

        # update / optimizer: SGD
        self.optimizer.step()

    # get output
    self.networks.eval()
    rewards = self.networks(feat_map)
    rewards = rewards.view(mapSize, 1)
    _, policy = value_iteration.value_iteration(P_a, rewards, gamma, H * W, error=0.01,
                                                deterministic=True, npy=False)
    mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

    # return sigmoid(normalize(rewards))
    rewards = rewards.view(mapSize, 1)
    rewards = rewards.data.numpy()  # for normalize
    return normalize(rewards), mu_D, mu_exp
def deep_siamese_maxent_irl(feat_map, feat_map_inv, P_a, gamma, trajs, trajs_inv, lr, n_iters):
    """
    Deep Siamese Maximum Entropy Inverse Reinforcement Learning (MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    # tf.set_random_seed(1)
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    nn_r = DeepIRLP(feat_map.shape[1], lr, 3, 3)

    # find state visitation frequencies using demonstrations
    mu_D = find_svf(N_STATES, trajs)
    values = np.zeros(feat_map.shape[0])

    # training
    for iteration in range(n_iters):
        if iteration % max(n_iters // 10, 1) == 0:
            print('iteration: {}'.format(iteration))

        # compute the reward matrix
        rewards = nn_r.get_rewards(feat_map, feat_map_inv)

        # compute policy
        values, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

        # compute gradients on rewards
        grad_r = mu_D - mu_exp

        # apply gradients to the neural network
        grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, feat_map_inv, grad_r)

    rewards = nn_r.get_rewards(feat_map, feat_map_inv)
    # print(rewards)
    # return sigmoid(normalize(rewards))
    return normalize(rewards)
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
                       pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use the normalized optimal values as a smoothed reward map and re-solve
    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {}, 1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)
    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))
    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))
    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))
    feat_map = np.hstack((coast_map, forest_map, land_map))

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)
            trajs[x].append(Step(cur_state=int(gw.pos2idx(loc)), action=action,
                                 next_state=int(gw.pos2idx(next_loc)), reward=reward,
                                 done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)

    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    # print('Deep Max Ent IRL training ..')
    # rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'), 'Rewards Map - Ground Truth', block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'), 'Reward Map - LP', block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'), 'Reward Map - Maxent', block=False)
    fig.savefig('MaxEnt.png')
def main(): N_STATES = H * W N_ACTIONS = 5 """while True: print "BAD_STATE入力" bad = raw_input('>> ') if bad == 'ok': break Bad_states.append(bad) """ #print Bad_states # init the gridworld # rmap_gt is the ground truth for rewards rmap_gt = np.zeros([H, W]) rmap_gt[H - 1, W - 1] = R_MAX # rmap_gt[H-1, 0] = R_MAX gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND) rewards_gt = np.reshape(rmap_gt, H * W, order='F') P_a = gw.get_transition_mat() values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True) # use identity matrix as feature feat_map = np.eye(N_STATES) # other two features. due to the linear nature, # the following two features might not work as well as the identity. # feat_map = feature_basis(gw) # feat_map = feature_coord(gw) np.random.seed(1) trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START) rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS) #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y) np.savetxt('results/rewards.txt', rewards) #print rewards values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True) #print policy # plots plt.figure(figsize=(20, 20)) img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map', block=False) plt.plot() plt.figure(figsize=(20, 20)) img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Policy Map', block=False) plt.plot() plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    if ACT_RAND == 0:
        P_a = gw.get_transition_mat_deterministic()
    else:
        P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)
    # feat_map = np.zeros(N_STATES).reshape((H, W))
    feat_map = np.random.rand(N_STATES).reshape((H, W))
    # feat_map = np.arange(N_STATES).reshape((H, W))
    if ARGS.conv:
        # feat_map[H-1, W-1] = -5
        # feat_map[0, W-1] = -5
        # feat_map[H-1, 0] = -5
        pass
    else:
        feat_map = feat_map.reshape(N_STATES)
    # feat_map = rmap_gt

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    t = time.time()
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS, ARGS.conv, ARGS.sparse)
    print('time for dirl', time.time() - t)

    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
    print('evd', value_iteration.expected_value_diff(P_a, rewards_gt, GAMMA,
                                                     start_state_probs(trajs, N_STATES),
                                                     values_gt, policy))

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'), 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use the normalized optimal values as a smoothed reward map and re-solve
    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {}, 1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)

    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2, N_ITERS * 2)

    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'), 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'), 'Reward Map - LP', block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'), 'Reward Map - Maxent', block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'), 'Reward Map - Deep Maxent', block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Deep Policy Maxent', block=False)
    plt.show()
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, gpu_fraction):
    """
    Deep Maximum Entropy Inverse Reinforcement Learning (Deep MaxEnt IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    # tf.set_random_seed(1)
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    nn_r = FCNIRL(feat_map.shape, lr, 3, 3, gpu_fraction)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES)

    # training
    for iteration in range(n_iters):
        if iteration % max(n_iters // 10, 1) == 0:
            print('iteration: {}'.format(iteration))

        # compute the reward matrix
        rewards = nn_r.get_rewards(feat_map)
        rewards = np.reshape(rewards, [-1, 1])

        # compute policy
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

        # compute gradients on rewards
        grad_r = mu_D - mu_exp

        # apply gradients to the neural network
        grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)

    rewards = nn_r.get_rewards(feat_map)
    print("rewards: ", rewards)

    # compare several ways of post-processing the recovered rewards
    center_rewards = rewards - rewards.mean()
    print("rewards reduce mean: ", center_rewards)
    # print("gaussian normalize rewards: ", rewards - rewards.mean())

    norm_rewards = normalize(rewards)
    print("normalize rewards: ", norm_rewards)

    norm2_rewards = (rewards - rewards.mean()) / rewards.std()
    print("normalize2 rewards: ", norm2_rewards)

    sigmoid_center_rewards = 1 / (1 + np.exp(-center_rewards))
    print("sigmoid rewards: ", sigmoid_center_rewards)

    sigmoid_norm2_rewards = 1 / (1 + np.exp(-norm2_rewards))
    print("sigmoid norm2 rewards: ", sigmoid_norm2_rewards)

    # return sigmoid(normalize(rewards))
    # return normalize(rewards)
    # return sigmoid_norm2_rewards
    # return sigmoid_center_rewards
    return norm_rewards
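normalize is used by nearly every function in this section but never defined here. A minimal sketch, assuming the usual min-max scaling to [0, 1]:

# Sketch of normalize (assumed min-max scaling).
def normalize(vals):
    min_val = np.min(vals)
    max_val = np.max(vals)
    # map the smallest value to 0 and the largest to 1
    return (vals - min_val) / (max_val - min_val)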
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)
    rmap_gt = gw.get_reward_mat()

    # temporary ground-truth plots; remove the sys.exit() below to run the IRL part
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'), 'Path Map - Ground Truth', block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'), 'Path Map - Ground Truth', block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path Map - Recovered', block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards2()
    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)
    feat_map = feature_histogram(gw)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'), 'Path Map - Ground Truth', block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path Map - Recovered', block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # transition probabilities of the world: for each of the 5 actions,
    # P_a[s1, s2, a] is the probability of moving from state s1 to s2
    # given the action; getting the transition probabilities in my case
    # is just impossible ...
    P_a = gw.get_transition_mat()

    # value iteration and policy according to the current rewards
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature (one-hot encoding of the state)
    feat_map = np.eye(N_STATES)

    # two alternative features; due to the linear nature of the model,
    # they might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    np.random.seed(1)

    # sample the demonstration trajectories
    trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

    # inputs: the feature map and the transition probabilities of the world
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    pdb.set_trace()
    values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'), 'Value Map - Recovered', block=False)
    plt.show()
def main():
    # named tuple to record demonstrations
    Step = namedtuple('Step', 'cur_state action next_state reward done')

    # argument parser for command line arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-wid', '--width', default=5, type=int, help='width of the gridworld')
    parser.add_argument('-hei', '--height', default=5, type=int, help='height of the gridworld')
    parser.add_argument('-lr', '--learning_rate', default=0.01, type=float, help='learning rate')
    parser.add_argument('-l', '--l_traj', default=20, type=int, help='length of expert trajectory')
    parser.add_argument('--no-rand_start', dest='rand_start', action='store_false',
                        help='when sampling trajectories, fix start positions')
    parser.add_argument('--rand_start', dest='rand_start', action='store_true',
                        help='when sampling trajectories, randomly pick start positions')
    parser.add_argument('--approx', dest='approx', action='store_true',
                        help='flag to perform approximation of psa')
    parser.add_argument('-g', '--gamma', default=0.9, type=float, help='discount factor')
    parser.add_argument('-n', '--n_iters', default=20, type=int, help='number of iterations')
    parser.add_argument('-t', '--n_trajs', default=100, type=int, help='number of expert trajectories')
    parser.add_argument('-a', '--act_random', default=0.3, type=float, help='probability of acting randomly')

    # set default value for rand_start variable
    parser.set_defaults(rand_start=False)

    # parse arguments
    args = parser.parse_args()

    # arguments for environment and irl algorithm
    r_max = 1
    gamma = args.gamma
    width = args.width
    height = args.height
    l_traj = args.l_traj
    approx = args.approx
    n_iters = args.n_iters
    n_trajs = args.n_trajs
    act_rand = args.act_random
    rand_start = args.rand_start
    learning_rate = args.learning_rate

    # variables for number of actions and states
    n_actions = 5
    n_states = height * width

    # initialize the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([height, width])
    rmap_gt[0, width - 1] = r_max
    rmap_gt[height - 1, 0] = r_max
    rmap_gt[height - 1, width - 1] = r_max

    # create grid world instance
    gw = gridworld.GridWorld(rmap_gt, {}, 1 - act_rand)

    # get true rewards and state transition dynamics
    rewards_gt = np.reshape(rmap_gt, height * width, order='F')
    P_a_true = gw.get_transition_mat()

    trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

    # estimate state transition dynamics from the random trajectories
    P_a_approx = np.zeros((n_states, n_states, n_actions))
    for traj in trajs:
        for t in range(len(traj)):
            P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1
    # normalize each (state, action) slice into a probability distribution
    for s in range(n_states):
        for a in range(n_actions):
            if np.sum(P_a_approx[s, :, a]) != 0:
                P_a_approx[s, :, a] /= np.sum(P_a_approx[s, :, a])

    P_a = P_a_approx if approx else P_a_true

    # get true value function and policy from reward map
    values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, gamma, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(n_states)

    # two alternative features; due to the linear nature of the model,
    # they might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)

    trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

    # perform inverse reinforcement learning to get reward function
    rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
    values, _ = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 2, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 2, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (height, width), order='F'), 'Value Map - Ground Truth', block=False)
    plt.subplot(2, 2, 3)
    img_utils.heatmap2d(np.reshape(rewards, (height, width), order='F'), 'Reward Map - Recovered', block=False)
    plt.subplot(2, 2, 4)
    img_utils.heatmap2d(np.reshape(values, (height, width), order='F'), 'Value Map - Recovered', block=False)
    plt.show()

    # plots for state transition dynamics
    plt.figure(figsize=(10, 4))
    plt.subplot(2, 1, 1)
    img_utils.heatmap2d(np.reshape(P_a_true[10, :, 2], (height, width), order='F'), 'True Dist', block=False)
    plt.subplot(2, 1, 2)
    img_utils.heatmap2d(np.reshape(P_a_approx[10, :, 2], (height, width), order='F'), 'Approx Dist', block=False)
    plt.show()
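generate_random, used above to collect the exploration data behind P_a_approx, is also not shown. A sketch in the same spirit as the demonstration sampler, rolling out uniformly random actions through the true dynamics (assumed behavior):

# Sketch of generate_random: random-action rollouts for estimating P_a.
def generate_random(gw, n_actions, n_trajs=100, len_traj=20, rand_start=False, start_pos=0):
    P_a = gw.get_transition_mat()
    n_states = P_a.shape[0]
    trajs = []
    for _ in range(n_trajs):
        s = np.random.randint(n_states) if rand_start else start_pos
        episode = []
        for _ in range(len_traj):
            # pick an action uniformly at random and sample the successor
            a = np.random.randint(n_actions)
            s_next = np.random.choice(n_states, p=P_a[s, :, a])
            episode.append(Step(cur_state=s, action=a, next_state=s_next,
                                reward=0.0, done=False))
            s = s_next
        trajs.append(episode)
    return trajs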
def maxent_irl(gw, feat_map, P_a, gamma, trajs, lr, n_iters):
    """
    Maximum Entropy Inverse Reinforcement Learning (MaxEnt IRL)

    inputs:
      gw          the GridWorld environment
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                  landing at state s1 when taking action a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns:
      rewards     Nx1 vector - recovered state rewards
    """
    MRATE_THRESHOLD = 0.9
    exp_count = 0
    SLIDESIZE = 100

    # state number
    state_num = np.array([i + 1 for i in range(H * W)])
    statecount = np.zeros(H * W, dtype=float)

    # init parameters
    theta = np.random.uniform(size=(feat_map.shape[1],))

    # calc feature expectations
    feat_exp = np.zeros([feat_map.shape[1]])
    for episode in trajs:
        for step in episode:
            feat_exp += feat_map[step.cur_state, :]
    feat_exp = feat_exp / len(trajs)

    check_opt_traj = []
    update_time = []
    data_stepsize = []

    # case2
    exp_traj = [0, 1, 2, 3, 4, 5, 6, 13, 20, 19, 18, 17, 24, 31, 38, 45, 46, 47, 48]
    e_traj = [0, 1, 2, 3, 4, 5, 6, 13, 20, 19, 18, 17, 24, 31, 38, 45, 46, 47, 48]
    # case1
    """
    exp_traj = [0, 7, 14, 15, 22, 29, 28, 35, 42, 43, 44, 45, 46, 47, 48]
    e_traj = [0, 7, 14, 15, 22, 29, 28, 35, 42, 43, 44, 45, 46, 47, 48]
    """
    # case3
    """
    exp_traj = [0, 1, 8, 15, 22, 31, 32, 33, 40, 41, 48]
    e_traj = [0, 1, 8, 15, 22, 31, 32, 33, 40, 41, 48]
    """
    # case3 new
    """
    exp_traj = [0, 1, 8, 15, 22, 23, 24, 31, 32, 33, 40, 41, 48]
    e_traj = [0, 1, 8, 15, 22, 23, 24, 31, 32, 33, 40, 41, 48]
    """
    # case4
    """
    exp_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 32, 33, 40, 41, 48]
    e_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 32, 33, 40, 41, 48]
    """
    # exp3
    """
    exp_traj = [0, 1, 2, 3, 4, 5, 6, 13, 20, 27, 26, 25, 24, 31, 38, 45, 46, 47, 48]
    e_traj = [0, 1, 2, 3, 4, 5, 6, 13, 20, 27, 26, 25, 24, 31, 38, 45, 46, 47, 48]
    """
    # exp3 case1
    """
    exp_traj = [0, 1, 2, 3, 4, 5, 6, 13, 12, 11, 10, 9, 16, 23, 30, 37, 44, 45, 46, 47, 48]
    e_traj = [0, 1, 2, 3, 4, 5, 6, 13, 12, 11, 10, 9, 16, 23, 30, 37, 44, 45, 46, 47, 48]
    """
    # exp3 case2
    """
    exp_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 24, 17, 10, 11, 12, 19, 26, 33, 40, 47, 48]
    e_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 24, 17, 10, 11, 12, 19, 26, 33, 40, 47, 48]
    """
    # exp3 case2 new
    """
    exp_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 24, 17, 10, 11, 12, 13, 20, 27, 34, 41, 48]
    e_traj = [0, 1, 8, 15, 14, 21, 28, 29, 30, 31, 24, 17, 10, 11, 12, 13, 20, 27, 34, 41, 48]
    """

    select_candidate = []
    maxstate = 0
    overstate = []

    # training
    for iteration in tqdm(range(n_iters)):
        # print('iteration: {}/{}'.format(iteration, n_iters))

        # compute reward function (only needed on the first pass; later
        # iterations reuse the rewards updated at the end of the loop body)
        if iteration == 0:
            rewards = np.dot(feat_map, theta)

        # compute policy
        _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=False)
        _, true_policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

        """
        if iteration % 150 == 0:
            plt.figure(figsize=(20, 20))
            img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map', block=False)
            plt.plot()
            now = datetime.datetime.now()
            figname = "results/reward/rewards_{0:%m%d%H%M%S}".format(now) + ".png"
            plt.savefig(figname)
            plt.figure(figsize=(20, 20))
            img_utils.heatmap2d(np.reshape(value, (H, W), order='F'), 'Value Map', block=False)
            plt.plot()
            figname1 = "results/value/value_{0:%m%d%H%M%S}".format(now) + ".png"
            plt.savefig(figname1)
            # plt.show()
        """

        # compute new trajectory
        # new_trajs = generate_newtrajs(gw, true_policy, n_trajs=100, len_traj=30, rand_start=False)
        # opt_traj = get_optimaltrajectory(true_policy, 7, 7, 20)
        # if terminal == 48
        candidate = get_trajectory_egreedy(true_policy, 7, 7, 20)
        re_candidate = sorted(set(candidate), key=candidate.index)
        e_traj = copy.deepcopy(re_candidate)
        if exp_traj[-2] in re_candidate:
            select_candidate = copy.deepcopy(re_candidate)
        print(" ")
        print("candidate ", candidate)
        print("re_candidate", re_candidate)
        print("exp_traj ", exp_traj)

        # compare the epsilon-greedy trajectory with the current expert trajectory
        m_rate = match_rate('simple', e_traj, exp_traj)
        print(m_rate)
        # m_threshold = tune_rate(iteration, n_iters, MRATE_THRESHOLD, update_time)
        # print("m_threshold", m_threshold)

        # adopt the candidate as the new expert trajectory if it is shorter,
        # new, similar enough, and still reaches the terminal state 48
        if (len(exp_traj) > len(e_traj)) and (check_opt_traj != e_traj) \
                and (m_rate >= MRATE_THRESHOLD) and (48 in e_traj):
            trajs = make_traj(20, e_traj)
            exp_count += 1
            feat_exp = np.zeros([feat_map.shape[1]])
            for episode in trajs:
                for step in episode:
                    feat_exp += feat_map[step.cur_state, :]
            feat_exp = feat_exp / len(trajs)
            check_opt_traj = e_traj
            exp_traj = e_traj
            update_time.append(iteration)
        """
        if iteration == 100:
            trajs = tj.exp1_case3_correct()
            exp_count += 1
            feat_exp = np.zeros([feat_map.shape[1]])
            for episode in trajs:
                for step in episode:
                    feat_exp += feat_map[step.cur_state, :]
            feat_exp = feat_exp / len(trajs)
            check_opt_traj = [0, 1, 8, 15, 16, 23, 24, 31, 32, 33, 40, 41, 48]
            exp_traj = [0, 1, 8, 15, 16, 23, 24, 31, 32, 33, 40, 41, 48]
            update_time.append(iteration)
        """
        # print("exp_traj ", exp_traj)
        print("update_time", update_time)
        data_stepsize.append(len(check_opt_traj))
        '''
        with open('results/step_size.csv', 'a') as f:
            f.write(str(iteration))
            f.write(",")
            f.write(str(data_stepsize[iteration]))
            f.write('\n')
        '''
        '''
        with open('results/expert.csv', 'a') as f:
            f.write(str(iteration))
            f.write(",")
            for state in exp_traj:
                f.write(str(state))
                f.write(",")
            f.write('\n')
        '''
        '''
        with open('results/candidate.csv', 'a') as f:
            f.write(str(iteration))
            f.write(",")
            for state in candidate:
                f.write(str(state))
                f.write(",")
            f.write('\n')
        '''
        '''
        with open('results/re_candidate.csv', 'a') as f:
            f.write(str(iteration))
            f.write(",")
            for state in re_candidate:
                f.write(str(state))
                f.write(",")
            f.write('\n')
        '''
        '''
        with open('results/select_candidate.csv', 'a') as f:
            f.write(str(iteration))
            f.write(",")
            for state in select_candidate:
                f.write(str(state))
                f.write(",")
            f.write('\n')
        '''

        # compute state visitation frequencies
        svf = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=False)

        # compute gradients
        grad = feat_exp - feat_map.T.dot(svf)

        # update params
        theta += lr * grad
        rewards = np.dot(feat_map, theta)
        rewards = normalize(rewards)

        # count how often each state shows up in the epsilon-greedy candidates
        for t in range(len(candidate)):
            statecount[candidate[t]] += 1
        if iteration % SLIDESIZE == 0 and iteration != 0:
            fig = plt.figure()
            left = state_num
            height = statecount
            # minmax_h = min_max(height)
            plt.bar(left, height, color="#FF5B70")
            plt.title("iteration{0}".format(iteration))
            plt.savefig('results/statecount{0}'.format(iteration) + '.png')
            print(height)
            # note: overstate aliases theta, so zeroing an entry here also
            # zeroes the corresponding parameter directly
            overstate = theta
            for i in range(len(height)):
                if height[i] > SLIDESIZE + 50:
                    overstate[i] = 0.0
                    print("overstate{0}".format(i))
            statecount = np.zeros(H * W, dtype=int)
        if iteration > SLIDESIZE:
            theta = overstate
            rewards = np.dot(feat_map, theta)

    # return rewards
    return normalize(rewards)
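match_rate, make_traj and get_trajectory_egreedy are not included in this section. Below is a hypothetical sketch of match_rate only, under the assumption that 'simple' mode scores a candidate trajectory by the fraction of its states that also appear in the reference trajectory; the original scoring rule may differ:

# Hypothetical sketch of match_rate (assumed behavior, not the original).
def match_rate(mode, candidate_traj, reference_traj):
    if mode != 'simple':
        raise NotImplementedError(mode)
    if not candidate_traj:
        return 0.0
    # fraction of candidate states that the reference trajectory also visits
    hits = sum(1 for s in candidate_traj if s in reference_traj)
    return float(hits) / len(candidate_traj)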