Example #1
def main():
    # init the gridworld
    rmap_gt = np.zeros(N_STATES)
    rmap_gt[N_STATES - 5] = R_MAX
    rmap_gt[10] = R_MAX

    gw = gridworld1d.GridWorld1D(rmap_gt, {}, ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rmap_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # gradient rewards
    rmap_gt = values_gt
    gw = gridworld1d.GridWorld1D(rmap_gt, {}, ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rmap_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # np.random.seed(1)
    trajs = gw.generate_demonstrations(policy_gt,
                                       n_trajs=N_TRAJS,
                                       len_traj=L_TRAJ,
                                       rand_start=RAND_START)

    # feat_map = np.eye(N_STATES)
    feat_map = np.array([feat(s) for s in range(N_STATES)])
    test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map)
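The helper feat used above is not shown in this snippet; a minimal sketch, assuming a simple one-hot feature per state of the 1-D gridworld (a hypothetical stand-in, not the repository's actual definition):

import numpy as np

N_STATES = 25  # hypothetical size; the script defines this as a global

def feat(s):
    # one-hot feature vector for state s
    f = np.zeros(N_STATES)
    f[s] = 1.0
    return f

With this choice, the feat_map built above reduces to the identity matrix used by the commented-out line.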
Example #2
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
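These scripts flatten the H x W reward grid with order='F' (column-major), so grid cell (i, j) lands at flat state index i + j * H, and the heatmap plots undo it with the same order. A small self-contained check of that convention:

import numpy as np

H, W = 3, 4                                # tiny hypothetical grid
grid = np.arange(H * W).reshape(H, W)
flat = np.reshape(grid, H * W, order='F')  # column-major flattening, as in the examples

i, j = 2, 1
assert flat[i + j * H] == grid[i, j]       # cell (i, j) maps to flat index i + j*H
assert (np.reshape(flat, (H, W), order='F') == grid).all()  # round trip used for plotting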
Example #3
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    feat_map_torch = torch.tensor(feat_map, dtype=torch.float)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map_torch, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)
    #rewards = rewards.detach().numpy()
    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
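value_iteration.value_iteration is called in every example but its source is not listed here; a minimal sketch with the same call signature, assuming standard tabular value iteration (the repository's implementation may differ in details such as the stochastic policy it returns):

import numpy as np

def value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True):
    # returns (values, policy) for the tabular MDP given by P_a and rewards
    n_states, _, n_actions = np.shape(P_a)
    values = np.zeros(n_states)

    while True:
        # Q[s, a] = r(s) + gamma * sum_s' P(s'|s,a) V(s')
        q = np.array([[rewards[s] + gamma * P_a[s, :, a].dot(values)
                       for a in range(n_actions)] for s in range(n_states)])
        new_values = q.max(axis=1)
        if np.abs(new_values - values).max() < error:
            values = new_values
            break
        values = new_values

    if deterministic:
        policy = q.argmax(axis=1)          # one greedy action per state
    else:
        # stochastic policy: softmax over Q-values (one plausible choice)
        policy = np.exp(q - q.max(axis=1, keepdims=True))
        policy /= policy.sum(axis=1, keepdims=True)
    return values, policy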
Example #4
def maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
    """
	Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

	inputs:
		feat_map	NxD matrix - the features for each state
		P_a			NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of 
					landing at state s1 when taking action a at state s0
		gamma		float - RL discount factor
		trajs		a list of demonstrations
		lr			float - learning rate
		n_iters		int - number of optimization steps

	returns
		rewards		Nx1 vector - recovered state rewards
	"""
    n_states, _, n_actions = np.shape(P_a)

    # init parameters
    theta = np.random.uniform(size=(feat_map.shape[1], ))

    # calc feature expectations
    feat_exp = np.zeros([feat_map.shape[1]])
    for episode in trajs:
        for step in episode:
            feat_exp += feat_map[step.cur_state, :]
    feat_exp = feat_exp / len(trajs)

    # training
    for iteration in range(n_iters):
        if iteration % (n_iters // 20) == 0:
            print('iteration: {}/{}'.format(iteration, n_iters))

        # compute reward function
        rewards = np.dot(feat_map, theta)

        # compute policy
        _, policy = value_iteration.value_iteration(P_a,
                                                    rewards,
                                                    gamma,
                                                    error=0.01,
                                                    deterministic=False)

        # compute state visitation frequencies
        svf = compute_state_visition_freq(P_a,
                                          gamma,
                                          trajs,
                                          policy,
                                          deterministic=False)

        # compute gradients
        grad = feat_exp - feat_map.T.dot(svf)

        # update params
        theta += lr * grad

    rewards = np.dot(feat_map, theta)

    # return sigmoid(normalize(rewards))
    return normalize(rewards)
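compute_state_visition_freq is used above without being listed; a minimal sketch of the expected state-visitation-frequency recursion from MaxEnt IRL that such a helper typically implements, keeping the parameter names used in the calls (an assumption about its behaviour, not the actual source):

import numpy as np

def compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True):
    # expected state visitation frequencies under `policy`, over the demo horizon
    # (gamma is accepted for signature compatibility but unused in this finite-horizon sketch)
    n_states, _, n_actions = np.shape(P_a)
    T = len(trajs[0])

    mu = np.zeros((n_states, T))
    for traj in trajs:                     # empirical start-state distribution
        mu[traj[0].cur_state, 0] += 1.0
    mu[:, 0] /= len(trajs)

    for t in range(T - 1):
        for s in range(n_states):
            if deterministic:
                mu[s, t + 1] = sum(mu[pre, t] * P_a[pre, s, int(policy[pre])]
                                   for pre in range(n_states))
            else:
                mu[s, t + 1] = sum(mu[pre, t] * policy[pre, a] * P_a[pre, s, a]
                                   for pre in range(n_states)
                                   for a in range(n_actions))
    return mu.sum(axis=1)                  # length-N vector of visitation frequencies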
Example #5
    def assert_vi(P_a):
        assert_values, assert_policy = value_iteration.value_iteration(
            P_a, rewards, gamma, error=0.000001, deterministic=deterministic)
        assert_values_old, assert_policy_old = value_iteration.value_iteration_old(
            P_a, rewards, gamma, error=0.000001, deterministic=deterministic)

        if len(P_a) == 3:
            assert_values2 = value_iteration.optimal_value(N_STATES,
                                                           N_ACTIONS,
                                                           P_a_t,
                                                           rewards,
                                                           gamma,
                                                           threshold=0.000001)

            assert (np.abs(assert_values - assert_values2) < 0.0001).all()

        assert (np.abs(assert_values - assert_values_old) < 0.0001).all()
        assert (np.abs(values - assert_values) < 0.0001).all()
        assert (np.abs(values - assert_values_old) < 0.0001).all()

        # print(assert_policy)
        # print(assert_policy_old)
        # print(policy)
        # print(values)
        # print(assert_values)
        # print(rewards)
        assert (np.abs(assert_policy - assert_policy_old) < 0.0001).all()
        assert (np.abs(policy - assert_policy) < 0.0001).all()
        assert (np.abs(policy - assert_policy_old) < 0.0001).all()
Example #6
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
  """
  Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

  inputs:
    feat_map    NxD matrix - the features for each state
    P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of 
                                       landing at state s1 when taking action 
                                       a at state s0
    gamma       float - RL discount factor
    trajs       a list of demonstrations
    lr          float - learning rate
    n_iters     int - number of optimization steps

  returns
    rewards     Nx1 vector - recovered state rewards
  """

  # tf.set_random_seed(1)
  
  N_STATES, _, N_ACTIONS = np.shape(P_a)

  # init nn model
  import datetime  
  nn_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
  nn_r = DeepIRLFC(feat_map.shape[1], lr, 3, 3, name=nn_name)

  # find state visitation frequencies using demonstrations
  mu_D = demo_svf(trajs, N_STATES)

  # training 
  for iteration in range(n_iters):
    if iteration % (n_iters // 10) == 0:
      print('iteration: {}'.format(iteration))
    
    # compute the reward matrix
    rewards = nn_r.get_rewards(feat_map)
    
    # compute policy 
    _, policy = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)
    
    # compute expected svf
    mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)
    
    # compute gradients on rewards:
    grad_r = mu_D - mu_exp

    # apply gradients to the neural network
    grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)
    

  rewards = nn_r.get_rewards(feat_map)
  nn_r.finished()

  #return normalize(rewards, -1, 1)
  return rewards
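demo_svf, which supplies the demonstration-side visitation frequencies above, is likewise not shown; a minimal sketch assuming it averages state visit counts over the demonstration trajectories:

import numpy as np

def demo_svf(trajs, n_states):
    # empirical state visitation frequency of the demonstrations
    mu = np.zeros(n_states)
    for traj in trajs:
        for step in traj:
            mu[step.cur_state] += 1.0
    return mu / len(trajs)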
Example #7
 def __init__(self,
              gamma=0.9,
              act_rand=0.3,
              r_max=1,
              h=10,
              w=10,
              n_trajs=100,
              l_traj=20,
              rand_start=True,
              learning_rate=0.02,
              n_iters=20,
              save_dir="./exps",
              exp_name="gw_" + str(int(time.time())),
              n_exp=20,
              feat_map=None,
              gpu_fraction=0.2,
              terminal=True):
     self._gamma, self._act_rand, self._r_max, self._h, self._w, self._n_trajs, self._l_traj, self._rand_start, \
     self._learning_rate, self._n_iters, self._save_dir, self._exp_name, self._n_exp = \
       gamma, act_rand, r_max, h, w, n_trajs, l_traj, rand_start, learning_rate, n_iters, save_dir, exp_name, n_exp
     self._exp_result_path = save_dir + "/" + exp_name
     if not os.path.exists(self._exp_result_path):
         os.makedirs(self._exp_result_path)
     else:
         logging.warning(self._exp_result_path + " already exists")
         exit()
     rmap_gt = np.zeros([h, w])
     rmap_gt[h - 1, w - 1] = rmap_gt[0, w - 1] = rmap_gt[h - 1, 0] = r_max
     if terminal:
         self._gw = gridworld.GridWorld(rmap_gt,
                                        {(h - 1, w - 1), (0, w - 1),
                                         (h - 1, 0)}, 1 - ACT_RAND)
     else:
         self._gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
     self._rewards_gt = np.reshape(rmap_gt, H * W, order='F')
     self._P_a = self._gw.get_transition_mat()
     ts = time.time()
     self._values_gt, self._policy_gt = value_iteration.value_iteration(
         self._P_a, self._rewards_gt, GAMMA, error=0.01, deterministic=True)
     te = time.time()
     print "value iteration time of ground truth: ", te - ts
     ts = time.time()
     self.save_plt("gt", (3 * w, h), self._rewards_gt, self._values_gt,
                   self._policy_gt)
     te = time.time()
     print "saving plt time: ", te - ts
     self._demo_trajs = self.generate_demonstrations()
     self._feat_map = np.eye(h * w) if feat_map is None else feat_map
     self._gpu_fraction = gpu_fraction
Example #8
def main():
    for seed in range(1):
        N_STATES = H * W
        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        #goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX
        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # other two features. due to the linear nature,
        # the following two features might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)
        np.random.seed(0)
        #trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

        trajs = mod.exp1_case2()
        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                             N_ITERS)

        #np.savetxt('results/rewards.txt', rewards)

        #values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                            'Reward Map',
                            block=False)
        plt.plot()
        #now = datetime.datetime.now()
        #figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
Example #9
 def test_once(self, exp_id):
     tf.reset_default_graph()
     print('Deep Max Ent IRL training ..')
     ts = time.time()
     rewards = deep_maxent_irl(self._feat_map, self._P_a, GAMMA,
                               self._demo_trajs, self._learning_rate,
                               self._n_iters, self._gpu_fraction)
     te = time.time()
     print('IRL time: ', te - ts)
     ts = time.time()
     values, policy = value_iteration.value_iteration(self._P_a,
                                                      rewards,
                                                      self._gamma,
                                                      error=0.01,
                                                      deterministic=True)
     te = time.time()
     print('value iteration time of recovered: ', te - ts)
     # plots
     ts = time.time()
     self.save_plt(exp_id, (3 * self._w, self._h), rewards, values, policy)
     te = time.time()
     print('saving plt time: ', te - ts)
Example #10
def test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map):
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)
    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)

    # plots
    plt.figure(figsize=(20, 8))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(to_plot(rmap_gt),
                        'Rewards Map - Ground Truth',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(to_plot(rewards_lpirl),
                        'Reward Map - LP',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(to_plot(rewards_maxent),
                        'Reward Map - Maxent',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(to_plot(rewards),
                        'Reward Map - Deep Maxent',
                        block=False,
                        text=False)
    plt.show()
Example #11
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters):
    """
  Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

  inputs:
    feat_map    NxD matrix - the features for each state
    P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of 
                                       landing at state s1 when taking action 
                                       a at state s0
    gamma       float - RL discount factor
    trajs       a list of demonstrations
    lr          float - learning rate
    n_iters     int - number of optimization steps

  returns
    rewards     Nx1 vector - recovered state rewards
  """

    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    net = IRLNet(lr)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES)

    mu_D = torch.tensor(mu_D, requires_grad=True)

    # Init optimizer
    optimizer = optim.SGD(net.parameters(), lr=net.lr, weight_decay=1e-5)

    # training
    for iteration in range(n_iters):
        if iteration % (n_iters // 10) == 0:
            print('Training Step: {}'.format(iteration))

        # compute the reward matrix
        rewards = net(feat_map)

        rewards_np = rewards.detach().numpy()
        # compute policy
        _, policy = value_iteration.value_iteration(P_a,
                                                    rewards_np,
                                                    gamma,
                                                    error=0.01,
                                                    deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a,
                                             gamma,
                                             trajs,
                                             policy,
                                             deterministic=True)
        mu_exp = torch.tensor(mu_exp, requires_grad=True)
        # compute gradients on rewards:
        grad = mu_D - mu_exp
        grad = grad * -1

        # Clear gradient buffer
        optimizer.zero_grad()

        # apply gradients to the neural network
        rewards.backward(grad)

        # Gradient clipping to prevent exploding gradients -> NaN in multiplication
        nn.utils.clip_grad_norm_(net.parameters(), 100.0)

        optimizer.step()

    # Final forward pass and return
    rewards = net(feat_map).detach().numpy()

    return normalize(rewards)
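The IRLNet module used above is not included in the snippet; a minimal sketch of a fully connected reward network with the same constructor call and the `lr` attribute the training loop reads (a hypothetical stand-in; the default feature width n_input is an assumption, e.g. H*W for one-hot features):

import torch
import torch.nn as nn

class IRLNet(nn.Module):
    # maps an N x D feature matrix to an N-vector of state rewards
    def __init__(self, lr, n_input=400):   # n_input: assumed feature width
        super().__init__()
        self.lr = lr                        # read by the training loop above
        self.net = nn.Sequential(
            nn.Linear(n_input, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, feat_map):
        return self.net(feat_map).squeeze(-1)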
Example #12
  def deep_maxent_irl(self, feat_map, P_a, gamma, trajs, lr, n_iters, rewards_gt, policy_gt, mapSize, H, W, r_weight, p_weight, proposed):
    """
    Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

    inputs:
      feat_map    NxD matrix - the features for each state
      P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                                         landing at state s1 when taking action
                                         a at state s0
      gamma       float - RL discount factor
      trajs       a list of demonstrations
      lr          float - learning rate
      n_iters     int - number of optimization steps

    returns
      rewards     Nx1 vector - recovered state rewards
    """
    self.finetune(allow = True)

    self.lr = lr
    self.rewards_gt = rewards_gt
    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    feat_map = feat_map.view(1, 1, mapSize, mapSize)
    feat_map = Variable(feat_map)


    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES, r_weight, p_weight, proposed)
    self.networks.train()
    iteration  = 0
    # training
    for iteration in range(n_iters):
    #while True:
      self.networks.zero_grad()

      # compute the reward matrix
      rewards = self.networks(feat_map)

      # compute policy
      rewards = rewards.view(mapSize, 1)
      _, policy = value_iteration.value_iteration(P_a, rewards, gamma, H * W, error=0.01, deterministic=True, npy=False)

      #if iteration % (n_iters / 10) == 0:
        #temp_rewards = normalize(rewards.data.numpy())
        #temp_rewards_gt = rewards_gt.data.numpy()
        # temp_evd = compute_expected_value_difference(policy, temp_rewards_gt, policy_gt, gamma, H, W)
        #reward_diff = np.abs(np.reshape(temp_rewards_gt, (H * W, 1)) - temp_rewards)
        #print 'iteration: {},'.format(iteration), 'Sum of diff : {},'.format(np.sum(reward_diff)), '# of exceed 0.3 : {}'.format(len(np.extract(reward_diff > 0.3, reward_diff)))

      # dyna algorithm
      # if iteration % (n_iters/10) == 0 and iteration/ (n_iters/10) > 30:
      #  # get new trajs
      #  trajs_new = generate_demonstrations(gw, policy, n_trajs=20, len_traj=20, rand_start=True)
      #  trajs+=trajs_new

      #  # update gt_svf
      #  mu_D = demo_svf(trajs, N_STATES)

      # compute expected svf
      mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

      # compute gradients on rewards:
      rewards = rewards.view(mapSize)

      # Set Gradient
      torch.autograd.backward([rewards], [-(mu_D-mu_exp)*self.lr])

      # Update / Optimizer: SGD
      self.optimizer.step()


    # get output
    self.networks.eval()
    rewards = self.networks(feat_map)
    rewards = rewards.view(mapSize, 1)
    _, policy = value_iteration.value_iteration(P_a, rewards, gamma, H * W, error=0.01, deterministic=True, npy=False)
    mu_exp = compute_state_visition_freq(P_a, gamma, trajs, policy, deterministic=True)

    # return sigmoid(normalize(rewards))
    rewards = rewards.view(mapSize, 1)
    rewards = rewards.data.numpy()  # for normalize
    return normalize(rewards), mu_D, mu_exp
Example #13
def deep_siamese_maxent_irl(feat_map, feat_map_inv, P_a, gamma, trajs,
                            trajs_inv, lr, n_iters):
    """
  Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

  inputs:
    feat_map    NxD matrix - the features for each state
    P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of 
                                       landing at state s1 when taking action 
                                       a at state s0
    gamma       float - RL discount factor
    trajs       a list of demonstrations
    lr          float - learning rate
    n_iters     int - number of optimization steps

  returns
    rewards     Nx1 vector - recovered state rewards
  """

    # tf.set_random_seed(1)

    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    nn_r = DeepIRLP(feat_map.shape[1], lr, 3, 3)

    # find state visitation frequencies using demonstrations
    mu_D = find_svf(N_STATES, trajs)
    values = np.zeros(feat_map.shape[0])
    # training
    for iteration in range(n_iters):
        if iteration % (n_iters // 10) == 0:
            print('iteration: {}'.format(iteration))

        # compute the reward matrix
        rewards = nn_r.get_rewards(feat_map, feat_map_inv)

        # compute policy
        values, policy = value_iteration.value_iteration(P_a,
                                                         rewards,
                                                         gamma,
                                                         error=0.01,
                                                         deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a,
                                             gamma,
                                             trajs,
                                             policy,
                                             deterministic=True)

        # compute gradients on rewards:
        grad_r = mu_D - mu_exp

        # apply gradients to the neural network
        grad_theta, l2_loss, grad_norm = nn_r.apply_grads(
            feat_map, feat_map_inv, grad_r)

    rewards = nn_r.get_rewards(feat_map, feat_map_inv)
    # print(rewards)
    # return sigmoid(normalize(rewards))
    return normalize(rewards)
Example #14
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (
        pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
        pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)

    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))

    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))

    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))

    feat_map = np.hstack((coast_map, forest_map, land_map))

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)

            trajs[x].append(
                Step(cur_state=int(gw.pos2idx(loc)),
                     action=action,
                     next_state=int(gw.pos2idx(next_loc)),
                     reward=reward,
                     done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                N_ITERS)
    #   print 'Deep Max Ent IRL training ..'
    #   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    fig.savefig('MaxEnt.png')
Example #15
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    """while True:
      print "BAD_STATE入力"
      bad = raw_input('>> ')
      if bad == 'ok':
          break
      Bad_states.append(bad)
  """

    #print Bad_states
    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y)

    np.savetxt('results/rewards.txt', rewards)

    #print rewards

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    #print policy
    # plots
    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map',
                        block=False)
    plt.plot()

    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Policy Map',
                        block=False)
    plt.plot()
    plt.show()
Example #16
0
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    if ACT_RAND == 0:
        P_a = gw.get_transition_mat_deterministic()
    else:
        P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    #feat_map = np.eye(N_STATES)
    # feat_map = np.zeros(N_STATES).reshape((H, W))
    feat_map = np.random.rand(N_STATES).reshape((H, W))
    #feat_map = np.arange(N_STATES).reshape((H, W))
    if ARGS.conv:
        #feat_map[H-1, W-1] = -5
        #feat_map[0, W-1] = -5
        #feat_map[H-1, 0] = -5
        pass
    else:
        feat_map = feat_map.reshape(N_STATES)
    #feat_map = rmap_gt

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    t = time.time()
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS, ARGS.conv, ARGS.sparse)
    print('time for dirl', time.time() - t)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    print(
        'evd',
        value_iteration.expected_value_diff(P_a, rewards_gt, GAMMA,
                                            start_state_probs(trajs, N_STATES),
                                            values_gt, policy))

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
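start_state_probs, used for the expected-value-difference metric above, is not listed; a minimal sketch assuming it returns the empirical distribution over trajectory start states:

import numpy as np

def start_state_probs(trajs, n_states):
    # empirical probability of each state being a trajectory's start state
    p0 = np.zeros(n_states)
    for traj in trajs:
        p0[traj[0].cur_state] += 1.0
    return p0 / len(trajs)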
Example #17
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                   N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Policy Maxent',
                        block=False)
    plt.show()
Example #18
def deep_maxent_irl(feat_map, P_a, gamma, trajs, lr, n_iters, gpu_fraction):
    """
  Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

  inputs:
    feat_map    NxD matrix - the features for each state
    P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                                       landing at state s1 when taking action
                                       a at state s0
    gamma       float - RL discount factor
    trajs       a list of demonstrations
    lr          float - learning rate
    n_iters     int - number of optimization steps

  returns
    rewards     Nx1 vector - recovered state rewards
  """

    # tf.set_random_seed(1)

    N_STATES, _, N_ACTIONS = np.shape(P_a)

    # init nn model
    nn_r = FCNIRL(feat_map.shape, lr, 3, 3, gpu_fraction)

    # find state visitation frequencies using demonstrations
    mu_D = demo_svf(trajs, N_STATES)

    # training
    for iteration in range(n_iters):
        if iteration % (n_iters // 10) == 0:
            print('iteration: {}'.format(iteration))

        # compute the reward matrix
        rewards = nn_r.get_rewards(feat_map)
        rewards = np.reshape(rewards, [-1, 1])
        # compute policy
        _, policy = value_iteration.value_iteration(P_a,
                                                    rewards,
                                                    gamma,
                                                    error=0.01,
                                                    deterministic=True)

        # compute expected svf
        mu_exp = compute_state_visition_freq(P_a,
                                             gamma,
                                             trajs,
                                             policy,
                                             deterministic=True)

        # compute gradients on rewards:
        grad_r = mu_D - mu_exp

        # apply gradients to the neural network
        grad_theta, l2_loss, grad_norm = nn_r.apply_grads(feat_map, grad_r)

    rewards = nn_r.get_rewards(feat_map)
    print "rewards: ", rewards
    center_rewards = rewards - rewards.mean()
    print "rewards reduce mean: ", center_rewards
    # print "gaussian normalize rewards: ", rewards - rewards.mean()
    norm_rewards = normalize(rewards)
    print "normalize rewards: ", norm_rewards
    norm2_rewards = (rewards - rewards.mean()) / rewards.std()
    print "normalize2 rewards: ", norm2_rewards
    sigmoid_center_rewards = 1 / (1 + np.exp(-center_rewards))
    print "sigmoid rewards: ", sigmoid_center_rewards
    sigmoid_norm2_rewards = 1 / (1 + np.exp(-norm2_rewards))
    print "sigmoid norm2 rewards: ", sigmoid_norm2_rewards
    # return sigmoid(normalize(rewards))
    # return normalize(rewards)
    # return sigmoid_norm2_rewards
    # return sigmoid_center_rewards
    return norm_rewards
Example #19
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    rmap_gt = gw.get_reward_mat()

    #temp
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)
    plt.show()
Example #20
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards2()

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    # use identity matrix as feature
    ## feat_map = np.eye(N_STATES)
    feat_map = feature_histogram(gw)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)

    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)

    plt.show()
Example #21
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # the transition probabilities of the world: for each of the 5 actions, the
    # probability of moving from state s1 to s2 given that action
    # (getting the transition probabilities in my case is just impossible ...)
    P_a = gw.get_transition_mat()

    # value iteration and policy according to the current rewards
    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)  #features as one hot encoding

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(
        gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ,
        rand_start=RAND_START)  # the demonstration trajectories

    rewards = maxent_irl(
        feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS
    )  # inputs: the feature map and the transition probabilities of the world

    pdb.set_trace()

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
Example #22
def main():

	# named tuple to record demonstrations
	Step = namedtuple('Step','cur_state action next_state reward done')

	# argument parser for command line arguments
	parser = argparse.ArgumentParser(description=None)

	parser.add_argument('-wid', '--width', default=5, type=int, 
						help='width of the gridworld')
	parser.add_argument('-hei', '--height', default=5, type=int, 
						help='height of the gridworld')
	parser.add_argument('-lr', '--learning_rate', default=0.01, type=float, 
						help='learning rate')
	parser.add_argument('-l', '--l_traj', default=20, type=int, 
						help='length of expert trajectory')

	parser.add_argument('--no-rand_start', dest='rand_start', action='store_false', 
						help='when sampling trajectories, fix start positions')
	parser.add_argument('--rand_start', dest='rand_start', action='store_true', 
						help='when sampling trajectories, randomly pick start positions')
	parser.add_argument('--approx', dest='approx', action='store_true', 
						help='flag to perform approximation of psa')

	parser.add_argument('-g', '--gamma', default=0.9, type=float, 
						help='discount factor')
	parser.add_argument('-n', '--n_iters', default=20, type=int, 
						help='number of iterations')
	parser.add_argument('-t', '--n_trajs', default=100, type=int, 
						help='number of expert trajectories')
	parser.add_argument('-a', '--act_random', default=0.3, type=float, 
						help='probability of acting randomly')
	
	# set default value for rand_start variable
	parser.set_defaults(rand_start=False)

	# parse and print arguments
	args = parser.parse_args()

	# arguments for environment and irl algorithm
	r_max = 1 
	gamma = args.gamma
	width = args.width
	height = args.height
	l_traj = args.l_traj
	approx = args.approx
	n_iters = args.n_iters
	n_trajs = args.n_trajs
	act_rand = args.act_random
	rand_start = args.rand_start
	learning_rate = args.learning_rate

	# variables for number of actions and states
	n_actions = 5
	n_states = height * width

	# initialize the gridworld
	# rmap_gt is the ground truth for rewards
	rmap_gt = np.zeros([height, width])

	rmap_gt[0, width-1] = r_max
	rmap_gt[height-1, 0] = r_max
	rmap_gt[height-1, width-1] = r_max

	# create grid world instance
	gw = gridworld.GridWorld(rmap_gt, {}, 1-act_rand)

	# get true rewards, state transition dynamics
	rewards_gt = np.reshape(rmap_gt, height*width, order='F')
	P_a_true = gw.get_transition_mat()

	trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

	# get approximation of state transition dynamics
	P_a_approx = np.zeros((n_states, n_states, n_actions))
	for traj in trajs:
		for t in range(len(traj)):
			P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1

	for s in range(n_states):
		for a in range(n_actions):
			if np.sum(P_a_approx[s,:,a]) != 0:
				P_a_approx[s,:,a] /= np.sum(P_a_approx[s,:,a])

	if approx:
		P_a = P_a_approx
	else:
		P_a = P_a_true

	# get true value function and policy from reward map
	values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, gamma, error=0.01, deterministic=True)

	# use identity matrix as feature
	feat_map = np.eye(n_states)

	# other two features. due to the linear nature, 
	# the following two features might not work as well as the identity.
	# feat_map = feature_basis(gw)
	# feat_map = feature_coord(gw)

	trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs, len_traj=l_traj, 
									rand_start=rand_start)

	# perform inverse reinforcement learning to get reward function
	rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
	values, _ = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

	# plots
	plt.figure(figsize=(20,4))
	plt.subplot(2, 2, 1)
	img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
	plt.subplot(2, 2, 2)
	img_utils.heatmap2d(np.reshape(values_gt, (height,width), order='F'), 'Value Map - Ground Truth', block=False)
	plt.subplot(2, 2, 3)
	img_utils.heatmap2d(np.reshape(rewards, (height,width), order='F'), 'Reward Map - Recovered', block=False)
	plt.subplot(2, 2, 4)
	img_utils.heatmap2d(np.reshape(values, (height,width), order='F'), 'Value Map - Recovered', block=False)
	plt.show()

	# plots for state transition dynamics
	plt.figure(figsize=(10,4))
	plt.subplot(2, 1, 1)
	img_utils.heatmap2d(np.reshape(P_a_true[10,:,2], (height,width), order='F'), 'True Dist', block=False)
	plt.subplot(2, 1, 2)
	img_utils.heatmap2d(np.reshape(P_a_approx[10,:,2], (height,width), order='F'), 'Approx Dist', block=False)
	plt.show()
Example #23
def maxent_irl(gw, feat_map, P_a, gamma, trajs, lr, n_iters):
    """
  Maximum Entropy Inverse Reinforcement Learning (Maxent IRL)

  inputs:
    feat_map    NxD matrix - the features for each state
    P_a         NxNxN_ACTIONS matrix - P_a[s0, s1, a] is the transition prob of
                                       landing at state s1 when taking action
                                       a at state s0
    gamma       float - RL discount factor
    trajs       a list of demonstrations
    lr          float - learning rate
    n_iters     int - number of optimization steps

  returns
    rewards     Nx1 vector - recovered state rewards
  """
    MRATE_THRESHOLD = 0.9
    exp_count = 0

    SLIDESIZE = 100

    # state number
    state_num = np.array([i + 1 for i in range(H * W)])
    statecount = np.zeros(H * W, dtype=float)

    # init parameters
    theta = np.random.uniform(size=(feat_map.shape[1], ))

    # calc feature expectations
    feat_exp = np.zeros([feat_map.shape[1]])

    for episode in trajs:
        for step in episode:
            feat_exp += feat_map[step.cur_state, :]
    feat_exp = feat_exp / len(trajs)

    check_opt_traj = []

    update_time = []

    data_stepsize = []

    #case2

    exp_traj = [
        0, 1, 2, 3, 4, 5, 6, 13, 20, 19, 18, 17, 24, 31, 38, 45, 46, 47, 48
    ]
    e_traj = [
        0, 1, 2, 3, 4, 5, 6, 13, 20, 19, 18, 17, 24, 31, 38, 45, 46, 47, 48
    ]

    #case1
    """
  exp_traj = [0,7,14,15,22,29,28,35,42,43,44,45,46,47,48]
  e_traj = [0,7,14,15,22,29,28,35,42,43,44,45,46,47,48]
  """

    #case3
    """
  exp_traj = [0,1,8,15,22,31,32,33,40,41,48]
  e_traj = [0,1,8,15,22,31,32,33,40,41,48]
  """
    #case3 new
    """
  exp_traj = [0,1,8,15,22,23, 24, 31,32,33,40,41,48]
  e_traj = [0,1,8,15,22,23,24,31,32,33,40,41,48]
  """

    #case4
    """
  exp_traj = [0,1,8,15,14,21,28,29,30,31,32,33,40,41,48]
  e_traj =[0,1,8,15,14,21,28,29,30,31,32,33,40,41,48]
  """

    #exp3
    """
  exp_traj = [0,1,2,3,4,5,6,13,20,27,26,25,24,31,38,45,46,47,48]
  e_traj = [0,1,2,3,4,5,6,13,20,27,26,25,24,31,38,45,46,47,48]
  """

    #exp3 case1
    """
  exp_traj = [0,1,2,3,4,5,6,13,12,11,10,9,16,23,30,37,44,45,46,47,48]
  e_traj = [0,1,2,3,4,5,6,13,12,11,10,9,16,23,30,37,44,45,46,47,48]
  """

    #exp3 case2
    """
  exp_traj=[0,1,8,15,14,21,28,29,30,31,24,17, 10, 11, 12, 19,26,33,40,47,48]
  e_traj=[0,1,8,15,14,21,28,29,30,31,24,17, 10, 11, 12, 19,26,33,40,47,48]
  """

    #exp3 case2 new
    """
  exp_traj=[0,1,8,15,14,21,28,29,30,31,24,17, 10, 11,12, 13, 20 ,27,34,41,48]
  e_traj=[0,1,8,15,14,21,28,29,30,31,24,17,10,11,12,13,20,27,34,41,48]
  """

    select_candidate = []

    maxstate = 0
    overstate = []

    # training
    for iteration in tqdm(range(n_iters)):

        #print 'iteration: {}/{}'.format(iteration, n_iters)
        # compute reward function
        if (iteration == 0):
            rewards = np.dot(feat_map, theta)

        # compute policy
        _, policy = value_iteration.value_iteration(P_a,
                                                    rewards,
                                                    gamma,
                                                    error=0.01,
                                                    deterministic=False)
        _, true_policy = value_iteration.value_iteration(P_a,
                                                         rewards,
                                                         gamma,
                                                         error=0.01,
                                                         deterministic=True)
        """
    if (iteration % 150 == 0):
      plt.figure(figsize=(20,20))
      img_utils.heatmap2d(np.reshape(rewards, (H,W), order='F'), 'Reward Map', block=False)
      plt.plot()
      now = datetime.datetime.now()
      figname = "results/reward/rewards_{0:%m%d%H%M%S}".format(now) + ".png"
      plt.savefig(figname)

      plt.figure(figsize=(20,20))
      img_utils.heatmap2d(np.reshape(value, (H, W), order='F'), 'Value Map', block=False)
      plt.plot()
      figname1 = "results/value/value_{0:%m%d%H%M%S}".format(now) + ".png"
      plt.savefig(figname1)
      #plt.show()
    """

        # compute new trajectory

        #new_trajs = generate_newtrajs(gw, true_policy, n_trajs=100, len_traj=30, rand_start=False)
        #opt_traj = get_optimaltrajectory(true_policy,7,7,20)

        #if terminal == 48
        candidate = get_trajectory_egreedy(true_policy, 7, 7, 20)

        re_candidate = sorted(set(candidate), key=candidate.index)

        e_traj = copy.deepcopy(re_candidate)

        if exp_traj[-2] in re_candidate:
            select_candidate = copy.deepcopy(re_candidate)
        print " "
        print "candidate   ", candidate
        print "re_candidate", re_candidate
        print "exp_traj    ", exp_traj

        #compare epsilon-greedy trajectory
        m_rate = match_rate('simple', e_traj, exp_traj)

        print(m_rate)

        #m_threshold = tune_rate(iteration, n_iters, MRATE_THRESHOLD, update_time)
        #print ("m_threshold", m_threshold)

        if ((len(exp_traj) > len(e_traj)) and ((check_opt_traj != e_traj))
                and (m_rate >= MRATE_THRESHOLD) and ((48 in e_traj))):
            trajs = make_traj(20, e_traj)
            exp_count += 1
            feat_exp = np.zeros([feat_map.shape[1]])
            for episode in trajs:
                for step in episode:
                    feat_exp += feat_map[step.cur_state, :]
            feat_exp = feat_exp / len(trajs)
            check_opt_traj = e_traj
            exp_traj = e_traj
            update_time.append(iteration)
        """
    if(iteration == 100):
      trajs = tj.exp1_case3_correct()
      exp_count += 1
      feat_exp = np.zeros([feat_map.shape[1]])
      for episode in trajs:
        for step in episode:
          feat_exp += feat_map[step.cur_state,:]
      feat_exp = feat_exp/len(trajs)
      check_opt_traj= [0,1,8,15,16,23,24,31,32,33,40,41,48]
      exp_traj =  [0,1,8,15,16,23,24,31,32,33,40,41,48]
      update_time.append(iteration)
    """

        #print "exp_traj    ", exp_traj
        print "update_time", update_time

        data_stepsize.append(len(check_opt_traj))
        '''
    with open('results/step_size.csv', 'a')  as f:
      f.write(str(iteration))
      f.write(",")
      f.write(str(data_stepsize[iteration]))
      f.write('\n')
      f.close
    '''
        '''
    with open('results/expert.csv', 'a')  as f:
      f.write(str(iteration))
      f.write(",")
      for state in exp_traj:
        f.write(str(state))
        f.write(",")
      f.write('\n')
      f.close
    '''
        '''
    with open('results/candidate.csv', 'a')  as f:
      f.write(str(iteration))
      f.write(",")
      for state in candidate:
        f.write(str(state))
        f.write(",")
      f.write('\n')
      f.close
    '''
        '''
    with open('results/re_candidate.csv', 'a')  as f:
      f.write(str(iteration))
      f.write(",")
      for state in re_candidate:
        f.write(str(state))
        f.write(",")
      f.write('\n')
      f.close
    '''
        '''
    with open('results/select_candidate.csv', 'a')  as f:
      f.write(str(iteration))
      f.write(",")
      for state in select_candidate:
        f.write(str(state))
        f.write(",")
      f.write('\n')
      f.close
    '''

        # compute state visitation frequencies
        svf = compute_state_visition_freq(P_a,
                                          gamma,
                                          trajs,
                                          policy,
                                          deterministic=False)

        # compute gradients
        grad = feat_exp - feat_map.T.dot(svf)

        # update params
        theta += lr * grad

        rewards = np.dot(feat_map, theta)

        rewards = normalize(rewards)

        for t in range(len(candidate)):
            statecount[candidate[t]] += 1

        if (iteration % SLIDESIZE == 0 and iteration != 0):
            fig = plt.figure()
            left = state_num
            height = statecount
            # minmax_h = min_max(height)
            plt.bar(left, height, color="#FF5B70")
            plt.title("iteration{0}".format(iteration))
            plt.savefig('results/statecount{0}'.format(iteration) + '.png')
            print(height)

            overstate = theta
            for i in range(len(height)):
                if (height[i] > SLIDESIZE + 50):
                    overstate[i] = 0.0
                    print "overstate{0}".format(i)
            statecount = np.zeros(H * W, dtype=int)

        if (iteration > SLIDESIZE):
            theta = overstate

    rewards = np.dot(feat_map, theta)
    #return rewards
    return normalize(rewards)
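Finally, the normalize helper that many of these examples apply to the recovered rewards is not shown; a minimal sketch assuming simple min-max scaling to [0, 1]:

import numpy as np

def normalize(vals):
    # min-max scale a reward vector to [0, 1]
    vals = np.asarray(vals, dtype=float)
    return (vals - vals.min()) / (vals.max() - vals.min())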