Code example #1
File: gym_test.py Project: zacrash/Inverse-RL
def main():
    # Init Gym
    env = gym.make(ARGS.environment)
    OBS_S = env.observation_space.shape

    trajs = play(env)

    # use the initial observation as the feature map
    feat_map_np = env.reset()
    #feat_map_np = voxelize(feat_map_np)
    feat_map = torch.tensor(feat_map_np, dtype=torch.float)
    P_a = np.ones((210, 160, 3))

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
Code example #2
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
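
Most gridworld examples on this page assume module-level constants (H, W, GAMMA, ACT_RAND, R_MAX, N_TRAJS, L_TRAJ, RAND_START, LEARNING_RATE, N_ITERS) that are defined elsewhere in each project and not shown here. The block below is a minimal, hypothetical configuration sketch so a snippet can be tried in isolation; the values are illustrative only, loosely following the argparse defaults visible in code example #21, and are not the settings of any particular repository.

# Hypothetical configuration assumed by the gridworld snippets (illustrative values only)
H, W = 5, 5            # gridworld height and width
GAMMA = 0.9            # discount factor
ACT_RAND = 0.3         # probability of acting randomly
R_MAX = 1.0            # reward placed at the goal cell(s)
N_TRAJS = 100          # number of expert trajectories
L_TRAJ = 20            # length of each expert trajectory
RAND_START = True      # sample random start states for demonstrations
LEARNING_RATE = 0.01   # gradient step size for (deep) MaxEnt IRL
N_ITERS = 20           # number of IRL training iterations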
Code example #3
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    feat_map_torch = torch.tensor(feat_map, dtype=torch.float)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map_torch, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)
    #rewards = rewards.detach().numpy()
    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
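
The commented-out line in the example above suggests that deep_maxent_irl, when given a torch tensor feature map, may return a torch tensor, while value_iteration.value_iteration and np.reshape below it expect a NumPy array. If that holds in your setup, a conversion helper along these lines would be needed right after training; to_numpy is a hypothetical name introduced here, not part of the project.

import numpy as np
import torch


def to_numpy(rewards):
    """Convert recovered rewards to NumPy if deep_maxent_irl returned a torch tensor."""
    if isinstance(rewards, torch.Tensor):
        # drop gradients and move off any accelerator before converting
        return rewards.detach().cpu().numpy()
    return np.asarray(rewards)

With such a helper, the value-iteration call above would take to_numpy(rewards) instead of rewards.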
Code example #4
def main():
    for seed in range(1):
        N_STATES = H * W
        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        #goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX
        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # other two features. due to the linear nature,
        # the following two features might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)
        np.random.seed(0)
        #trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

        trajs = mod.exp1_case2()
        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                             N_ITERS)

        #np.savetxt('results/rewards.txt', rewards)

        #values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                            'Reward Map',
                            block=False)
        plt.plot()
        #now = datetime.datetime.now()
        #figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
Code example #5
def test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map):
  # the comparison plots below need all four reward estimates, so train each IRL variant
  print('LP IRL training ..')
  rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
  print('Max Ent IRL training ..')
  rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE*2, N_ITERS*2)
  print('Deep Max Ent IRL training ..')
  rewards_fc = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
  print('Deep Policy Max Ent IRL training ..')
  rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
  values, _ = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)

  # plots
  plt.figure(figsize=(20,8))
  plt.subplot(1, 5, 1)
  img_utils.heatmap2d(to_plot(rmap_gt), 'Rewards Map - Ground Truth', block=False, text=False)
  plt.subplot(1, 5, 2)
  img_utils.heatmap2d(to_plot(rewards_lpirl), 'Reward Map - LP', block=False, text=False)
  plt.subplot(1, 5, 3)
  img_utils.heatmap2d(to_plot(rewards_maxent), 'Reward Map - Maxent', block=False, text=False)
  plt.subplot(1, 5, 4)
  img_utils.heatmap2d(to_plot(rewards_fc), 'Reward Map - Deep Maxent', block=False, text=False)
  plt.subplot(1, 5, 5)
  img_utils.heatmap2d(to_plot(rewards), 'Reward Siamese Map - Deep Maxent', block=False, text=False)
  plt.show()
Code example #6
    def save_plt(self, name, figsize, rewards, values, policy):
        plt.figure(figsize=figsize)
        plt.subplot(1, 3, 1)
        img_utils.heatmap2d(np.reshape(rewards, (self._h, self._w), order='F'),
                            'Rewards Map',
                            block=False)
        plt.subplot(1, 3, 2)
        img_utils.heatmap2d(np.reshape(values, (self._h, self._w), order='F'),
                            'Value Map',
                            block=False)
        plt.subplot(1, 3, 3)
        img_utils.heatmap2d(np.reshape(policy, (self._h, self._w), order='F'),
                            'Policy Map',
                            block=False)
        plt.savefig(self._exp_result_path + "/" + name + ".png")
        plt.close()
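
img_utils is a project-specific module, not a package installable from PyPI. Every call on this page follows the same pattern, heatmap2d(2-D array, title, block=..., text=...), plus an occasional heatmap3d. The helper below is a minimal matplotlib stand-in that matches that calling convention; it is an assumption for experimentation, not the original implementation.

import matplotlib.pyplot as plt


def heatmap2d(hm_mat, title='', block=True, text=True):
    """Minimal stand-in for img_utils.heatmap2d: draw a 2-D array as a heatmap."""
    assert hm_mat.ndim == 2, 'heatmap2d expects a 2-D array'
    plt.imshow(hm_mat, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    if text:
        # annotate each cell with its value
        for y in range(hm_mat.shape[0]):
            for x in range(hm_mat.shape[1]):
                plt.text(x, y, '%.1f' % hm_mat[y, x],
                         horizontalalignment='center',
                         verticalalignment='center')
    if block:
        plt.show()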
Code example #7
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    print(ow.objects.keys())
    rewards_gt = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy_gt = find_policy(ow.n_states,
                            ow.n_actions,
                            ow.transition_probability,
                            rewards_gt,
                            ow.discount,
                            stochastic=False)
    trajs = ow.generate_trajectories(N_TRAJS, L_TRAJ, lambda s: policy_gt[s])
    feat_map = ow.feature_matrix(ow.objects, discrete=False)

    rewards_inv = np.array(
        [ow.inverse_reward(s_inv) for s_inv in range(ow.n_states)])
    policy_inv = find_inverted_policy(ow.n_states,
                                      ow.n_actions,
                                      ow.transition_probability,
                                      rewards_inv,
                                      ow.discount,
                                      stochastic=False)
    trajs_inv = ow.generate_inverse_trajectories(
        N_TRAJS, L_TRAJ, lambda s_inv: policy_inv[s_inv])
    feat_map_inv = ow.inv_feature_matrix(ow.inverted_objects, discrete=False)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(ow.transition_probability,
                           policy_gt,
                           gamma=0.3,
                           l1=10,
                           R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                trajs, LEARNING_RATE * 2, N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                   trajs, LEARNING_RATE, N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, feat_map_inv,
                                      ow.transition_probability, GAMMA, trajs,
                                      trajs_inv, LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Siamese Maxent',
                        block=False)
    plt.show()
Code example #8
    def ada_irl_update(self):
        for episode in range(self.Episode):
            # initial observation
            observation = self.env.reset()
            # action = self.RL.choose_action(observation)
            states = observation
            eq_r = 0
            eq_step = 0
            while eq_step < len(self.IRL.expert) * 3:
                # print eq_step
                # fresh env
                self.env.render()

                action = self.RL.choose_action(observation)

                observation_, reward, done = self.env.step(action)

                reward = self.IRL.reward(observation_, )

                self.RL.learn(observation, action, reward, observation_)

                observation = observation_
                # action = action_
                states = np.vstack((states, observation))
                eq_r += reward
                eq_step += 1

                if done:
                    break
                # print states
            self.IRL.learn(states)
            print "Episode %d | Reward" % episode, eq_r
            # print self.IRL.reward_weight

        print('Game Over')
        print(self.RL.q_table)
        reward_weight = self.IRL.reward_weight.reshape(
            [self.env.col, self.env.row])
        expert_reward = self.IRL.reward_weight[self.expert]
        print(reward_weight)

        actual_reward = np.zeros_like(reward_weight)
        actual_reward[5, 5] = 10
        actual_reward[6, 1] = -5
        actual_reward[1, 6] = -5
        actual_reward[4, 7] = -5
        actual_reward[7, 4] = -5
        print(actual_reward)

        plt.figure(figsize=(25, 10))
        plt.subplot(1, 2, 1)
        img_utils.heatmap2d(actual_reward,
                            'Reward MAP - Ground Truth',
                            block=False)
        plt.subplot(1, 2, 2)
        img_utils.heatmap2d(reward_weight, 'Reward MAP - ddlGAN', block=False)
        plt.show()

        img_utils.heatmap3d(reward_weight, 'Reward MAP - ddlGAN')
        plt.show()

        print(expert_reward)
        x = np.arange(len(self.IRL.expert))
        plt.plot(x, expert_reward, 'r-', lw=5)
        plt.show()
Code example #9
# fragment: excerpt from a longer script; the enclosing loop over the
# trajectory index i is not shown in the source
    for s1 in range(N_STATES):
        s = int(traj[i - 1])
        action = int(act[s])
        if P_a[s][action][s1] == 1:
            traj[i] = s1

for j in range(0, TRAJ_LEN, 1):
    x, y = int_to_point(traj[j])
    x = int(x)
    y = int(y)
    traj_demo[y][x] = 1

for i in range(H):
    for j in range(W):
        if r[i+j] > 20:
            r[i+j] = 20
        r[i+j] = 0
'''
plt.figure(figsize=(H, W))
img_utils.heatmap2d(np.reshape(r, (H, W)),
                    'Reward Map',
                    block=False,
                    text=False)
plt.show()
'''
plt.imshow(traj_demo)
plt.show()
plt.figure(figsize=(H, W))
img_utils.heatmap2d(np.reshape(value, (H, W)), 'Value', block=False, text=False)
plt.show()
Code example #10
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards

    #"""
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    ACT_RAND = 0

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    """
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration(P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)
    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {}, 1 - ACT_RAND)
    """ #
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration(P_a,
                                           rewards_gt,
                                           GAMMA,
                                           error=0.01,
                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    np.random.seed(1)

    eg = []
    tg = []
    unseen = []
    for i in range(59, 60):
        print("i = {}".format(i + 1))
        N_TRAJS = 100
        L_TRAJ = (i + 1)
        trajs = generate_demonstrations(gw,
                                        policy_gt,
                                        n_trajs=N_TRAJS,
                                        len_traj=L_TRAJ,
                                        rand_start=RAND_START)
        #rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
        rewards_ent, policy_ent, n_unseen = maxent_irl_ent(feat_map,
                                                           P_a,
                                                           GAMMA,
                                                           trajs,
                                                           LEARNING_RATE,
                                                           N_ITERS,
                                                           deterministic=True)
        value_ent = GetValue(policy_ent,
                             P_a,
                             rewards_gt,
                             GAMMA,
                             deterministic=True)
        _, policy_theta = value_iteration(P_a,
                                          rewards_ent,
                                          GAMMA,
                                          error=0.01,
                                          deterministic=True)
        value_theta = GetValue(policy_theta,
                               P_a,
                               rewards_gt,
                               GAMMA,
                               deterministic=True)
        eg.append(
            np.linalg.norm(value_ent - values_gt) / np.linalg.norm(values_gt))
        tg.append(
            np.linalg.norm(value_theta - values_gt) /
            np.linalg.norm(values_gt))
        unseen.append(n_unseen)

    unseen = np.array(unseen)
    plt.figure(1)
    plt.plot(eg, marker='.')
    plt.plot(tg, marker='.')
    #plt.plot((unseen / max(unseen)), marker='.')
    plt.grid(True)
    plt.ylabel('||Vgt - V||2/||Vgt||2')
    plt.xlabel('length_expert_demos')
    plt.legend(['V = Vent', 'V = Vtheta'])  #, '#unseen'])
    plt.show()

    plt.figure(2)
    plt.plot(values_gt, marker='.')
    plt.plot(value_theta, marker='.')
    plt.plot(value_ent, marker='.')
    plt.grid(True)
    plt.ylabel('V')
    plt.xlabel('s')
    plt.legend(['Vgt', 'Vtheta', 'Vent'])
    plt.show()

    # plots
    plt.figure(figsize=(25, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_ent, (H, W), order='F'),
                        'Reward_Ent - Recovered',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(value_ent, (H, W), order='F'),
                        'Value Ent - Recovered',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(value_theta, (H, W), order='F'),
                        'Value Theta - Recovered',
                        block=False)
    plt.show()
Code example #11
File: linear_irl_gridworld.py Project: ufgtb24/IRL
def main():
    """
  Recover gridworld reward using linear programming IRL
  """

    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    grid = [['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
            ['0', '0', '0', '0', '0', '0', '0', '0', '0',
             str(R_MAX)]]

    gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # solve the MDP using value iteration
    vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100)

    r_mat = gw.get_reward_mat()
    print('show rewards map. any key to continue')
    img_utils.heatmap2d(r_mat, 'Reward Map - Ground Truth')

    v_mat = gw.get_values_mat(vi.get_values())
    print('show values map. any key to continue')
    img_utils.heatmap2d(v_mat, 'Value Map - Ground Truth')

    # Construct transition matrix
    P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS))

    for si in range(N_STATES):
        statei = gw.idx2pos(si)
        for a in range(N_ACTIONS):
            probs = gw.get_transition_states_and_probs(statei, a)
            for statej, prob in probs:
                sj = gw.pos2idx(statej)
                # Prob of si to sj given action a
                P_a[si, sj, a] = prob

    # display policy and value in gridworld just for debug use
    gw.display_policy_grid(vi.get_optimal_policy())
    gw.display_value_grid(vi.values)

    # setup policy
    policy = np.zeros(N_STATES)
    for i in range(N_STATES):
        policy[i] = vi.get_action(gw.idx2pos(i))

    # solve for the rewards
    rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)

    # display recovered rewards
    print('show recovered rewards map. any key to continue')
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered')
    img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered')
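
Code examples #11 and #19 both build the dense transition tensor P_a[si, sj, a] with the same triple loop over states and actions. A sketch of that construction factored into a reusable helper is shown below; build_transition_tensor is a name introduced here, and the sketch relies only on the GridWorld methods already used in those examples (idx2pos, pos2idx, get_transition_states_and_probs).

import numpy as np


def build_transition_tensor(gw, n_states, n_actions):
    """Dense P_a[si, sj, a] = P(sj | si, a), built from the gridworld's
    transition function as in the LP IRL examples on this page."""
    P_a = np.zeros((n_states, n_states, n_actions))
    for si in range(n_states):
        statei = gw.idx2pos(si)
        for a in range(n_actions):
            for statej, prob in gw.get_transition_states_and_probs(statei, a):
                P_a[si, gw.pos2idx(statej), a] = prob
    return P_a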
Code example #12
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    rmap_gt = gw.get_reward_mat()

    #temp
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)
    plt.show()
Code example #13
    if is_plt:
        if is_v:
            # gridworld
            '''single or multiple destination'''
            terminal_single = traj[len(traj)-1]
            terminal_single2list = []
            terminal_single2list.append(terminal_single)
            value, policy = vi.value_iteration(SHAPE, r, discount, terminal)
            # value = np.exp(value)
            '''single or multiple start point'''
            mu_exp = compute_state_visitation_freq(SHAPE, traj, policy)
            #mu_exp = compute_state_visitation_freq_multiple_starts(SHAPE, traj, start, policy)

            plt.subplot(2, 4, 5)
            img_utils.heatmap2d(np.reshape(mu_exp, (H, W)), 'Expected SVF', block=False, text=False)

            plt.subplot(2, 4, 6)
            img_utils.heatmap2d(np.reshape(value, (H, W)), 'Value', block=False, text=False)

            plt.subplot(2, 4, 4)
            alpha = 0.8
            integrated_weighted = alpha * 800 * np.reshape(mu_exp, [H, W]) + (1 - alpha) * np.reshape(r, (H, W))
            img_utils.heatmap2d(integrated_weighted, 'Integrated map', block=False, text=False)

        plt.subplot(2, 4, 1)
        img_utils.heatmap2d(np.reshape(ref, [H, W]), 'Expert SVF', block=False, text=False)

        plt.subplot(2, 4, 2)
        img_utils.heatmap2d(traj_svf, 'Trajectory SVF', block=False, text=False)
Code example #14
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (
        pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
        pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)

    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))

    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))

    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))

    feat_map = np.hstack((coast_map, forest_map, land_map))

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)

            trajs[x].append(
                Step(cur_state=int(gw.pos2idx(loc)),
                     action=action,
                     next_state=int(gw.pos2idx(next_loc)),
                     reward=reward,
                     done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                N_ITERS)
    #   print 'Deep Max Ent IRL training ..'
    #   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    fig.savefig('MaxEnt.png')
Code example #15
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # transition probabilities: for each of the 5 actions, the probability of
    # moving from state s1 to s2 given that action.
    # Getting these transition probabilities is just impossible in my case ...
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True
    )  # value iteration: values and policy according to the current rewards

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)  #features as one hot encoding

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(
        gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ,
        rand_start=RAND_START)  # expert demonstration trajectories

    rewards = maxent_irl(
        feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS
    )  # inputs: the feature map and the transition probabilities of the world

    pdb.set_trace()

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
Code example #16
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                   N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Policy Maxent',
                        block=False)
    plt.show()
Code example #17
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    """while True:
      print "BAD_STATE入力"
      bad = raw_input('>> ')
      if bad == 'ok':
          break
      Bad_states.append(bad)
  """

    #print Bad_states
    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y)

    np.savetxt('results/rewards.txt', rewards)

    #print rewards

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    #print policy
    # plots
    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map',
                        block=False)
    plt.plot()

    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Policy Map',
                        block=False)
    plt.plot()
    plt.show()
Code example #18
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    rmap_gt[0, W - 1] = R_MAX
    rmap_gt[H - 1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    if ACT_RAND == 0:
        P_a = gw.get_transition_mat_deterministic()
    else:
        P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    #feat_map = np.eye(N_STATES)
    # feat_map = np.zeros(N_STATES).reshape((H, W))
    feat_map = np.random.rand(N_STATES).reshape((H, W))
    #feat_map = np.arange(N_STATES).reshape((H, W))
    if ARGS.conv:
        #feat_map[H-1, W-1] = -5
        #feat_map[0, W-1] = -5
        #feat_map[H-1, 0] = -5
        pass
    else:
        feat_map = feat_map.reshape(N_STATES)
    #feat_map = rmap_gt

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    t = time.time()
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS, ARGS.conv, ARGS.sparse)
    print('time for dirl', time.time() - t)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    print(
        'evd',
        value_iteration.expected_value_diff(P_a, rewards_gt, GAMMA,
                                            start_state_probs(trajs, N_STATES),
                                            values_gt, policy))

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
Code example #19
def main():
    """
  Recover gridworld reward using linear programming IRL
  """

    H = 10
    W = 10
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld including the reward
    grid = [
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['-1', '-1', '-1', '-1', '-1', '0', '0', '-1', '-1', '-1'],
        ## ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '-1', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '-1', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '-1', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
        ['0', '0', '0', '0', '0', '0', '0', '0', '0',
         str(R_MAX)]
    ]

    # custom
    for i, row in enumerate(grid):
        for j, e in enumerate(row):
            if e == '0':
                grid[i][j] = '-1'
            elif e == '-1':
                grid[i][j] = '-10'

    # grid, terminal state, trans_prob
    gw = gridworld.GridWorld(grid, {(H - 1, W - 1)}, 1 - ACT_RAND)

    # solve the MDP using value iteration
    vi = value_iteration.ValueIterationAgent(gw, GAMMA, 100)
    r_mat_gt = gw.get_reward_mat()
    v_mat_gt = gw.get_values_mat(vi.get_values())

    # Construct transition matrix
    P_a = np.zeros((N_STATES, N_STATES, N_ACTIONS))

    for si in range(N_STATES):
        statei = gw.idx2pos(si)
        for a in range(N_ACTIONS):
            probs = gw.get_transition_states_and_probs(statei, a)
            for statej, prob in probs:
                sj = gw.pos2idx(statej)
                # Prob of si to sj given action a
                P_a[si, sj, a] = prob

    # display policy and value in gridworld just for debug use
    gw.display_policy_grid(vi.get_optimal_policy())
    gw.display_value_grid(vi.values)

    # display a path following optimal policy
    ## print 'show optimal path. any key to continue'
    path_gt = gw.display_path_grid(vi.get_optimal_policy())
    ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path')
    ## sys.exit()

    # setup policy
    policy = np.zeros(N_STATES)
    for i in range(N_STATES):
        policy[i] = vi.get_action(gw.idx2pos(i))

    #------------------ After getting optimal policy through iterations ------------------
    # solve for the rewards
    rewards = lp_irl(P_a, policy, gamma=GAMMA, l1=L1, R_max=R_MAX)
    r_mat = np.reshape(rewards, (H, W), order='F')
    v_mat = gw.get_values_mat(vi.get_values())
    path = gw.display_path_grid(vi.get_optimal_policy())

    # display recovered rewards
    print('show recovered rewards map. any key to continue')
    ## img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')
    #img_utils.heatmap3d(np.reshape(rewards, (H, W), order='F'), 'Reward Map - Recovered')

    # display a path following optimal policy
    print('show optimal path. any key to continue')
    ## path = gw.display_path_grid(vi.get_optimal_policy())
    ## img_utils.heatmap2d(np.reshape(path, (H, W), order='F'), 'Path')

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(r_mat_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(v_mat_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(r_mat, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(v_mat, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)

    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)

    plt.show()
Code example #20
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards2()

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    # use identity matrix as feature
    ## feat_map = np.eye(N_STATES)
    feat_map = feature_histogram(gw)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)

    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)

    plt.show()
Code example #21
def main():

	# named tuple to record demonstrations
	Step = namedtuple('Step','cur_state action next_state reward done')

	# argument parser for command line arguments
	parser = argparse.ArgumentParser(description=None)

	parser.add_argument('-wid', '--width', default=5, type=int, 
						help='width of the gridworld')
	parser.add_argument('-hei', '--height', default=5, type=int, 
						help='height of the gridworld')
	parser.add_argument('-lr', '--learning_rate', default=0.01, type=float, 
						help='learning rate')
	parser.add_argument('-l', '--l_traj', default=20, type=int, 
						help='length of expert trajectory')

	parser.add_argument('--no-rand_start', dest='rand_start', action='store_false', 
						help='when sampling trajectories, fix start positions')
	parser.add_argument('--rand_start', dest='rand_start', action='store_true', 
						help='when sampling trajectories, randomly pick start positions')
	parser.add_argument('--approx', dest='approx', action='store_true', 
						help='flag to perform approximation of psa')

	parser.add_argument('-g', '--gamma', default=0.9, type=float, 
						help='discount factor')
	parser.add_argument('-n', '--n_iters', default=20, type=int, 
						help='number of iterations')
	parser.add_argument('-t', '--n_trajs', default=100, type=int, 
						help='number of expert trajectories')
	parser.add_argument('-a', '--act_random', default=0.3, type=float, 
						help='probability of acting randomly')
	
	# set default value for rand_start variable
	parser.set_defaults(rand_start=False)

	# parse and print arguments
	args = parser.parse_args()

	# arguments for environment and irl algorithm
	r_max = 1 
	gamma = args.gamma
	width = args.width
	height = args.height
	l_traj = args.l_traj
	approx = args.approx
	n_iters = args.n_iters
	n_trajs = args.n_trajs
	act_rand = args.act_random
	rand_start = args.rand_start
	learning_rate = args.learning_rate

	# variables for number of actions and states
	n_actions = 5
	n_states = height * width

	# initialize the gridworld
	# rmap_gt is the ground truth for rewards
	rmap_gt = np.zeros([height, width])

	rmap_gt[0, width-1] = r_max
	rmap_gt[height-1, 0] = r_max
	rmap_gt[height-1, width-1] = r_max

	# create grid world instance
	gw = gridworld.GridWorld(rmap_gt, {}, 1-act_rand)

	# get true rewards, state transition dynamics
	rewards_gt = np.reshape(rmap_gt, height*width, order='F')
	P_a_true = gw.get_transition_mat()

	trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

	# get approximation of state transition dynamics
	P_a_approx = np.zeros((n_states, n_states, n_actions))
	for traj in trajs:
		for t in range(len(traj)):
			P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1

	for s in range(n_states):
		for a in range(n_actions):
			if np.sum(P_a_approx[s,:,a]) != 0:
				P_a_approx[s,:,a] /= np.sum(P_a_approx[s,:,a])

	if approx:
		P_a = P_a_approx
	else:
		P_a = P_a_true

	# get true value function and policy from reward map
	values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, gamma, error=0.01, deterministic=True)

	# use identity matrix as feature
	feat_map = np.eye(n_states)

	# other two features. due to the linear nature, 
	# the following two features might not work as well as the identity.
	# feat_map = feature_basis(gw)
	# feat_map = feature_coord(gw)

	trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs, len_traj=l_traj, 
									rand_start=rand_start)

	# perform inverse reinforcement learning to get reward function
	rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
	values, _ = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

	# plots
	plt.figure(figsize=(20,4))
	plt.subplot(2, 2, 1)
	img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
	plt.subplot(2, 2, 2)
	img_utils.heatmap2d(np.reshape(values_gt, (height,width), order='F'), 'Value Map - Ground Truth', block=False)
	plt.subplot(2, 2, 3)
	img_utils.heatmap2d(np.reshape(rewards, (height,width), order='F'), 'Reward Map - Recovered', block=False)
	plt.subplot(2, 2, 4)
	img_utils.heatmap2d(np.reshape(values, (height,width), order='F'), 'Value Map - Recovered', block=False)
	plt.show()

	# plots for state transition dynamics
	plt.figure(figsize=(10,4))
	plt.subplot(2, 1, 1)
	img_utils.heatmap2d(np.reshape(P_a_true[10,:,2], (height,width), order='F'), 'True Dist', block=False)
	plt.subplot(2, 1, 2)
	img_utils.heatmap2d(np.reshape(P_a_approx[10,:,2], (height,width), order='F'), 'Approx Dist', block=False)
	plt.show()
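
Code example #21 estimates the transition dynamics from sampled trajectories by counting (cur_state, action, next_state) triples and normalizing each (state, action) slice. The same count-and-normalize step, factored into a helper under the same Step record assumption, might look like the sketch below; estimate_transitions is a name introduced here for illustration.

import numpy as np


def estimate_transitions(trajs, n_states, n_actions):
    """Empirical P_a[s, s', a] from trajectories of Step(cur_state, action,
    next_state, ...) records; (s, a) pairs never observed keep all-zero rows."""
    P_a = np.zeros((n_states, n_states, n_actions))
    for traj in trajs:
        for step in traj:
            # count how often each transition was observed
            P_a[step.cur_state, step.next_state, step.action] += 1
    for s in range(n_states):
        for a in range(n_actions):
            total = P_a[s, :, a].sum()
            if total > 0:
                # normalize counts into a probability distribution over next states
                P_a[s, :, a] /= total
    return P_a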