def train(env, episode_count=1000):
    state_now = env.reset()
    agent = vg.ValueIterationAgent(env)
    ## values, _  = agent.value_iteration(error=0.01, deterministic=False)
    agent.load()

    # plot test!
    plot(env, agent)

    # generate demonstrations
    trajs = generate_demonstrations(env, agent, n_trajs=100, len_traj=100,
                                    rand_start=True)

    # feature selection
    feat_map = feature_basis(env)

    # run irl
    T = agent.get_transition_mat()
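    # maxent_irl (as in the later examples) consumes transitions indexed as
    # [state, next_state, action]; agent.get_transition_mat() presumably returns
    # [state, action, next_state], hence the axis swap below.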
    T = np.swapaxes(T, 1, 2)
    rewards = maxent_irl(np.array(feat_map), T, gamma=0.95, trajs=trajs, lr=0.01, n_iters=20)
    print(rewards)

    # value iteration

    agent.save()
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)
    print(feat_map.shape)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
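    # (With the identity feature map, MaxEnt IRL can learn an independent reward
    # weight per state; with only coordinate/basis features, the recovered reward
    # is constrained to be linear in those features and cannot sharply isolate a
    # single goal cell.)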
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
def main():
    for seed in range(1):
        N_STATES = H * W
        # init the gridworld
        # rmap_gt is the ground truth for rewards
        rmap_gt = np.zeros([H, W])
        #goal coordinates
        rmap_gt[H - 1, W - 1] = R_MAX
        gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)
        rewards_gt = np.reshape(rmap_gt, H * W, order='F')
        P_a = gw.get_transition_mat()
        values_gt, policy_gt = value_iteration.value_iteration(
            P_a, rewards_gt, GAMMA, error=0.01, deterministic=True)

        # use identity matrix as feature
        feat_map = np.eye(N_STATES)

        # other two features. due to the linear nature,
        # the following two features might not work as well as the identity.
        # feat_map = feature_basis(gw)
        # feat_map = feature_coord(gw)
        np.random.seed(0)
        #trajs = generate_demonstrations(gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ, rand_start=RAND_START)

        trajs = mod.exp1_case2()
        rewards = maxent_irl(gw, feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                             N_ITERS)

        #np.savetxt('results/rewards.txt', rewards)

        #values, policy = value_iteration.value_iteration(P_a, rewards, GAMMA, error=0.01, deterministic=True)
        # plots
        plt.figure(figsize=(20, 20))
        img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                            'Reward Map',
                            block=False)
        plt.plot()
        #now = datetime.datetime.now()
        #figname = "results/rewards_{0:%m%d%H%M}".format(now) + ".png"
        figname = "results/rewards_seed{0}".format(seed) + ".png"
        plt.savefig(figname)
Example 4
def test_irl_algorithms(gw, P_a, rmap_gt, policy_gt, trajs, feat_map):
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                              N_ITERS)
    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)

    # plots
    plt.figure(figsize=(20, 8))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(to_plot(rmap_gt),
                        'Rewards Map - Ground Truth',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(to_plot(rewards_lpirl),
                        'Reward Map - LP',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(to_plot(rewards_maxent),
                        'Reward Map - Maxent',
                        block=False,
                        text=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(to_plot(rewards),
                        'Reward Map - Deep Maxent',
                        block=False,
                        text=False)
    plt.show()
Example 5
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 2, W - 2] = R_MAX
    rmap_gt[1, 1] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=10, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE * 2,
                                N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                   N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, P_a, GAMMA, trajs,
                                      LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Policy Maxent',
                        block=False)
    plt.show()
Example 6
def main(grid_size, discount, n_objects, n_colours, n_trajectories, epochs,
         learning_rate, structure):

    wind = 0.3
    trajectory_length = 8
    l1 = l2 = 0

    ow = objectworld.Objectworld(grid_size, n_objects, n_colours, wind,
                                 discount)
    print(ow.objects.keys())
    rewards_gt = np.array([ow.reward(s) for s in range(ow.n_states)])
    policy_gt = find_policy(ow.n_states,
                            ow.n_actions,
                            ow.transition_probability,
                            rewards_gt,
                            ow.discount,
                            stochastic=False)
    trajs = ow.generate_trajectories(N_TRAJS, L_TRAJ, lambda s: policy_gt[s])
    feat_map = ow.feature_matrix(ow.objects, discrete=False)

    rewards_inv = np.array(
        [ow.inverse_reward(s_inv) for s_inv in range(ow.n_states)])
    policy_inv = find_inverted_policy(ow.n_states,
                                      ow.n_actions,
                                      ow.transition_probability,
                                      rewards_inv,
                                      ow.discount,
                                      stochastic=False)
    trajs_inv = ow.generate_inverse_trajectories(
        N_TRAJS, L_TRAJ, lambda s_inv: policy_inv[s_inv])
    feat_map_inv = ow.inv_feature_matrix(ow.inverted_objects, discrete=False)
    print('LP IRL training ..')
    rewards_lpirl = lp_irl(ow.transition_probability,
                           policy_gt,
                           gamma=0.3,
                           l1=10,
                           R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                trajs, LEARNING_RATE * 2, N_ITERS * 2)
    print('Deep Max Ent IRL training ..')
    rewards_deep = deep_maxent_irl(feat_map, ow.transition_probability, GAMMA,
                                   trajs, LEARNING_RATE, N_ITERS)
    print('Deep Siamese Max Ent IRL training ..')
    rewards = deep_siamese_maxent_irl(feat_map, feat_map_inv,
                                      ow.transition_probability, GAMMA, trajs,
                                      trajs_inv, LEARNING_RATE, N_ITERS)

    # plots
    plt.figure(figsize=(20, 5))
    plt.subplot(1, 5, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    plt.subplot(1, 5, 2)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    plt.subplot(1, 5, 3)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    plt.subplot(1, 5, 4)
    img_utils.heatmap2d(np.reshape(rewards_deep, (H, W), order='F'),
                        'Reward Map - Deep Maxent',
                        block=False)
    plt.subplot(1, 5, 5)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Deep Siamese Maxent',
                        block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 4

    rmap_gt = set_rewards()
    gw = gridworld.GridWorld(rmap_gt, {(H - 1, W - 1)}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()
    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)
    path_gt = gw.display_path_grid(policy_gt)

    rmap_gt = gw.get_reward_mat()

    #temp
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 3, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 3, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 3, 3)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.show()
    sys.exit()

    # feat_map = np.eye(N_STATES)
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    feat_map = feature_histogram(gw)

    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)
    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)
    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)
    path = gw.display_path_grid(policy)

    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(2, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(2, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.subplot(2, 4, 5)
    img_utils.heatmap2d(np.reshape(path_gt, (H, W), order='F'),
                        'Path Map - Ground Truth',
                        block=False)
    plt.subplot(2, 4, 7)
    img_utils.heatmap2d(np.reshape(path, (H, W), order='F'),
                        'Path Map - Recovered',
                        block=False)
    plt.show()
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    start_coordinates = (pixel_locations[0]['location-lat'][0],
                         pixel_locations[0]['location-long'][0])
    end_coordinates = (
        pixel_locations[0]['location-lat'][len(pixel_locations[0].index) - 1],
        pixel_locations[0]['location-long'][len(pixel_locations[0].index) - 1])

    rmap_gt = np.zeros([W, H])
    rmap_gt[int(start_coordinates[0]), int(start_coordinates[1])] = R_MAX
    rmap_gt[int(end_coordinates[0]), int(end_coordinates[1])] = R_MAX
    # rmap_gt[H/2, W/2] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    rewards_gt = normalize(values_gt)
    gw = gridworld.GridWorld(np.reshape(rewards_gt, (H, W), order='F'), {},
                             1 - ACT_RAND)
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    # feat_map = np.eye(N_STATES)

    coast_map = np.load('Feature Maps/small_maps/coast.npy')
    coast_map = np.reshape(coast_map, (600, 1))

    forest_map = np.load('Feature Maps/small_maps/forest.npy')
    forest_map = np.reshape(forest_map, (600, 1))

    land_map = np.load('Feature Maps/small_maps/land.npy')
    land_map = np.reshape(land_map, (600, 1))

    feat_map = np.hstack((coast_map, forest_map, land_map))
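    # feat_map is (H*W) x 3 here, one column per terrain map (coast, forest, land),
    # assuming the small maps have H*W == 600 cells; this replaces the
    # (H*W) x (H*W) identity features used in the other examples.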

    # populate trajectories
    trajs = []
    terminal_state = end_coordinates
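    # Each trajectory is rebuilt from consecutive pixel locations as Step tuples;
    # Step is assumed to be the namedtuple('Step', 'cur_state action next_state reward done')
    # defined in the final example, with gw.pos2idx mapping a (row, col) position
    # to a flat state index.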
    for x in range(len(pixel_locations)):
        trajs.append([])
        for i in range(len(pixel_locations[x]) - 1):
            loc = pixel_locations[x].iloc[i]
            next_loc = pixel_locations[x].iloc[i + 1]
            action = get_action(loc, next_loc)
            reward = rmap_gt[int(next_loc[0]), int(next_loc[1])]
            is_done = np.array_equal(next_loc, terminal_state)

            trajs[x].append(
                Step(cur_state=int(gw.pos2idx(loc)),
                     action=action,
                     next_state=int(gw.pos2idx(next_loc)),
                     reward=reward,
                     done=is_done))

    print('LP IRL training ..')
    rewards_lpirl = lp_irl(P_a, policy_gt, gamma=0.3, l1=100, R_max=R_MAX)
    print('Max Ent IRL training ..')
    rewards_maxent = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE,
                                N_ITERS)
    #   print('Deep Max Ent IRL training ..')
    #   rewards = deep_maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, 10)

    # plots
    fig = plt.figure()
    plt.subplot(1, 2, 1)
    img_utils.heatmap2d(np.reshape(rewards_gt, (H, W), order='F'),
                        'Rewards Map - Ground Truth',
                        block=False)
    fig.savefig('GroundTruth.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_lpirl, (H, W), order='F'),
                        'Reward Map - LP',
                        block=False)
    fig.savefig('LP.png')
    plt.subplot(1, 1, 1)
    img_utils.heatmap2d(np.reshape(rewards_maxent, (H, W), order='F'),
                        'Reward Map - Maxent',
                        block=False)
    fig.savefig('MaxEnt.png')
def main():
    N_STATES = H * W
    N_ACTIONS = 5
    """while True:
      print "BAD_STATE入力"
      bad = raw_input('>> ')
      if bad == 'ok':
          break
      Bad_states.append(bad)
  """

    #print Bad_states
    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')
    P_a = gw.get_transition_mat()

    values_gt, policy_gt = value_iteration.value_iteration(P_a,
                                                           rewards_gt,
                                                           GAMMA,
                                                           error=0.01,
                                                           deterministic=True)

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(gw,
                                    policy_gt,
                                    n_trajs=N_TRAJS,
                                    len_traj=L_TRAJ,
                                    rand_start=RAND_START)

    rewards = maxent_irl(feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS)

    #new_rewards = reward_decrease(rewards, R_GAMMA, BAD_X, BAD_Y)

    np.savetxt('results/rewards.txt', rewards)

    #print rewards

    values, policy = value_iteration.value_iteration(P_a,
                                                     rewards,
                                                     GAMMA,
                                                     error=0.01,
                                                     deterministic=True)

    #print policy
    # plots
    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map',
                        block=False)
    plt.plot()

    plt.figure(figsize=(20, 20))
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Policy Map',
                        block=False)
    plt.plot()
    plt.show()
Example 10
C.set_terminal_idx()
if rank_features:
    C.convert_to_rankings()

# demonstrations
canonical_user_demo = [canonical_demo]
canonical_trajectories = get_trajectories(C.states, canonical_user_demo,
                                          C.transition)

print("Training ...")

# using abstract features
abstract_features = np.array([C.get_features(state) for state in C.states])
norm_abstract_features = abstract_features / np.linalg.norm(abstract_features,
                                                            axis=0)
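# note: np.linalg.norm(..., axis=0) normalizes each feature column; a feature that
# is zero for every state would make this a divide-by-zero, so non-degenerate
# features are assumed here.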
canonical_rewards_abstract, canonical_weights_abstract = maxent_irl(
    C, norm_abstract_features, canonical_trajectories, optim, init)

print("Weights have been learned for the canonical task! Fingers X-ed.")
print("Weights -", canonical_weights_abstract)

# scale weights
if scale_weights:
    canonical_weights_abstract /= max(canonical_weights_abstract)

# ----------------------------------------- Testing: Predict complex -------------------------------------------- #
sample_complex_demo = [1, 3, 5, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 7]

complex_survey_actions = [0, 4, 1, 5, 6, 7, 2, 3]
action_counts = [1, 1, 4, 1, 4, 1, 4, 1]
preferred_order = [
    df[q][idx] for q in
Example 11
def main():
    N_STATES = H * W
    N_ACTIONS = 5

    # init the gridworld
    # rmap_gt is the ground truth for rewards
    rmap_gt = np.zeros([H, W])
    rmap_gt[H - 1, W - 1] = R_MAX
    # rmap_gt[H-1, 0] = R_MAX

    gw = gridworld.GridWorld(rmap_gt, {}, 1 - ACT_RAND)

    rewards_gt = np.reshape(rmap_gt, H * W, order='F')

    # transition probabilities of the gridworld: for each of the 5 actions,
    # the probability of moving from state s1 to state s2 given that action
    # (getting the true transition probabilities is simply not possible in some settings ...)
    P_a = gw.get_transition_mat()
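    # When the true dynamics are unavailable, one workaround (used in the final
    # example via its --approx flag) is an empirical estimate built by counting
    # (cur_state, next_state, action) transitions over sampled trajectories.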

    values_gt, policy_gt = value_iteration.value_iteration(
        P_a, rewards_gt, GAMMA, error=0.01, deterministic=True
    )  # value iteration and policy according to the current (ground-truth) rewards

    # use identity matrix as feature
    feat_map = np.eye(N_STATES)  # features as one-hot encoding

    # other two features. due to the linear nature,
    # the following two features might not work as well as the identity.
    # feat_map = feature_basis(gw)
    # feat_map = feature_coord(gw)
    np.random.seed(1)
    trajs = generate_demonstrations(
        gw, policy_gt, n_trajs=N_TRAJS, len_traj=L_TRAJ,
        rand_start=RAND_START)  # expert demonstration trajectories

    rewards = maxent_irl(
        feat_map, P_a, GAMMA, trajs, LEARNING_RATE, N_ITERS
    )  # inputs: feature map, transition probabilities of the world, discount, and trajectories

    pdb.set_trace()

    values, _ = value_iteration.value_iteration(P_a,
                                                rewards,
                                                GAMMA,
                                                error=0.01,
                                                deterministic=True)
    # plots
    plt.figure(figsize=(20, 4))
    plt.subplot(1, 4, 1)
    img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
    plt.subplot(1, 4, 2)
    img_utils.heatmap2d(np.reshape(values_gt, (H, W), order='F'),
                        'Value Map - Ground Truth',
                        block=False)
    plt.subplot(1, 4, 3)
    img_utils.heatmap2d(np.reshape(rewards, (H, W), order='F'),
                        'Reward Map - Recovered',
                        block=False)
    plt.subplot(1, 4, 4)
    img_utils.heatmap2d(np.reshape(values, (H, W), order='F'),
                        'Value Map - Recovered',
                        block=False)
    plt.show()
Example 12
def main():

	# named tuple to record demonstrations
	Step = namedtuple('Step','cur_state action next_state reward done')

	# argument parser for command line arguments
	parser = argparse.ArgumentParser(description=None)

	parser.add_argument('-wid', '--width', default=5, type=int, 
						help='width of the gridworld')
	parser.add_argument('-hei', '--height', default=5, type=int, 
						help='height of the gridworld')
	parser.add_argument('-lr', '--learning_rate', default=0.01, type=float, 
						help='learning rate')
	parser.add_argument('-l', '--l_traj', default=20, type=int, 
						help='length of expert trajectory')

	parser.add_argument('--no-rand_start', dest='rand_start', action='store_false', 
						help='when sampling trajectories, fix start positions')
	parser.add_argument('--rand_start', dest='rand_start', action='store_true', 
						help='when sampling trajectories, randomly pick start positions')
	parser.add_argument('--approx', dest='approx', action='store_true', 
						help='flag to perform approximation of psa')

	parser.add_argument('-g', '--gamma', default=0.9, type=float, 
						help='discount factor')
	parser.add_argument('-n', '--n_iters', default=20, type=int, 
						help='number of iterations')
	parser.add_argument('-t', '--n_trajs', default=100, type=int, 
						help='number of expert trajectories')
	parser.add_argument('-a', '--act_random', default=0.3, type=float, 
						help='probability of acting randomly')
	
	# set default value for rand_start variable
	parser.set_defaults(rand_start=False)

	# parse and print arguments
	args = parser.parse_args()

	# arguments for environment and irl algorithm
	r_max = 1 
	gamma = args.gamma
	width = args.width
	height = args.height
	l_traj = args.l_traj
	approx = args.approx
	n_iters = args.n_iters
	n_trajs = args.n_trajs
	act_rand = args.act_random
	rand_start = args.rand_start
	learning_rate = args.learning_rate

	# variables for number of actions and states
	n_actions = 5
	n_states = height * width

	# initialize the gridworld
	# rmap_gt is the ground truth for rewards
	rmap_gt = np.zeros([height, width])

	rmap_gt[0, width-1] = r_max
	rmap_gt[height-1, 0] = r_max
	rmap_gt[height-1, width-1] = r_max

	# create grid world instance
	gw = gridworld.GridWorld(rmap_gt, {}, 1-act_rand)

	# get true rewards, state transition dynamics
	rewards_gt = np.reshape(rmap_gt, height*width, order='F')
	P_a_true = gw.get_transition_mat()

	trajs = generate_random(gw, n_actions, n_trajs=n_trajs, len_traj=l_traj, rand_start=rand_start)

	# get approximation of state transition dynamics
	P_a_approx = np.zeros((n_states, n_states, n_actions))
	for traj in trajs:
		for t in range(len(traj)):
			P_a_approx[traj[t].cur_state, traj[t].next_state, traj[t].action] += 1

	for s in range(n_states):
		for a in range(n_actions):
			if np.sum(P_a_approx[s,:,a]) != 0:
				P_a_approx[s,:,a] /= np.sum(P_a_approx[s,:,a])
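	# note: (s, a) pairs never observed in the random trajectories keep an all-zero
	# row in P_a_approx, so the estimate only covers visited transitions.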

	if approx:
		P_a = P_a_approx
	else:
		P_a = P_a_true

	# get true value function and policy from reward map
	values_gt, policy_gt = value_iteration.value_iteration(P_a, rewards_gt, gamma, error=0.01, deterministic=True)

	# use identity matrix as feature
	feat_map = np.eye(n_states)

	# other two features. due to the linear nature, 
	# the following two features might not work as well as the identity.
	# feat_map = feature_basis(gw)
	# feat_map = feature_coord(gw)

	trajs = generate_demonstrations(gw, policy_gt, n_trajs=n_trajs, len_traj=l_traj, 
									rand_start=rand_start)

	# perform inverse reinforcement learning to get reward function
	rewards = maxent_irl(feat_map, P_a, gamma, trajs, learning_rate, n_iters)
	values, _ = value_iteration.value_iteration(P_a, rewards, gamma, error=0.01, deterministic=True)

	# plots
	plt.figure(figsize=(20,4))
	plt.subplot(2, 2, 1)
	img_utils.heatmap2d(rmap_gt, 'Rewards Map - Ground Truth', block=False)
	plt.subplot(2, 2, 2)
	img_utils.heatmap2d(np.reshape(values_gt, (height,width), order='F'), 'Value Map - Ground Truth', block=False)
	plt.subplot(2, 2, 3)
	img_utils.heatmap2d(np.reshape(rewards, (height,width), order='F'), 'Reward Map - Recovered', block=False)
	plt.subplot(2, 2, 4)
	img_utils.heatmap2d(np.reshape(values, (height,width), order='F'), 'Value Map - Recovered', block=False)
	plt.show()

	# plots for state transition dynamics
	plt.figure(figsize=(10,4))
	plt.subplot(2, 1, 1)
	img_utils.heatmap2d(np.reshape(P_a_true[10,:,2], (height,width), order='F'), 'True Dist', block=False)
	plt.subplot(2, 1, 2)
	img_utils.heatmap2d(np.reshape(P_a_approx[10,:,2], (height,width), order='F'), 'Approx Dist', block=False)
	plt.show()