import time
from itertools import product

import numpy as np
import matplotlib.pyplot as plt

# NOTE: GridWorld, the MDP solvers (q_learning, value_iteration), the policy
# classes, the reward/feature construction helpers, the plotting utilities,
# collect_trajectories, and MaxEntIRL are assumed to be importable from the
# surrounding package; their exact module paths are not shown here.


def test_gridworld_q_learning():
    np.random.seed(0)

    N = 5
    goal_pos = np.array([[N-1, N-1]])
    human_pos = np.array([[N-1, 0]])
    human_radius = 2

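    # reward grid: -1 living cost on every cell, +10 at the goal cell,
    # -10 on every cell within `human_radius` of the human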
    grid = np.ones((N, N), dtype=float) * -1
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=0.8,
        render=True,
    )

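    # solve the tabular MDP with Q-learning, then derive a (stochastic)
    # greedy policy from the learned values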
    mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    mdp_algo.run()
    policy = StochasticGreedyPolicy(
        env.action_space(), mdp_algo, env.transition)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plt.show()

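    # roll out the learned policy in the rendered environment until the
    # episode terminates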
    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()
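

# ---------------------------------------------------------------------------
# Hedged sketch (an assumption, not the project's actual implementation): the
# reward-construction helpers used above are only called, never shown. Based
# purely on their call signatures, they could look roughly like this.
# ---------------------------------------------------------------------------
def _construct_goal_reward_sketch(grid, goal_pos, reward):
    # write `reward` into every (x, y) goal cell
    out = grid.copy()
    for x, y in goal_pos:
        out[x, y] = reward
    return out


def _construct_human_radius_reward_sketch(grid, human_pos, radius, reward):
    # penalize every cell within Euclidean `radius` of any human position
    out = grid.copy()
    xs, ys = np.meshgrid(np.arange(grid.shape[0]),
                         np.arange(grid.shape[1]),
                         indexing="ij")
    for hx, hy in human_pos:
        mask = (xs - hx) ** 2 + (ys - hy) ** 2 <= radius ** 2
        out[mask] = reward
    return out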


def test_gridworld_value_iteration():
    np.random.seed(0)

    N = 10
    goal_pos = np.array([[N-1, N-1], [N-1, N-2]])
    human_pos = np.array([[N//2, N//2], [N-1, 0]])
    human_radius = 3

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=True,
    )

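    # solve the MDP exactly with value iteration; the epsilon-greedy wrapper
    # occasionally substitutes a random action for the greedy one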
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = EpsGreedyPolicy(env.action_space(), mdp_algo)

    # plot results
    R = env.reward.reshape((N, N)).T
    V = np.asarray(mdp_algo.V).reshape((N, N)).T

    plot_grid_map(R, "Reward", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)
    plot_policy(policy, (N, N), "Policy", values=V, cmap=plt.cm.Blues)
    plt.show()

    obs, rew, done, info = env.reset()
    while not done:
        act = policy.get_action(obs)
        obs, rew, done, info = env.step(act)
        time.sleep(0.2)

    env.close()


def test_feature_gridworld_maxent_irl():
    np.random.seed(0)

    # env
    N = 15

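    # start-state distribution: mass only in the far corner (weights grow as
    # i**2 + j**2, then normalized); goals cover one full edge of the grid
    # ([n, 0] for every n)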
    init_pos = np.zeros((N, N), dtype=float)
    for i, j in product(range(N // 2 + 2, N), range(N // 2 + 2, N)):
        init_pos[i, j] = i**2 + j**2
    init_pos /= np.sum(init_pos)
    goal_pos = np.array([[n, 0] for n in range(N)])

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
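    # the boundary term presumably adds a penalty that decays exponentially
    # (exp_constant=0.2) with distance past the N // 2 boundary along axis 0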
    grid = construct_feature_boundary_reward(
        grid,
        boundary_axis=0,
        boundary_value=N // 2,
        reward=-10,
        exp_constant=0.2,
    )

    plot_grid_map(init_pos.T,
                  "Initial Position Distribution",
                  cmap=plt.cm.Blues)
    plot_grid_map(grid.T, "Reward (Ground Truth)", cmap=plt.cm.Reds)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=init_pos,
        goal_pos=goal_pos,
        reward_grid=grid,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

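    # demonstrations: 20 expert rollouts, each capped at 2 * N steps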
    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=20,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # IRL feature map
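    # the feature map is an |S| x d matrix of per-state features; MaxEnt IRL
    # recovers a reward expressed in terms of these features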
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
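    # MaxEnt IRL iteratively adjusts the reward so that the expected feature
    # counts under the induced policy match the empirical feature counts of
    # the demonstrations; lr is the gradient step size and anneal_rate
    # presumably decays it across the (at most max_iter) iterations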
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", cmap=plt.cm.Blues)
    plt.show()


def test_gridworld_maxent_irl():
    np.random.seed(0)

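    # same IRL pipeline as above, but with a single fixed start state, a
    # human-avoidance penalty in the ground-truth reward, and 200 demonstrations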
    # env
    N = 10
    goal_pos = np.array([[N - 1, N - 1]])
    human_pos = np.array([[3, 3]])
    human_radius = 2

    grid = np.zeros((N, N), dtype=float)
    grid = construct_goal_reward(grid, goal_pos, 10)
    grid = construct_human_radius_reward(grid, human_pos, human_radius, -10)

    env = GridWorld(
        dimensions=(N, N),
        init_pos=(0, 0),
        goal_pos=goal_pos,
        reward_grid=grid,
        human_pos=human_pos,
        action_success_rate=1,
        render=False,
    )

    # learn a policy
    mdp_algo = value_iteration(env.transition, env.reward, gamma=0.99)
    # mdp_algo = q_learning(env.transition, env.reward, gamma=0.99)
    # policy = GreedyPolicy(env.action_space(), mdp_algo)
    # policy = EpsGreedyPolicy(env.action_space(), mdp_algo, epsilon=0.1)
    policy = StochasticGreedyPolicy(env.action_space(), mdp_algo,
                                    env.transition)

    V = np.asarray(mdp_algo.V).reshape((N, N)).T
    R = env.reward.reshape((N, N)).T
    plot_grid_map(R, "Reward (Ground Truth)", cmap=plt.cm.Reds)
    plot_grid_map(V, "Value Function", cmap=plt.cm.Blues)

    # roll out trajectories
    dataset = collect_trajectories(policy=policy,
                                   env=env,
                                   num_trajectories=200,
                                   maxlen=N * 2)
    plot_dataset_distribution(dataset, (N, N), "Dataset State Distribution")

    # feature map
    feature_map = [
        env._feature_map(s) for s in range(env.observation_space().n)
    ]
    feature_map = np.array(feature_map)

    # IRL
    me_irl = MaxEntIRL(observation_space=env.observation_space(),
                       action_space=env.action_space(),
                       transition=env.transition,
                       goal_states=env.goal_states,
                       dataset=dataset,
                       feature_map=feature_map,
                       max_iter=10,
                       lr=0.1,
                       anneal_rate=0.9)
    Rprime = me_irl.train()
    Rprime = Rprime.reshape((N, N)).T

    # plot results
    plot_grid_map(Rprime, "Reward (IRL)", print_values=True, cmap=plt.cm.Blues)
    plt.show()