def show_agents(grids,
                agent_list,
                agent_names,
                grid_names,
                filename='AgentComparison',
                figtitle=''):
    """Shows how agents perform on a gridworld

    grid - list of gridworlds (see examples in earlier part of file)
    agent_list - list of agent (objects)
    agent_names - names of agents (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)
    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))

    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # axes[0].set_ylabel(grid_names[i])
        # Generate MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')

            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls,
                            reward,
                            start,
                            agent,
                            arrow_width=0.35,
                            fig=fig,
                            ax=ax)
            # Only write Agent names if it's the first row
            if i == 0:
                ax.set_title(agent_names[idx],
                             fontname='Times New Roman',
                             fontsize=16)

            print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space btwn subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
Example #2
    def test_uncalibrated_agents(self):
        grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
                ['X', -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
                ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 3, 'X'],
                ['X', ' ', ' ', 'X', -9, -9, -9, -9, -9, ' ', 'X'],
                ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
        n, s, e, w, stay = self.all_actions

        mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
        env = Mdp(mdp)

        agent1 = agents.OptimalAgent(gamma=0.9, num_iters=50)
        agent1.set_mdp(mdp)
        actions, _ = self.run_on_env(agent1, env, gamma=0.9, episode_length=13)
        self.assertEqual(actions,
                         [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

        agent2 = agents.UncalibratedAgent(gamma=0.9,
                                          num_iters=20,
                                          calibration_factor=5)
        agent2.set_mdp(mdp)
        actions, _ = self.run_on_env(agent2, env, gamma=0.9, episode_length=13)
        self.assertEqual(
            actions, [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

        agent3 = agents.UncalibratedAgent(gamma=0.9,
                                          num_iters=20,
                                          calibration_factor=0.5)
        agent3.set_mdp(mdp)
        actions, _ = self.run_on_env(agent3, env, gamma=0.9, episode_length=13)
        self.assertEqual(actions, [s, e, n, e, e, n, e, e, e, e, e, s, stay])
Example #3
    def test_myopic_agent(self):
        grid = [
            'XXXXXXXX', 'XA     X', 'X XXXX9X', 'X      X', 'X X2   X',
            'XXXXXXXX'
        ]
        n, s, e, w, stay = self.all_actions

        mdp = GridworldMdp(grid, living_reward=-0.1)
        env = Mdp(mdp)

        optimal_agent = agents.OptimalAgent(gamma=0.9, num_iters=20)
        optimal_agent.set_mdp(mdp)
        actions, _ = self.run_on_env(optimal_agent,
                                     env,
                                     gamma=0.9,
                                     episode_length=10)
        self.assertEqual(actions, [e, e, e, e, e, s, stay, stay, stay, stay])

        myopic_agent = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
        myopic_agent.set_mdp(mdp)
        actions, _ = self.run_on_env(myopic_agent,
                                     env,
                                     gamma=0.9,
                                     episode_length=10)
        self.assertEqual(actions, [s, s, e, e, e, e, e, n, stay, stay])
def get_policy(agent, grid):
    """Returns the policy of the agent given"""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction,
                                   num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)

    # This indexing may be transposed: writing the state as (y, x) gives the expected SVF vector
    action_dists = [[action((x, y)) for y in range(imsize)]
                    for x in range(imsize)]
    action_dists = np.array(action_dists)
    return action_dists
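# A minimal usage sketch for get_policy, assuming OptimalAgent is importable
# from agents as in the other examples. The grid must be square, since imsize
# is used for both coordinates; the string-row format follows
# test_myopic_agent above.
def _demo_get_policy():
    from agents import OptimalAgent

    grid = ['XXXXX',
            'XA  X',
            'X   X',
            'X  3X',
            'XXXXX']
    policy = get_policy(OptimalAgent(), grid)
    # One distribution over the five actions per grid cell
    print(policy.shape)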
def random_gridworld_plot(agent, other_agent, size, filename='RandomGrid'):
    """Plots random gridworld"""
    from gridworld.gridworld import Direction
    from utils import Distribution
    if agent is None:
        raise ValueError("agent cannot be None")

    num_R = 5
    mdp = GridworldMdp.generate_random_connected(size, size, num_R, noise=0)

    walls, reward, start = mdp.convert_to_numpy_input()

    def get_policy(agent):
        num_actions = 5
        imsize = len(walls)

        def dist_to_numpy(dist):
            return dist.as_numpy_array(Direction.get_number_from_direction,
                                       num_actions)

        def action(state):
            # Walls are invalid states and the MDP will refuse to give an action for
            # them. However, the VIN's architecture requires it to provide an action
            # distribution for walls too, so hardcode it to always be STAY.
            x, y = state
            if mdp.walls[y][x]:
                return dist_to_numpy(Distribution({Direction.STAY: 1}))
            return dist_to_numpy(agent.get_action_distribution(state))

        agent.set_mdp(mdp)
        action_dists = [[action((x, y)) for x in range(imsize)]
                        for y in range(imsize)]
        action_dists = np.array(action_dists)
        return action_dists

    fig, axes = plt.subplots(1, 1)
    fig.set_size_inches(5, 5)

    # Reward only
    plot_reward(reward, np.zeros_like(walls), fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'R', bbox_inches='tight', dpi=100)

    # Walls only
    plot_reward(np.zeros_like(reward), walls, fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'W', bbox_inches='tight', dpi=100)

    # Trajectory + Walls + Rewards
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    # plot_trajectory(walls, reward, start, agent, fig=fig, ax=axes)
    policy = get_policy(agent)
    plot_policy(walls, policy, fig=fig, ax=axes)
    fig.savefig(filename + 'Ptrue', bbox_inches='tight', dpi=100)

    axes.clear()
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    predicted = get_policy(other_agent)
    plot_policy_diff(predicted, policy, walls, fig=fig, ax=axes)
    fig.savefig(filename + 'Ppredicted', bbox_inches='tight', dpi=100)
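# A minimal usage sketch for random_gridworld_plot, again assuming
# OptimalAgent and MyopicAgent are importable from agents. It writes the
# RandomGridDemoR, RandomGridDemoW, RandomGridDemoPtrue and
# RandomGridDemoPpredicted figures.
def _demo_random_gridworld_plot():
    from agents import OptimalAgent, MyopicAgent

    random_gridworld_plot(OptimalAgent(),
                          MyopicAgent(horizon=6),
                          size=12,
                          filename='RandomGridDemo')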
Example #6
def test_visitations(grid, agent):
    """Tests the expected_counts calculation--might be einsum error"""
    # print("Testing expected_counts")
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)

    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction,
                                   num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)

    action_dists = [[action((x, y)) for y in range(imsize)]
                    for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()

    # print("Start state for given mdp:", start_state)

    start = start_state
    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    demo_counts = expected_counts(policy, trans, initial_states, 20, 0.9)

    import matplotlib.pyplot as plt
    plt.imsave("democounts", demo_counts.reshape((len(grid), len(grid))))
Example #7
def test_trajectory_plotting():
    """Tests trajectory plotting"""
    from agents import MyopicAgent, OptimalAgent
    from gridworld.gridworld import GridworldMdp

    agent = OptimalAgent()
    mdp = GridworldMdp.generate_random(12, 12, pr_wall=0.1, pr_reward=0.1)
    agent.set_mdp(mdp)
    walls, reward, start = mdp.convert_to_numpy_input()
    myopic = MyopicAgent(horizon=10)
    _plot_reward_and_trajectories_helper(
        reward, reward, walls, start, myopic, OptimalAgent(), filename="trajectory.png"
    )
Example #8
    def compare_agents(self, name, agent1, agent2, places=7, print_mdp=False):
        print('Comparing {0} agents'.format(name))
        set_seeds(314159)
        mdp = GridworldMdp.generate_random_connected(16, 16, 5, 0.2)
        if print_mdp: print(mdp)
        env = Mdp(mdp)
        self.time(lambda: agent1.set_mdp(mdp), "Python planner")
        self.time(lambda: agent2.set_mdp(mdp), "Numpy/Tensorflow planner")
        for s in mdp.get_states():
            for a in mdp.get_actions(s):
                mu = agent1.extend_state_to_mu(s)
                qval1, qval2 = agent1.qvalue(mu, a), agent2.qvalue(mu, a)
                self.assertAlmostEqual(qval1, qval2, places=places)
Example #9
def test_irl(grid, agent):
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)

    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction,
                                   num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)

    # This indexing may be transposed: writing the state as (y, x) gives the expected SVF vector
    action_dists = [[action((x, y)) for y in range(imsize)]
                    for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()

    # print("Start state for given mdp:", start_state)
    inferred = irl_wrapper(walls, action_dists, start_state, 20, 0.9)
    # print("---true below---")
    # print(rewards)

    return walls, start_state, inferred, rewards
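# A hedged follow-on sketch for test_irl: plot the inferred reward next to the
# true reward, assuming plot_reward's third positional argument is the axis
# title (as in show_agents above) and that the inferred reward can be reshaped
# to the grid shape (as in the BIRL snippet at the end of this file).
def _demo_test_irl(grid, agent):
    walls, start_state, inferred, rewards = test_irl(grid, agent)
    fig, (ax_true, ax_inferred) = plt.subplots(1, 2)
    plot_reward(rewards, walls, 'True', fig=fig, ax=ax_true)
    plot_reward(np.reshape(inferred, walls.shape), walls, 'Inferred',
                fig=fig, ax=ax_inferred)
    fig.savefig('IrlComparison.png', bbox_inches='tight')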
Example #10
def evaluate_proxy(walls,
                   start_state,
                   proxy_reward,
                   true_reward,
                   gamma=0.9,
                   episode_length=float("inf")):
    """Runs agent on a proxy environment for one episode, while collecting true reward from a separate environment

    walls: Numpy array of walls, where each entry is 1 or 0
    start_state: Starting state for the agent
    proxy_reward: Numpy array of reward values
    true_reward: Numpy array of reward values

    Creates a proxy mdp by overlaying walls onto proxy grid.
    True reward is summed if the reward grid's entry at the given state can be casted to a float
    
    Returns sum of proxy reward / sum of true reward. Which is related to regret.
    """
    proxy_mdp = GridworldMdp.from_numpy_input(walls, proxy_reward, start_state)
    true_mdp = GridworldMdp.from_numpy_input(walls, true_reward, start_state)
    env = Mdp(true_mdp)

    proxy_agent = FastOptimalAgent()
    proxy_agent.set_mdp(true_mdp, proxy_mdp)
    proxy_trajectory = run_agent(proxy_agent, env, episode_length)
    reward_from_proxy_agent = get_reward_from_trajectory(
        proxy_trajectory, gamma)

    true_agent = FastOptimalAgent()
    true_agent.set_mdp(true_mdp)
    true_trajectory = run_agent(true_agent, env, episode_length)
    reward_from_true_agent = get_reward_from_trajectory(true_trajectory, gamma)
    if reward_from_true_agent == 0:
        # TODO(rohinmshah): Figure out why this can happen, and come up with a
        # better solution than this hack
        return (1.0 + reward_from_proxy_agent) / (1.0 + reward_from_true_agent)
    return float(reward_from_proxy_agent) / reward_from_true_agent
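# A minimal usage sketch for evaluate_proxy, reusing the 5x5 walls layout from
# the value-iteration snippet near the end of this file; the reward placements
# and start state are made up for illustration.
def _demo_evaluate_proxy():
    walls = np.array([[1, 1, 1, 1, 1],
                      [1, 0, 0, 0, 1],
                      [1, 0, 0, 0, 1],
                      [1, 0, 0, 0, 1],
                      [1, 1, 1, 1, 1]], dtype=np.float32)
    proxy_reward = np.zeros_like(walls)
    proxy_reward[1][3] = 1.0
    true_reward = np.zeros_like(walls)
    true_reward[3][3] = 1.0
    ratio = evaluate_proxy(walls, (1, 1), proxy_reward, true_reward,
                           gamma=0.9, episode_length=20)
    print(ratio)  # 1.0 means no regret relative to the true-reward agent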
Example #11
def main():
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)
    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(10,
                                                             1,
                                                             gamma=0.95,
                                                             num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(gamma=0.95,
                                             num_iters=20,
                                             calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(gamma=0.95,
                                              num_iters=20,
                                              calibration_factor=0.5)

    agents = [opt, naive, soph, myopic, over, under]
    names = [
        'Optimal', 'Naive', 'Sophisticated', 'Myopic', 'Overconfident',
        'Underconfident'
    ]
    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)
    print(opt.values.T)
Example #12
def plot_trajectory(
    wall,
    reward,
    start,
    agent,
    fig,
    ax,
    arrow_width=0.5,
    EPISODE_LENGTH=35,
    animate=False,
    fname=None,
):
    """Simulates a rollout of an agent given an MDP specified
    by the wall, reward, and start state. And plots it.

    If animate is true, an animation object will be returned
    """
    from agent_runner import run_agent
    from gridworld.gridworld import GridworldMdp
    from mdp_interface import Mdp

    mdp = GridworldMdp.from_numpy_input(wall, reward, start)

    agent.set_mdp(mdp)
    env = Mdp(mdp)
    trajectory = run_agent(agent, env, episode_length=EPISODE_LENGTH, determinism=True)

    if len(trajectory) <= 1:
        raise ValueError("Trajectory rolled out unsuccessfully")

    # Tuples of (state, next) - to be used for plotting
    state_trans = [(info[0], info[2]) for info in trajectory]
    count = 0
    for trans in state_trans:
        if trans[0] == trans[1]:
            count += 1
    if count == len(state_trans):
        print(
            "The given agent stayed in the same spot for all {} steps".format(
                len(state_trans)
            )
        )

    if fig is None or ax is None:
        fig, ax = plt.subplots(1, 1)
    if ax is not None and type(ax) is list:
        raise ValueError("Given {} axes, but can only use 1 axis".format(len(ax)))

    # Plot starting point
    plot_pos(start, ax=ax, color="k", marker="o", grid_size=len(wall))
    # Plot ending trajectory point
    finish = state_trans[-1][0]
    plot_pos(finish, ax=ax, color="k", marker="*", grid_size=len(wall))
    plot_lines(
        ax,
        fig,
        trans_list=state_trans,
        color="black",
        arrow_width=arrow_width,
        grid_size=len(wall),
        animate=animate,
        fname=fname,
    )
    ax.set_xticks([])
    ax.set_yticks([])
    return fig, ax
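# A minimal usage sketch for plot_trajectory, mirroring the random-grid
# parameters used in test_trajectory_plotting above.
def _demo_plot_trajectory():
    from agents import OptimalAgent
    from gridworld.gridworld import GridworldMdp

    mdp = GridworldMdp.generate_random(12, 12, pr_wall=0.1, pr_reward=0.1)
    walls, reward, start = mdp.convert_to_numpy_input()
    fig, ax = plt.subplots(1, 1)
    plot_trajectory(walls, reward, start, OptimalAgent(), fig=fig, ax=ax)
    fig.savefig('TrajectoryDemo.png', bbox_inches='tight')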
Example #13
def test_coherence(grid, agent):
    """Test that these arrays perform as expected under np.einsum"""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)

    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction,
                                   num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)

    action_dists = [[action((x, y)) for y in range(imsize)]
                    for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()

    print("Start state for given mdp:", start_state)
    # inferred = _irl_wrapper(walls, action_dists, start_state, 20, 1.0)

    start = start_state
    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    gshape = (len(grid), len(grid))
    print("initial states")
    print('-' * 20)
    print(initial_states.reshape(gshape))
    next_states = np.einsum("i,ij,ijk -> k", initial_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("first expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))
    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    print("second expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("third expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    # for i in range(5):
    #     next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    #     # next_states = (next_states.reshape(gshape).T).reshape(-1)
    #     print("{}th expected counts".format(4+i))
    #     print('-'*20)
    #     print(next_states.reshape(gshape))
    return next_states.reshape((len(grid), len(grid)))
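# A hedged sketch of the visitation recursion that test_coherence exercises:
# the einsum "i,ij,ijk -> k" composes the current state distribution with the
# flattened policy (state x action) and the transition tensor
# (state x action x next state). This stand-in illustrates how expected_counts
# could accumulate discounted visitations; it is not the project's
# implementation.
def _expected_counts_sketch(policy, trans, initial_states, horizon, gamma):
    state_dist = np.array(initial_states, dtype=float)
    counts = state_dist.copy()
    for t in range(1, horizon):
        state_dist = np.einsum("i,ij,ijk -> k", state_dist, policy, trans)
        counts += (gamma ** t) * state_dist
    return counts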
Example #14
    def optimal_agent_test(self, agent):
        grid = [
            'XXXXXXXXX', 'X9X6XA  X', 'X X X XXX', 'X      2X', 'XXXXXXXXX'
        ]
        n, s, e, w, stay = self.all_actions

        mdp = GridworldMdp(grid, living_reward=-0.1)
        env = Mdp(mdp)
        agent.set_mdp(mdp)
        start_state = mdp.get_start_state()

        # Action distribution
        action_dist = agent.get_action_distribution(start_state)
        self.assertEqual(action_dist, Distribution({s: 1}))

        # Trajectory
        actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
        self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

        # Same thing, but with a bigger discount
        mdp = GridworldMdp(grid, living_reward=-0.001)
        env = Mdp(mdp)
        agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
        agent.set_mdp(mdp)
        start_state = mdp.get_start_state()

        # Values
        # Inaccurate because we ignore the living reward and use only 20
        # iterations of value iteration, so only check to 2 places
        self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)

        # Action distribution
        action_dist = agent.get_action_distribution(start_state)
        self.assertEqual(action_dist, Distribution({s: 1}))

        # Trajectory
        actions, reward = self.run_on_env(agent,
                                          env,
                                          gamma=0.5,
                                          episode_length=10)
        # Again approximate comparison since we don't consider living rewards
        self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
        self.assertEqual(actions,
                         [s, s, e, e, stay, stay, stay, stay, stay, stay])

        # Same thing, but with Boltzmann rationality
        agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
        agent.set_mdp(mdp)

        # Action distribution
        dist = agent.get_action_distribution(start_state).get_dict()
        nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
        for p in [nprob, sprob, eprob, wprob]:
            self.assertTrue(0 < p < 1)
        self.assertEqual(nprob, wprob)
        self.assertTrue(sprob > nprob)
        self.assertTrue(nprob > eprob)

        middle_state = (2, 3)
        dist = agent.get_action_distribution(middle_state).get_dict()
        nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
        for p in [nprob, sprob, eprob, wprob]:
            self.assertTrue(0 < p < 1)
        self.assertEqual(nprob, sprob)
        self.assertTrue(wprob > eprob)
        self.assertTrue(eprob > nprob)
Example #15
import numpy as np
import tensorflow as tf

from model import tf_value_iter_no_config
from agents import OptimalAgent
from gridworld.gridworld import GridworldMdp

sess = tf.InteractiveSession()

walls = [[1, 1, 1, 1, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1], [1, 0, 0, 0, 1],
         [1, 1, 1, 1, 1]]
reward = [[0, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]

walls = np.array(walls)
reward = np.array(reward)
agent_start = (1, 3)
mdp = GridworldMdp.from_numpy_input(walls.astype(np.float32),
                                    reward.astype(np.float32),
                                    start_state=agent_start)
imsize = walls.shape[0]
discount = 0.9
num_iters = 50


def test_model(wall_tf, reward_tf, alg):
    return alg(wall_tf, reward_tf)


def tf_value_iter_model(wall_tf, reward_tf):
    a = tf.reshape(wall_tf, [1, imsize, imsize])
    b = tf.reshape(reward_tf, [1, imsize, imsize])
    X = tf.stack([a, b], axis=-1)
    qvals = tf_value_iter_no_config(X,
Example #16
    return tf.app.flags.FLAGS


if __name__ == '__main__':
    # get flags || Data
    config = init_birl_flags()
    if config.datafile is None:
        print('--datafile option is required')
        exit()

    # seed random generators
    set_seeds(config.seed)

    imagetest, rewardtest, ytest = load_dataset(config.datafile)[-3:]
    for image, reward, policy in zip(imagetest, rewardtest, ytest):
        mdp = GridworldMdp.from_numpy_input(image, reward)
        mdp = GridworldMdpLearnableR.from_full_mdp(mdp)
        inferred_reward = birl(mdp,
                               policy,
                               config.beta,
                               num_burn_in=config.num_burn_in,
                               num_samples=config.num_samples,
                               display_step=config.display_step)

        print('The first set of walls is:')
        print(image)
        print('The first reward should be:')
        print(reward)
        inferred_reward = inferred_reward / inferred_reward.max()
        inferred_reward = np.reshape(inferred_reward, image.shape)
        print('The inferred reward is:')