def test_uncalibrated_agents(self):
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X',  -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ',   3, 'X'],
            ['X', ' ', ' ', 'X',  -9,  -9,  -9,  -9,  -9, ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
    env = Mdp(mdp)

    # The optimal agent detours through the top corridor so that noise can't
    # push it into the row of -9s.
    agent1 = agents.OptimalAgent(gamma=0.9, num_iters=50)
    agent1.set_mdp(mdp)
    actions, _ = self.run_on_env(agent1, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

    # The overconfident agent (calibration factor > 1) heads straight for
    # the 3, underweighting the risk of slipping into a -9.
    agent2 = agents.UncalibratedAgent(
        gamma=0.9, num_iters=20, calibration_factor=5)
    agent2.set_mdp(mdp)
    actions, _ = self.run_on_env(agent2, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions,
                     [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

    # The underconfident agent (calibration factor < 1) takes an even more
    # cautious route to the 3 than the optimal agent.
    agent3 = agents.UncalibratedAgent(
        gamma=0.9, num_iters=20, calibration_factor=0.5)
    agent3.set_mdp(mdp)
    actions, _ = self.run_on_env(agent3, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [s, e, n, e, e, n, e, e, e, e, e, s, stay])
def test_myopic_agent(self):
    grid = ['XXXXXXXX',
            'XA     X',
            'X XXXX9X',
            'X      X',
            'X X2   X',
            'XXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)

    # The optimal agent goes directly to the 9.
    optimal_agent = agents.OptimalAgent(gamma=0.9, num_iters=20)
    optimal_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(
        optimal_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [e, e, e, e, e, s, stay, stay, stay, stay])

    # The myopic agent with horizon 6 can't see the 9 at first, so it heads
    # for the 2, switching to the 9 once it comes within the horizon.
    myopic_agent = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
    myopic_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(
        myopic_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [s, s, e, e, e, e, e, n, stay, stay])
def compare_agents(self, name, agent1, agent2, places=7, print_mdp=False):
    print('Comparing {0} agents'.format(name))
    set_seeds(314159)
    mdp = GridworldMdp.generate_random_connected(16, 16, 5, 0.2)
    if print_mdp:
        print(mdp)
    env = Mdp(mdp)
    self.time(lambda: agent1.set_mdp(mdp), "Python planner")
    self.time(lambda: agent2.set_mdp(mdp), "Numpy/Tensorflow planner")
    # The two planners should agree on the Q-value of every (state, action)
    # pair, up to numerical precision.
    for s in mdp.get_states():
        for a in mdp.get_actions(s):
            mu = agent1.extend_state_to_mu(s)
            qval1, qval2 = agent1.qvalue(mu, a), agent2.qvalue(mu, a)
            self.assertAlmostEqual(qval1, qval2, places=places)
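# A minimal sketch of how compare_agents might be called from a concrete
# test. The agent pair is an assumption, mirroring the planners used
# elsewhere in this codebase (agents.OptimalAgent as the Python planner,
# fast_agents.FastOptimalAgent as its vectorized counterpart); the test
# name is hypothetical.
def test_fast_optimal_agent_matches(self):
    python_planner = agents.OptimalAgent(gamma=0.95, num_iters=20)
    fast_planner = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    self.compare_agents('optimal', python_planner, fast_planner)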
def main():
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)

    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(
        10, 1, gamma=0.95, num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(
        gamma=0.95, num_iters=20, calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(
        gamma=0.95, num_iters=20, calibration_factor=0.5)
    agents = [opt, naive, soph, myopic, over, under]
    names = ['Optimal', 'Naive', 'Sophisticated', 'Myopic',
             'Overconfident', 'Underconfident']

    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)
    print(opt.values.T)
def evaluate_proxy(walls, start_state, proxy_reward, true_reward,
                   gamma=0.9, episode_length=float("inf")):
    """Runs an agent that plans with the proxy reward for one episode, while
    collecting true reward from the true environment.

    walls: Numpy array of walls, where each entry is 1 or 0
    start_state: Starting state for the agent
    proxy_reward: Numpy array of reward values
    true_reward: Numpy array of reward values

    Creates a proxy MDP by overlaying the walls onto the proxy reward grid.
    True reward is summed if the reward grid's entry at the given state can
    be cast to a float.

    Returns the true reward collected by the proxy agent divided by the true
    reward collected by an optimal agent; the lower the ratio, the higher
    the regret.
    """
    proxy_mdp = GridworldMdp.from_numpy_input(walls, proxy_reward, start_state)
    true_mdp = GridworldMdp.from_numpy_input(walls, true_reward, start_state)
    env = Mdp(true_mdp)

    # The proxy agent plans against the proxy reward, but acts in (and is
    # scored on) the true environment.
    proxy_agent = FastOptimalAgent()
    proxy_agent.set_mdp(true_mdp, proxy_mdp)
    proxy_trajectory = run_agent(proxy_agent, env, episode_length)
    reward_from_proxy_agent = get_reward_from_trajectory(
        proxy_trajectory, gamma)

    # Baseline: an agent that plans against the true reward.
    true_agent = FastOptimalAgent()
    true_agent.set_mdp(true_mdp)
    true_trajectory = run_agent(true_agent, env, episode_length)
    reward_from_true_agent = get_reward_from_trajectory(true_trajectory, gamma)

    if reward_from_true_agent == 0:
        # TODO(rohinmshah): Figure out why this can happen, and come up with a
        # better solution than this hack
        return (1.0 + reward_from_proxy_agent) / (1.0 + reward_from_true_agent)
    return float(reward_from_proxy_agent) / reward_from_true_agent
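# A hedged usage sketch for evaluate_proxy. The 5x5 layout, reward
# placements, and (x, y) start state below are invented for illustration;
# only the function signature comes from the definition above.
def _evaluate_proxy_demo():
    import numpy as np
    walls = np.ones((5, 5), dtype=int)
    walls[1:4, 1:4] = 0              # carve out a 3x3 open room in the walls
    proxy_reward = np.zeros((5, 5))
    proxy_reward[1][3] = 1.0         # what the agent is told to optimize
    true_reward = np.zeros((5, 5))
    true_reward[3][3] = 1.0          # what we actually care about
    ratio = evaluate_proxy(walls, (1, 1), proxy_reward, true_reward,
                           gamma=0.9, episode_length=20)
    print('Fraction of achievable true reward:', ratio)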
def plot_trajectory(
    wall,
    reward,
    start,
    agent,
    fig,
    ax,
    arrow_width=0.5,
    EPISODE_LENGTH=35,
    animate=False,
    fname=None,
):
    """Simulates and plots a rollout of the agent in the MDP specified by
    the wall, reward, and start state.

    If animate is True, an animation object will be returned.
    """
    from agent_runner import run_agent
    from gridworld.gridworld import GridworldMdp
    from mdp_interface import Mdp

    mdp = GridworldMdp.from_numpy_input(wall, reward, start)
    agent.set_mdp(mdp)
    env = Mdp(mdp)
    trajectory = run_agent(agent, env, episode_length=EPISODE_LENGTH,
                           determinism=True)
    if len(trajectory) <= 1:
        raise ValueError("Trajectory rolled out unsuccessfully")

    # Tuples of (state, next_state) - to be used for plotting
    state_trans = [(info[0], info[2]) for info in trajectory]
    num_stays = sum(1 for curr, nxt in state_trans if curr == nxt)
    if num_stays == len(state_trans):
        print("The agent stayed in the same spot for all {} steps...".format(
            len(state_trans)))

    if fig is None or ax is None:
        fig, ax = plt.subplots(1, 1)
    if isinstance(ax, list):
        raise ValueError("Given {} axes, but can only use 1 axis".format(len(ax)))

    # Plot starting point
    plot_pos(start, ax=ax, color="k", marker="o", grid_size=len(wall))
    # Plot ending trajectory point
    finish = state_trans[-1][0]
    plot_pos(finish, ax=ax, color="k", marker="*", grid_size=len(wall))
    plot_lines(
        ax,
        fig,
        trans_list=state_trans,
        color="black",
        arrow_width=arrow_width,
        grid_size=len(wall),
        animate=animate,
        fname=fname,
    )
    ax.set_xticks([])
    ax.set_yticks([])
    return fig, ax
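# A hedged usage sketch: plot a rollout of an optimal agent on arrays like
# those in the evaluate_proxy demo. The import path and save filename are
# assumptions; fig=None, ax=None makes plot_trajectory allocate its own axes.
def _plot_trajectory_demo(walls, true_reward):
    from fast_agents import FastOptimalAgent  # assumed module path
    fig, ax = plot_trajectory(walls, true_reward, (1, 1),
                              FastOptimalAgent(), fig=None, ax=None)
    fig.savefig('trajectory.png')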
def optimal_agent_test(self, agent):
    grid = ['XXXXXXXXX',
            'X9X6XA  X',
            'X X X XXX',
            'X      2X',
            'XXXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
    self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

    # Same thing, but with heavier discounting (gamma = 0.5) and a smaller
    # living reward
    mdp = GridworldMdp(grid, living_reward=-0.001)
    env = Mdp(mdp)
    agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Values
    # Inaccurate because I ignore living reward and we only use 20
    # iterations of value iteration, so only check to 2 places
    self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, reward = self.run_on_env(agent, env, gamma=0.5, episode_length=10)
    # Again approximate comparison since we don't consider living rewards
    self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
    self.assertEqual(actions, [s, s, e, e, stay, stay, stay, stay, stay, stay])

    # Same thing, but with Boltzmann rationality
    agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)

    # Action distribution
    dist = agent.get_action_distribution(start_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, wprob)
    self.assertTrue(sprob > nprob)
    self.assertTrue(nprob > eprob)

    middle_state = (2, 3)
    dist = agent.get_action_distribution(middle_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, sprob)
    self.assertTrue(wprob > eprob)
    self.assertTrue(eprob > nprob)
grid = ['XXXXXXXXX',
        'X9XAX   X',
        'X X X   X',
        'X       X',
        'XXXXXXXXX']
preference_grid = ['XXXXXXXXXXXXXX',
                   'XXXXXX4XXXXXXX',
                   'XXXXXX XXXXXXX',
                   'XXXXX     XXXX',
                   'XXXXX XXX  2XX',
                   'XXXXX XXX XXXX',
                   'XXXX1 XXX XXXX',
                   'XXXXX XXX XXXX',
                   'XXXXX XXX XXXX',
                   'XXXXX XXX XXXX',
                   'X1        XXXX',
                   'XXXXX XX1XXXXX',
                   'XXXXXAXXXXXXXX',
                   'XXXXXXXXXXXXXX']
mdp = GridworldMdp(preference_grid)
# mdp = GridworldMdp.generate_random(imsize, imsize, pr_wall, pr_reward)
# agent = agents.OptimalAgent()
agent = agents.SophisticatedTimeDiscountingAgent(2, 0.01)
agent.set_mdp(mdp)
env = Mdp(mdp)
trajectory = env.perform_rollout(agent, max_iter=20, print_step=1000)
print_training_example(mdp, trajectory)
print(agent.reward)

# class NeuralAgent(Agent):
#     def __init__(self, save_dir):
#         Agent.__init__(self)
#         self.sess = tf.Session(graph=tf.Graph())
#         tf.saved_model.loader.load(sess, ['train'], '/tmp/planner-vin/model/')
#
#     def get_action(self, state):
#         walls, rewards, _ = self.mdp.convert_to_numpy_input()
#         fd = {
#             'image:0': walls,
def test_environment(self):
    env = Mdp(self.mdp3)
    self.assertEqual(env.get_current_state(), (3, 3))

    next_state, reward = env.perform_action(Direction.NORTH)
    self.assertEqual(next_state, (3, 2))
    self.assertEqual(reward, -0.01)
    self.assertEqual(env.get_current_state(), next_state)
    self.assertFalse(env.is_done())

    env.reset()
    self.assertEqual(env.get_current_state(), (3, 3))
    self.assertFalse(env.is_done())

    next_state, reward = env.perform_action(Direction.WEST)
    self.assertEqual(next_state, (2, 3))
    self.assertEqual(reward, -0.01)
    self.assertEqual(env.get_current_state(), next_state)
    self.assertFalse(env.is_done())

    next_state, reward = env.perform_action(Direction.NORTH)
    self.assertEqual(next_state, (2, 2))
    self.assertEqual(reward, -0.01)
    self.assertEqual(env.get_current_state(), next_state)
    self.assertFalse(env.is_done())

    next_state, reward = env.perform_action(Direction.STAY)
    self.assertEqual(next_state, (2, 2))
    self.assertEqual(reward, 1)
    self.assertEqual(env.get_current_state(), next_state)
    self.assertFalse(env.is_done())

    env.reset()
    self.assertFalse(env.is_done())
    self.assertEqual(env.get_current_state(), (3, 3))