def __init__(self, col_sq_locs_dict, width=5, height=3, init_loc=(1, 1), goal_locs=[(5, 3)]):
    '''
    Args:
        col_sq_locs_dict (dict):
            Key: int (width)
            Val: dict
                Key: int (height)
                Val: color
        width (int)
        height (int)
        init_loc (tuple)
        goal_locs (list of tuples)
    '''
    GridWorldMDP.__init__(self, width, height, init_loc=init_loc, goal_locs=goal_locs)
    self.init_state = ColoredGridWorldState(init_loc[0], init_loc[1],
                                            col_sq_locs_dict[init_loc[0]][init_loc[1]])
    self.col_sq_locs_dict = col_sq_locs_dict
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    tabular_agent = CherryQAgent(mdp, model=lambda *x: ActionValueFunction(*x, init=1.0), name='Tabular', lr=0.7)
    linear_agent = CherryQAgent(mdp, model=lambda *x: nn.Linear(*x), name='Linear', lr=0.1)
    mlp_agent = CherryQAgent(mdp, model=lambda *x: MLP(*x), name='MLP', lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents, mdp, instances=10, episodes=50, steps=50, open_plot=open_plot)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=8, height=3, init_loc=(1, 1), goal_locs=[(8, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=20, episodes=300, steps=20,
                      open_plot=open_plot, track_success=True, success_reward=1)
def __init__(self, width=5, height=3, init_loc=(1, 1), rand_init=False, goal_locs=[(5, 3)],
             lava_locs=[()], walls=[], is_goal_terminal=True, gamma=0.99, slip_prob=0.0,
             step_cost=0.0, lava_cost=0.01, goal_rewards=[1.], name="Grid-world"):
    GridWorldMDP.__init__(self, width=width, height=height, init_loc=init_loc, rand_init=rand_init,
                          goal_locs=goal_locs, lava_locs=lava_locs, walls=walls,
                          is_goal_terminal=is_goal_terminal, gamma=gamma, slip_prob=slip_prob,
                          step_cost=step_cost, lava_cost=lava_cost, name=name)
    self.goal_rewards = goal_rewards
    self.slip_unidirectional = True
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent,
                                              agent_params={"epsilon": 0.9, "actions": mdp.get_actions()})

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp,
                      instances=5, episodes=100, steps=150, open_plot=open_plot)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent,
                                              agent_params={"epsilon": 0.9},
                                              actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp,
                      instances=5, episodes=100, steps=150, open_plot=open_plot)
def make_mdp(mdp_class="grid", grid_dim=7): ''' Returns: (MDP) ''' # Grid/Hallway stuff. width, height = grid_dim, grid_dim hall_goal_locs = [(i, width) for i in range(1, height + 1)] four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)] four_room_goal_loc = four_room_goal_locs[5] # Taxi stuff. agent = {"x": 1, "y": 1, "has_passenger": 0} passengers = [{ "x": grid_dim / 2, "y": grid_dim / 2, "dest_x": grid_dim - 2, "dest_y": 2, "in_taxi": 0 }] walls = [] mdp = { "hall": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=hall_goal_locs), "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True), "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)]), "four_room": FourRoomMDP(width=width, height=height, goal_locs=[four_room_goal_loc]), "chain": ChainMDP(num_states=grid_dim), "random": RandomMDP(num_states=50, num_rand_trans=2), "hanoi": HanoiMDP(num_pegs=grid_dim, num_discs=3), "taxi": TaxiOOMDP(width=grid_dim, height=grid_dim, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers) }[mdp_class] return mdp
def main():
    # Setup MDP.
    actual_args = {"width": 10,
                   "height": 10,
                   "init_loc": (1, 1),
                   "goal_locs": [(10, 10)],
                   "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
                   "gamma": 0.9,
                   "walls": [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
                             (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
                             (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
                             (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)],
                   "slip_prob": 0.01,
                   "lava_cost": 1.0,
                   "step_cost": 0.1}
    mdp = GridWorldMDP(**actual_args)

    # Initialize a custom Q-function for a Q-learning agent. This should be equivalent
    # to potential-based shaping, and should cause the agent to learn more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a normal Q-learning agent, another initialized with the custom_q above,
    # and a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4,
                                  custom_q_init=custom_q, name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent], mdp, instances=2,
                      episodes=60, steps=200, open_plot=True, verbose=True)
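# A hedged sketch of the potential-shaping view used above: initializing Q(s, a)
# with a potential Phi(s) for every action is equivalent to potential-based reward
# shaping for a tabular Q-learner (Wiewiora, 2003). The helper below is
# hypothetical (not part of simple_rl); it just generalizes the two hand-set
# entries in main() to an arbitrary potential function.
from collections import defaultdict

def q_init_from_potential(phi, states, actions):
    """Build a custom_q table where every (state, action) pair starts at phi(state)."""
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    for s in states:
        for a in actions:
            custom_q[s][a] = phi(s)
    return custom_q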
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=50, steps=10, open_plot=open_plot)
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
def make_mdp(mdp_class="grid", state_size=7): ''' Returns: (MDP) ''' # Grid/Hallway stuff. width, height = state_size, state_size hall_goal_locs = [(i, width) for i in range(1, height + 1)] # Taxi stuff. agent = {"x": 1, "y": 1, "has_passenger": 0} passengers = [{ "x": state_size / 2, "y": state_size / 2, "dest_x": state_size - 2, "dest_y": 2, "in_taxi": 0 }] walls = [] mdp = { "hall": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=hall_goal_locs), "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True), "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[(state_size, state_size)]), "four_room": FourRoomMDP(width=width, height=height, goal_locs=[(width, height)]), "chain": ChainMDP(num_states=state_size), "random": RandomMDP(num_states=50, num_rand_trans=2), "taxi": TaxiOOMDP(width=state_size, height=state_size, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers) }[mdp_class] return mdp
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    value_iter = ValueIteration(mdp, sample_rate=5)
    mcts = MCTS(mdp, num_rollouts_per_step=50)

    # _, val = value_iter.run_vi()  # Value Iteration.
    vi_action_seq, vi_state_seq = value_iter.plan(mdp.get_init_state())
    mcts_action_seq, mcts_state_seq = mcts.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(mcts_action_seq)):
        print("\t", mcts_action_seq[i], mcts_state_seq[i])
def main():
    mdp = GridWorldMDP(3, 5, (1, 1), [(3, 5)])
    vi = ValueIteration(mdp)
    # print("num states:", len(vi._compute_reachable_state_space()))
    visualize_mdp(mdp)
def make_mdp(mdp_class="grid", grid_dim=7): ''' Returns: (MDP) ''' # Grid/Hallway stuff. width, height = grid_dim, grid_dim upworld_goal_locs = [(i, width) for i in range(1, height+1)] four_room_goal_locs = [(width, height)] #, (width, 1), (1, height)] # (1, height - 2), (width - 2, height - 2), (width - 1, height - 1), (width - 2, 1)] four_room_goal_loc = four_room_goal_locs[0] # Taxi stuff. agent = {"x":1, "y":1, "has_passenger":0} passengers = [{"x":grid_dim / 2, "y":grid_dim / 2, "dest_x":grid_dim-2, "dest_y":2, "in_taxi":0}] walls = [] # Trench stuff tr_agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": grid_dim, "dest_y": grid_dim, "has_block": 0} blocks = [{"x": grid_dim, "y": 1}] lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (grid_dim + 1) / 2), range(grid_dim))] # Do grids separately to avoid making error-prone domains. if mdp_class == "four_room": mdp = FourRoomMDP(width=width, height=height, goal_locs=[four_room_goal_loc]) else: mdp = {"upworld":GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=upworld_goal_locs), "chain":ChainMDP(num_states=grid_dim), "random":RandomMDP(num_states=50, num_rand_trans=2), "hanoi":HanoiMDP(num_pegs=grid_dim, num_discs=3), "taxi":TaxiOOMDP(width=grid_dim, height=grid_dim, agent=agent, walls=walls, passengers=passengers), "trench":TrenchOOMDP(width=grid_dim, height=3, agent=tr_agent, blocks=blocks, lavas=lavas)}[mdp_class] return mdp
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    # QLearnerAgent is the older name for this class; newer simple_rl versions call it QLearningAgent.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=100, steps=150, open_plot=open_plot)
def choose_mdp(mdp_name, env_name="Asteroids-v0"): ''' Args: mdp_name (str): one of {gym, grid, chain, taxi, ...} gym_env_name (str): gym environment name, like 'CartPole-v0' Returns: (MDP) ''' # Other imports from simple_rl.tasks import ChainMDP, GridWorldMDP, FourRoomMDP, TaxiOOMDP, RandomMDP, PrisonersDilemmaMDP, RockPaperScissorsMDP, GridGameMDP # Taxi MDP. agent = {"x":1, "y":1, "has_passenger":0} passengers = [{"x":4, "y":3, "dest_x":2, "dest_y":2, "in_taxi":0}] walls = [] if mdp_name == "gym": # OpenAI Gym MDP. try: from simple_rl.tasks.gym.GymMDPClass import GymMDP except: raise ValueError("(simple_rl) Error: OpenAI gym not installed.") return GymMDP(env_name, render=True) else: return {"grid":GridWorldMDP(5, 5, (1, 1), goal_locs=[(5, 3), (4,1)]), "four_room":FourRoomMDP(), "chain":ChainMDP(5), "taxi":TaxiOOMDP(10, 10, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers), "random":RandomMDP(num_states=40, num_rand_trans=20), "prison":PrisonersDilemmaMDP(), "rps":RockPaperScissorsMDP(), "grid_game":GridGameMDP(), "multi":{0.5:RandomMDP(num_states=40, num_rand_trans=20), 0.5:RandomMDP(num_states=40, num_rand_trans=5)}}[mdp_name]
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], gamma=0.95, walls=[(2, 2)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1, steps=20, open_plot=open_plot)
def __init__(self, width=5, height=3, init_loc=(1, 1), rand_init=False, goal_locs=[(5, 3)],
             lava_locs=[()], walls=[], safe_locs=[], is_goal_terminal=True, gamma=0.99,
             slip_prob=0.0, step_cost=0.0, lava_cost=1, name="gridworld", gui=False):
    GridWorldMDP.__init__(self, width, height, init_loc, rand_init, goal_locs, lava_locs,
                          walls, is_goal_terminal, gamma, slip_prob, step_cost, lava_cost, name)
    self.jump_dist = 2
    self.actions = UnsafeGridWorldMDP.ACTIONS
    self.safe_states = set()
    for state in self.get_all_states():
        if (state.x, state.y) in safe_locs:
            self.safe_states.add(state)

    self.gui = gui
    if gui:
        self.screen = pygame.display.set_mode((SCREEN_HEIGHT, SCREEN_HEIGHT))
        self.agent_shape = None

        # Pygame setup.
        pygame.init()
        self.screen.fill((255, 255, 255))
        pygame.display.update()
        self.agent_shape = self._draw_state(self.init_state, draw_statics=True)
def main():
    mdp1 = GridWorldMDP(width=2, height=1, init_loc=(1, 1), goal_locs=[(2, 1)], slip_prob=0.5, gamma=0.5)
    vi = ValueIteration(mdp1)
    iters, value = vi.run_vi()
    print("value=", value)
def main():
    test_mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    lower_value_function = MonotoneLowerBound(test_mdp).lower_values
    upper_value_function = MonotoneUpperBound(test_mdp).upper_values
    bounded_rtdp = BoundedRTDP(test_mdp,
                               lower_values_init=lower_value_function,
                               upper_values_init=upper_value_function)
    test_policy = bounded_rtdp.plan()
    print('Derived policy:\n{}'.format(test_policy))
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)])

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=50, steps=25,
                      open_plot=open_plot, track_disc_reward=False)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)])

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=50, steps=25, open_plot=open_plot)

    # Reproduce the experiment.
    reproduce_from_exp_file(exp_name=str(mdp), open_plot=open_plot)
def main():
    # Simple test code.
    from simple_rl.tasks import GridWorldMDP

    mdp_distr = {}
    height, width = 8, 8
    prob_list = [0.0, 0.1, 0.2, 0.3, 0.4]

    for i in range(len(prob_list)):
        # list(...) is needed under Python 3, where zip returns an iterator that
        # random.sample cannot index.
        next_mdp = GridWorldMDP(width=width, height=height, init_loc=(1, 1),
                                goal_locs=r.sample(list(zip(range(1, width + 1), [height] * width)), 2),
                                is_goal_terminal=True)
        mdp_distr[next_mdp] = prob_list[i]

    m = MDPDistribution(mdp_distr)
    m.sample()
def generate_MDP(width, height, init_loc, goal_locs, lava_locs, gamma, walls, slip_prob):
    """ Creates an MDP object based on user input. """
    actual_args = {"width": width,
                   "height": height,
                   "init_loc": init_loc,
                   "goal_locs": goal_locs,
                   "lava_locs": lava_locs,
                   "gamma": gamma,
                   "walls": walls,
                   "slip_prob": slip_prob,
                   "lava_cost": 1.0,
                   "step_cost": 0.1}
    return GridWorldMDP(**actual_args)
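# A minimal usage sketch for generate_MDP above (the argument values here are
# illustrative, not taken from the original examples).
mdp = generate_MDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)],
                   lava_locs=[(5, 5)], gamma=0.9, walls=[(2, 2)], slip_prob=0.05)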
def make_mdp_distr(mdp_class="grid", num_mdps=15, gamma=0.99): ''' Args: mdp_class (str): one of {"grid", "random"} num_mdps (int) Returns: (MDPDistribution) ''' mdp_dist_dict = {} mdp_prob = 1.0 / num_mdps height, width = 10, 10 # Make @num_mdps MDPs. for i in xrange(num_mdps): next_goals = rnd.sample([(1, 7), (7, 1), (7, 7), (6, 6), (6, 1), (1, 6)], 2) new_mdp = { "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=rnd.sample( zip(range(1, width + 1), [height] * width), 1), is_goal_terminal=True, gamma=gamma), "four_room": FourRoomMDP(width=8, height=8, goal_locs=next_goals, gamma=gamma), "chain": ChainMDP(num_states=10, reset_val=rnd.choice([0, 0.01, 0.05, 0.1]), gamma=gamma), "random": RandomMDP(num_states=40, num_rand_trans=rnd.randint(1, 10), gamma=gamma) }[mdp_class] mdp_dist_dict[new_mdp] = mdp_prob return MDPDistribution(mdp_dist_dict)
def choose_mdp(mdp_name, atari_game="centipede"): ''' Args: mdp_name (str): one of {atari, grid, chain, taxi} atari_game (str): one of {centipede, breakout, etc.} Returns: (MDP) ''' # Grid World MDP. grid_mdp = GridWorldMDP(10, 10, (1, 1), (10, 10)) # Chain MDP. chain_mdp = ChainMDP(15) # Taxi MDP. agent = {"x": 1, "y": 1, "has_passenger": 0} passengers = [{"x": 5, "y": 5, "dest_x": 3, "dest_y": 3, "in_taxi": 0}] taxi_mdp = TaxiOOMDP(6, 6, agent_loc=agent, walls=[], passengers=passengers) if mdp_name == "atari": # Atari import is here in case users don't have the Arcade Learning Environment. try: from simple_rl.tasks.atari.AtariMDPClass import AtariMDP return AtariMDP(rom=atari_game, grayscale=True) except: print "ERROR: you don't have the Arcade Learning Environment installed." print "\tTry here: https://github.com/mgbellemare/Arcade-Learning-Environment." quit() else: return { "grid": grid_mdp, "chain": chain_mdp, "taxi": taxi_mdp }[mdp_name]
def planFromAtoB(self, Maps, nearestVertex, kStepConfig):
    # if not self.computedMDP:
    #     self.wallLocations = []
    #     for x in range(len(self.Maps.occupancyMap)):
    #         for y in range(len(self.Maps.occupancyMap[x])):
    #             if self.Maps.occupancyMap[x][y] == Env.WALL:
    #                 self.wallLocations.append(Loc.Location(x, y))
    #     self.computedMDP = True

    mdp = GridWorldMDP(width=len(Maps.occupancyMap),
                       height=len(Maps.occupancyMap[0]),
                       init_loc=(nearestVertex.x, nearestVertex.y),
                       goal_locs=[(kStepConfig.x, kStepConfig.y)],
                       gamma=0.95)

    vi = ValueIteration(mdp)
    vi.run_vi()
    action_seq, state_seq = vi.plan()

    # Check whether the planned path conflicts with a wall. GridWorldStates expose
    # x and y attributes; the original tuple-style indexing (occupancyMap[s[0], s[1]])
    # would fail on a nested list.
    for s in state_seq:
        if Maps.occupancyMap[s.x][s.y] == env.WALL:
            return False

    return True
def main():
    # Setup MDP, Agents.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc,
                       args.gamma, args.Walls, args.slip)
    # The original rebuilt a hard-coded GridWorldMDP here, silently discarding the
    # parsed arguments; it is kept commented out for reference.
    # mdp = GridWorldMDP(width=7, height=7, init_loc=(1, 1), goal_locs=[(7, 7)],
    #                    lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)

    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=args.epsilon, alpha=args.alpha,
                              gamma=args.gamma, explore=args.explore, anneal=args.anneal)

    # Choose viz type.
    viz = args.visualization
    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize its resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Watch the agent learn online.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
#!/usr/bin/env python
'''
NOTE: Incomplete. Planning infrastructure in development.
'''

# Other imports.
import srl_example_setup
from simple_rl.tasks import GridWorldMDP
from simple_rl.planning import ValueIteration, MCTS

# Setup MDP, Agents.
mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)])
vi = ValueIteration(mdp)
vi.run_vi()

action_seq, state_seq = vi.plan(mdp.get_init_state())

for i in range(len(action_seq)):
    print(action_seq[i], state_seq[i])
def make_mdp_distr(mdp_class, is_goal_terminal, mdp_size=11, horizon=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        is_goal_terminal (bool)
        mdp_size (int)
        horizon (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = mdp_size, mdp_size

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  # random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + \
                     [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2),
                           (width - 2, height - 2), (width - 2, 1)]

    # SPREAD vs. TIGHT.
    spread_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2),
                        (width - 2, height - 2), (width - 2, 1), (2, 2)]
    tight_goal_locs = [(width, height), (width - 1, height), (width, height - 1), (width, height - 2),
                       (width - 2, height), (width - 1, height - 1), (width - 2, height - 2)]

    changing_entities = {"four_room": four_room_goal_locs,
                         "grid": grid_goal_locs,
                         "corridor": corr_goal_locs,
                         "spread": spread_goal_locs,
                         "tight": tight_goal_locs,
                         "chain": [0.0, 0.01, 0.1, 0.5, 1.0],
                         "combo_lock": [[3, 1, 2], [3, 2, 1], [2, 3, 1], [3, 3, 1]],
                         "walls": make_wall_permutations(mdp_size),
                         "lava": make_lava_permutations(mdp_size)}

    # MDP Probability.
    num_mdps = 10 if mdp_class not in changing_entities.keys() else len(changing_entities[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"chain": ChainMDP(reset_val=changing_entities["chain"][i % len(changing_entities["chain"])]),
                   # "lava": GridWorldMDP(width=width, height=height, rand_init=False, step_cost=-0.001, lava_cost=0.0, lava_locs=changing_entities["lava"][i % len(changing_entities["lava"])], goal_locs=[(mdp_size - 3, mdp_size - 3)], is_goal_terminal=is_goal_terminal, name="lava_world", slip_prob=0.1),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[changing_entities["four_room"][i % len(changing_entities["four_room"])]], is_goal_terminal=is_goal_terminal),
                   # "octo": make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[changing_entities["corridor"][i % len(changing_entities["corridor"])]], is_goal_terminal=is_goal_terminal, name="corridor"),
                   "combo_lock": ComboLockMDP(combo=changing_entities["combo_lock"][i % len(changing_entities["combo_lock"])]),
                   "spread": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["spread"][i % len(changing_entities["spread"])]], is_goal_terminal=is_goal_terminal, name="spread_grid"),
                   "tight": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["tight"][i % len(changing_entities["tight"])]], is_goal_terminal=is_goal_terminal, name="tight_grid")}[mdp_class]
        new_mdp.set_gamma(gamma)
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
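# A minimal usage sketch for make_mdp_distr above (assumes itertools and the task
# classes it references are imported): build a distribution over four-room tasks,
# one MDP per goal location, and draw a single task from it.
mdp_distr = make_mdp_distr(mdp_class="four_room", is_goal_terminal=True, mdp_size=11, gamma=0.99)
sampled_mdp = mdp_distr.sample()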
def main(open_plot=True):
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)
    # The original snippet breaks off after "ql_agent ="; the completion below
    # follows the pattern of the other examples in this file.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=50,
                      steps=10, open_plot=open_plot)
def __init__(self,
             width=30,
             height=30,
             goal_locs=[(21, 21)],
             cell_types=["empty", "yellow", "red", "green", "purple"],
             cell_type_rewards=[0, 0, -10, -10, -10],
             cell_distribution="probability",
             # cell_type_probs: the default is chosen arbitrarily larger than the
             # percolation threshold for a square lattice, which is just an
             # approximation to match the cell distribution with that of the paper.
             cell_type_probs=[0.68, 0.17, 0.05, 0.05, 0.05],
             # (4, 4) as a tuple; the original [4, 4] broke the list-of-tuples format.
             cell_type_forced_locations=[np.inf, np.inf, [(1, 1), (5, 5)], [(2, 2)], [(4, 4)]],
             gamma=0.99,
             slip_prob=0.00,
             step_cost=0.0,
             goal_rewards=[1.0],
             is_goal_terminal=True,
             traj_init_cell_types=[0],
             goal_colors=["blue"],
             init_loc=(1, 1),
             rand_init=True,
             init_state=None,
             name="Navigation MDP"):
    """
    Note:
        1. Locations and state dimensions start from 1 instead of 0.
        2. 2D locations are interpreted in (x, y) format.

    Args:
        height (int): Height of the navigation grid in number of cells.
        width (int): Width of the navigation grid in number of cells.
        goal_locs (list of tuples: [(int, int)...]): Goal locations.
        cell_types (list of str): Non-goal cell types.
        cell_type_rewards (list of ints): Reward for each @cell_type.
        cell_distribution (str):
            "probability" - assigns cells according to @cell_type_probs over the state space.
            "manual" - uses @cell_type_forced_locations to assign cells to locations.
        cell_type_probs (list of floats): Only applicable when @cell_distribution is
            set to "probability". Specifies the probability corresponding to each
            @cell_type. Values must sum to 1; each value is the probability of
            occurrence of that cell type in the grid. Note: the actual probabilities
            will be slightly off because this doesn't factor in the number of goals.
        cell_type_forced_locations (list of lists of tuples
            [[(x1, y1), (x2, y2)], [(x3, y3), ...], np.inf, ...]): Only applicable
            when @cell_distribution is set to "manual". Used to specify additional
            cells and their locations. If an element is set to np.inf, locations of
            that cell type are sampled uniformly at random.
        goal_colors (list of str/int): Color of the goal corresponding to each
            @goal_locs entry. If the colors differ, each goal is represented with a
            unique feature; otherwise all goals map to the same feature.
        traj_init_cell_types (list of ints): Specifies which cell types are
            navigable. Used for sampling empty/drivable states while generating
            trajectories.
        Not used, but properties of the superclass GridWorldMDP:
            init_loc (tuple: (int, int)): (x, y) initial location.
            rand_init (bool): Whether to use a random initial location.
            init_state (GridWorldState): Initial GridWorldState.
    """
    assert height > 0 and isinstance(height, int) and width > 0 and isinstance(width, int), \
        "height and width must be integers and > 0"
    assert len(goal_colors) == len(goal_locs) == len(goal_rewards)
    assert len(cell_types) == len(cell_type_rewards)
    assert cell_distribution == "manual" or len(cell_types) == len(cell_type_probs)
    assert cell_distribution == "probability" or len(cell_types) == len(cell_type_forced_locations)

    self.value_iter = None
    self._policy_invalidated = True
    self.cell_types = cell_types

    GridWorldMDP.__init__(self,
                          width=width,
                          height=height,
                          init_loc=init_loc,
                          rand_init=rand_init,
                          goal_locs=goal_locs,
                          lava_locs=[()],
                          walls=[],  # no walls in this MDP
                          is_goal_terminal=is_goal_terminal,
                          gamma=gamma,
                          init_state=init_state,
                          slip_prob=slip_prob,
                          step_cost=step_cost,
                          name=name)

    # Cell types.
    self.cells = self.__generate_cell_type_grid(height, width, cell_distribution,
                                                cell_type_probs, cell_type_forced_locations)
    # Preserve a copy without goals.
    self.cells_wo_goals = self.cells.copy()

    # Cell rewards.
    self.cell_type_rewards = cell_type_rewards
    self.cell_rewards = np.asarray(
        [[self.cell_type_rewards[item] for item in row] for row in self.cells]
    ).reshape(height, width)
    # Preserve a copy without goals.
    self.cell_rewards_wo_goals = self.cell_rewards.copy()

    # Update cells and cell_rewards with goals and their rewards.
    self.reset_goals(goal_locs, goal_rewards, goal_colors)

    # Find the set of empty/navigable cells for sampling trajectory init states.
    self.set_traj_init_cell_types(cell_types=traj_init_cell_types)

    # Additional book-keeping.
    self.feature_cell_dist = None
    self.feature_cell_dist_normalized = None
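# A hedged usage sketch for the constructor above (the NavigationMDP class name is
# assumed from simple_rl's navigation task; values are illustrative, and np is the
# numpy module the constructor already relies on): pin "red" cells to two manual
# locations and sample the remaining "empty" cells uniformly at random.
nav_mdp = NavigationMDP(width=10,
                        height=10,
                        goal_locs=[(10, 10)],
                        cell_types=["empty", "red"],
                        cell_type_rewards=[0, -10],
                        cell_distribution="manual",
                        # np.inf => sample "empty" locations uniformly at random;
                        # "red" cells are forced onto the two listed locations.
                        cell_type_forced_locations=[np.inf, [(3, 3), (7, 7)]],
                        goal_rewards=[1.0],
                        goal_colors=["blue"])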