def get_all_states(self):
    """
    Enumerate every cell of the grid as a GridWorldState.

    Coordinates are 1-indexed: x covers 1..self.width and y covers
    1..self.height. Each state's terminal flag is taken from
    self._terminal_function before the state is collected.

    Returns:
        (set of GridWorldState)
    """
    def _build(col, row):
        # The terminal flag is set before the state is hashed into the set.
        cell = GridWorldState(col, row)
        cell.set_terminal(self._terminal_function(cell))
        return cell

    return {
        _build(col, row)
        for col in range(1, self.width + 1)
        for row in range(1, self.height + 1)
    }
def main():
    """
    Compare a plain Q-learner, a Q-learner with a hand-initialised Q table,
    and a random agent on a 10x10 grid world with walls and lava.
    """
    # Setup MDP.
    # Four vertical walls (x = 2, 4, 6, 8), each spanning y = 2..9, and a
    # lava cell at the top of every odd column from 1 to 9.
    wall_columns = (2, 4, 6, 8)
    mdp = GridWorldMDP(
        width=10,
        height=10,
        init_loc=(1, 1),
        goal_locs=[(10, 10)],
        lava_locs=[(x, 10) for x in range(1, 10, 2)],
        gamma=0.9,
        walls=[(x, y) for x in wall_columns for y in range(2, 10)],
        slip_prob=0.01,
        lava_cost=1.0,
        step_cost=0.1,
    )

    # Initialize the custom Q function for a q-learning agent. This should be
    # equivalent to potential shaping and should cause the Q agent to learn
    # more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    for hint_x in (5, 2):
        custom_q[GridWorldState(hint_x, 1)]['right'] = 1.0

    # Make a normal q-learning agent and another initialized with the
    # custom_q above. Finally, make a random agent to compare against.
    q_params = {"epsilon": 0.2, "alpha": 0.4}
    ql_agent = QLearningAgent(actions=mdp.get_actions(), **q_params)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(),
                                  custom_q_init=custom_q,
                                  name="PotQ",
                                  **q_params)
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    contestants = [ql_agent, ql_agent_pot, rand_agent]
    run_agents_on_mdp(contestants,
                      mdp,
                      instances=2,
                      episodes=60,
                      steps=200,
                      open_plot=True,
                      verbose=True)
def location_invariance_equivalency(self, state1, action1, state_prime1, state2, action2):
    """
    Transfer an observed displacement from one state to another.

    If both actions are equal, the (x, y) offset observed for
    state1 -> state_prime1 is applied to state2, and the resulting state is
    returned with its terminal flag set from self._terminal_function.

    Args:
        state1 (GridWorldState): origin of the observed transition
        action1: action taken in state1
        state_prime1 (GridWorldState): state reached from state1
        state2 (GridWorldState): state the displacement is applied to
        action2: action considered in state2

    Returns:
        (GridWorldState or None): None when the two actions differ.
    """
    if not (action1 == action2):
        return None

    offset_x = state_prime1.x - state1.x
    offset_y = state_prime1.y - state1.y

    shifted = GridWorldState(state2.x + offset_x, state2.y + offset_y)
    shifted.set_terminal(self._terminal_function(shifted))
    return shifted
def states(self):
    """
    Return a list of the states of the environment.

    Cells are visited column by column (x from 1 to self.width, then y from
    1 to self.height). A cell listed in self.goal_locs is marked terminal
    when self.is_goal_terminal is set.

    :return: list of states
    """
    def _cell(col, row):
        state = GridWorldState(col, row)
        if self.is_goal_terminal and (col, row) in self.goal_locs:
            state.set_terminal(True)
        return state

    return [
        _cell(col, row)
        for col in range(1, self.width + 1)
        for row in range(1, self.height + 1)
    ]
def __init__(self,
             width=5,
             height=3,
             init_loc=(1, 1),
             rand_init=False,
             goal_locs=[()],
             lava_locs=[()],
             walls=[],
             is_goal_terminal=True,
             is_lava_terminal=False,
             gamma=0.99,
             slip_prob=0.0,
             step_cost=0.0,
             lava_cost=1.0,
             name="gridworld"):
    '''
    Args:
        width (int)
        height (int)
        init_loc (tuple: (int, int)): Start cell. Replaced by a random draw
            when @rand_init is True.
        rand_init (bool): If True, the start cell is drawn uniformly from
            x in 1..width, y in 1..height and re-drawn while it is in @walls.
        goal_locs (list of tuples: [(int, int)...])
        lava_locs (list of tuples: [(int, int)...])
        walls (list)
        is_goal_terminal (bool)
        is_lava_terminal (bool)
        gamma (float): Forwarded to MDP.__init__.
        slip_prob (float)
        step_cost (float)
        lava_cost (float): Stored as self.lava_cost (presumably the penalty
            applied by _reward_func for lava cells -- confirm there).
        name (str)

    Raises:
        ValueError: If @goal_locs is not a list.

    Notes:
        @goal_locs, @lava_locs and @walls are stored by reference, not
        copied. The default list objects are therefore shared between
        instances and must not be mutated in place.
    '''
    # Validate before drawing random numbers or building any state, so a bad
    # argument fails without side effects.
    if not isinstance(goal_locs, list):
        raise ValueError("(simple_rl) GridWorld Error: argument @goal_locs needs to be a list of locations. For example: [(3,3), (4,3)].")

    # Setup init location.
    self.rand_init = rand_init
    if rand_init:
        init_loc = random.randint(1, width), random.randint(1, height)
        while init_loc in walls:
            init_loc = random.randint(1, width), random.randint(1, height)
    self.init_loc = init_loc
    init_state = GridWorldState(init_loc[0], init_loc[1])

    MDP.__init__(self,
                 GridWorldMDP.ACTIONS,
                 self._transition_func,
                 self._reward_func,
                 init_state=init_state,
                 gamma=gamma)

    self.step_cost = step_cost
    self.lava_cost = lava_cost
    self.walls = walls
    self.width = width
    self.height = height
    self.goal_locs = goal_locs
    # A separate object from init_state, so stepping the current state can
    # never alias the stored initial state.
    self.cur_state = GridWorldState(init_loc[0], init_loc[1])
    self.is_goal_terminal = is_goal_terminal
    self.is_lava_terminal = is_lava_terminal
    self.slip_prob = slip_prob
    self.name = name
    self.lava_locs = lava_locs
def example():
    """
    Run approximate value iteration on a 5x5 GridWorld and print the result.

    With fancy_plot enabled the values are printed as a grid, top row
    (largest y) first, each value rounded to two decimals and right-aligned
    in nine columns. Otherwise one line per state is printed.
    """
    size = 5
    gamma = .9
    epsilon = .1
    delta = .05
    fancy_plot = True

    # Create environment
    env = GridWorld(width=size,
                    height=size,
                    init_loc=(1, 1),
                    goal_locs=[(size, size)],
                    gamma=gamma,
                    slip_prob=.1,
                    goal_reward=1.0,
                    is_goal_terminal=True)

    # Run approximate value iteration
    value_function = approximate_value_iteration(env, gamma, epsilon, delta)

    # Print computed value function
    print('Computed value function:')
    if not fancy_plot:
        for s in value_function:
            print('Value of', str(s), ':', value_function[s])
        return

    for row in range(size, 0, -1):
        for col in range(1, size + 1):
            rounded = round(value_function[GridWorldState(col, row)], 2)
            print(format(rounded, '>9'), end=' ')
        print()
def __init__(self,
             width=5,
             height=3,
             init_loc=(1, 1),
             goal_locs=[(5, 3)],
             walls=[],
             is_goal_terminal=True,
             gamma=0.99,
             init_state=None):
    '''
    Args:
        height (int)
        width (int)
        init_loc (tuple: (int, int))
        goal_locs (list of tuples: [(int, int)...])
        walls (list)
        is_goal_terminal (bool)
        gamma (float): Forwarded to MDP.__init__.
        init_state (GridWorldState): Optional explicit initial state; built
            from @init_loc when None.

    Summary:
        Prints an error and calls quit() if @goal_locs is not a list, or if
        any goal lies outside 1..width x 1..height or on a wall.
    '''
    if init_state is None:
        init_state = GridWorldState(init_loc[0], init_loc[1])
    MDP.__init__(self,
                 GridWorldMDP.ACTIONS,
                 self._transition_func,
                 self._reward_func,
                 init_state=init_state,
                 gamma=gamma)

    # Each print call takes one pre-built string so the output is identical
    # under Python 2 and Python 3.
    if not isinstance(goal_locs, list):
        print("Error: argument @goal_locs needs to be a list of locations. For example: [(3,3), (4,3)].")
        quit()

    # is_wall() is consulted during goal validation, so walls go first.
    self.walls = walls

    for g in goal_locs:
        # Cells are 1-indexed, so a coordinate below 1 is just as far off the
        # map as one beyond width/height.
        off_map = g[0] < 1 or g[0] > width or g[1] < 1 or g[1] > height
        if off_map:
            print("Error: goal provided is off the map or overlaps with a wall..")
            print("\tGridWorld dimensions: (" + str(width) + "," + str(height) + ")")
            print("\tProblematic Goal: " + str(g))
            quit()
        if self.is_wall(g[0], g[1]):
            print("Error: goal provided is off the map or overlaps with a wall..")
            print("\tWalls: " + str(walls))
            print("\tProblematic Goal: " + str(g))
            quit()

    self.width = width
    self.height = height
    self.init_loc = init_loc
    self.goal_locs = goal_locs
    self.cur_state = GridWorldState(init_loc[0], init_loc[1])
    self.is_goal_terminal = is_goal_terminal
def get_random_init_state(self):
    """
    Returns a random empty/white cell.

    Empty cells are those where self.cells equals 0. One of them is picked
    uniformly at random and its (row, col) index is converted to (x, y) via
    self._rowcol_to_xy.

    Returns:
        (GridWorldState)
    """
    empty_rows, empty_cols = np.where(self.cells == 0)
    pick = np.random.randint(len(empty_rows))
    x, y = self._rowcol_to_xy(empty_rows[pick], empty_cols[pick])
    return GridWorldState(x, y)
def compute_cell_values(nvmdp, SFT_full, heap_size, nA, w_r, tf_graph):
    """
    Build a (height, width) float32 map with one value per grid cell.

    For every (row, col) the matching state is looked up in SFT_full and
    passed to RHC_value; element [0][0] of its result is stored in the map.

    Args:
        nvmdp: provides height, width and _rowcol_to_xy
        SFT_full (dict): keyed by GridWorldState
        heap_size, nA, w_r, tf_graph: forwarded unchanged to RHC_value

    Returns:
        (np.ndarray) of shape (nvmdp.height, nvmdp.width), dtype float32
    """
    n_rows = nvmdp.height
    n_cols = nvmdp.width
    v_map = np.zeros((n_rows, n_cols), dtype=np.float32)

    for r in range(n_rows):
        for c in range(n_cols):
            x, y = nvmdp._rowcol_to_xy(r, c)
            cell_input = SFT_full[GridWorldState(x, y)]
            result = RHC_value(cell_input, heap_size, nA, w_r, tf_graph)
            v_map[r, c] = result[0][0]

    return v_map
def _transition_func(self, state, action):
    '''
    Args:
        state (simple_rl.State)
        action (str)

    Returns:
        state (simple_rl.State)

    Summary:
        Moves by self.delta plus Gaussian noise (a standard normal draw
        divided by 100). "up"/"right" results are capped at 1 and
        "down"/"left" results are floored at 0; any other action leaves the
        position unchanged. Terminal states are returned as-is.
    '''
    if state.is_terminal():
        return state

    jitter = np.random.randn(1)[0] / 100.0
    step = self.delta + jitter

    new_x, new_y = state.x, state.y
    if action == "up":
        new_y = min(state.y + step, 1)
    elif action == "down":
        new_y = max(state.y - step, 0)
    elif action == "right":
        new_x = min(state.x + step, 1)
    elif action == "left":
        new_x = max(state.x - step, 0)

    next_state = GridWorldState(new_x, new_y)

    # Terminality is decided from the pre-move (state, action) pair.
    if self._is_goal_state_action(state, action) and self.is_goal_terminal:
        next_state.set_terminal(True)

    return next_state
def compute_full_SFT(nvmdp, nA, phi, h):
    """
    Compute one table entry per grid cell.

    Every (row, col) of the grid is converted to a GridWorldState, and the
    first element of get_FLH's result for that state is stored under it.

    Args:
        nvmdp: provides height, width, _rowcol_to_xy, transition_func, actions
        nA, phi, h: forwarded unchanged to get_FLH

    Returns:
        (dict) mapping GridWorldState -> get_FLH(...)[0]
    """
    table = {}
    cells = ((r, c) for r in range(nvmdp.height) for c in range(nvmdp.width))
    for r, c in cells:
        x, y = nvmdp._rowcol_to_xy(r, c)
        cell_state = GridWorldState(x, y)
        flh = get_FLH(cell_state, nA, phi, nvmdp.transition_func, nvmdp.actions, h)
        table[cell_state] = flh[0]
    return table
def sample_empty_state(self, idx=None):
    """
    Returns a random empty/white state of type GridWorldState().

    Args:
        idx (int or None): position in self.traj_init_cell_row_idxs /
            self.traj_init_cell_col_idxs to use. When None, a position is
            drawn uniformly at random; otherwise it is asserted to be in
            range.

    Returns:
        (GridWorldState)
    """
    if idx is not None:
        assert 0 <= idx < len(self.traj_init_cell_row_idxs)
        chosen = idx
    else:
        chosen = np.random.randint(len(self.traj_init_cell_row_idxs))

    row = self.traj_init_cell_row_idxs[chosen]
    col = self.traj_init_cell_col_idxs[chosen]
    x, y = self._rowcol_to_xy(row, col)
    return GridWorldState(x, y)
def _transition_func(self, state, action):
    '''
    Args:
        state (simple_rl)
        action (str)

    Returns:
        (list of State, list of float): three candidate next states and
        their probabilities [slip_prob / 2, 1 - slip_prob, slip_prob / 2].
        For a terminal state: ([state], [1]).

    Summary:
        Every action has an intended displacement (middle entry) and two
        variants shifted by -1/+1 on the perpendicular axis. "jump" actions
        move two cells instead of one. Targets are clipped to the grid; a
        target that is a wall leaves the agent where it is.
    '''
    if state.is_terminal():
        return [state], [1]

    spread = [-1, 0, 1]
    # action -> (x offsets, y offsets), one entry per outcome
    offsets = {
        "up": (spread, [1, 1, 1]),
        "down": (spread, [-1, -1, -1]),
        "right": ([1, 1, 1], spread),
        "left": ([-1, -1, -1], spread),
        "jump up": (spread, [2, 2, 2]),
        "jump down": (spread, [-2, -2, -2]),
        "jump right": ([2, 2, 2], spread),
        "jump left": ([-2, -2, -2], spread),
    }
    # Unknown actions do not move.
    dx, dy = offsets.get(action, ([0, 0, 0], [0, 0, 0]))

    next_states = []
    for step_x, step_y in zip(dx, dy):
        target_x = np.clip(state.x + step_x, 1, self.width)
        target_y = np.clip(state.y + step_y, 1, self.height)

        blocked = self.is_wall(target_x, target_y)
        candidate = GridWorldState(state.x, state.y) if blocked else GridWorldState(target_x, target_y)
        candidate.set_terminal(self._terminal_function(candidate))
        next_states.append(candidate)

    half_slip = self.slip_prob / 2.
    p = [half_slip, 1 - self.slip_prob, half_slip]

    assert len(next_states) == len(p)
    return next_states, p
def _transition_func(self, state, action):
    '''
    Args:
        state (State)
        action (str)

    Returns
        (State)

    Summary:
        Delegates the movement to GridWorldMDP._transition_func using only
        the (x, y) part of @state, then attaches a colour drawn uniformly
        from 1..self.num_colors to the resulting position.
    '''
    position_only = GridWorldState(state.x, state.y)
    moved = GridWorldMDP._transition_func(self, position_only, action)

    # Add random color.
    color = random.randint(1, self.num_colors)
    return ColorState(moved.x, moved.y, color)
def _transition_func(self, state, action):
    '''
    Args:
        state (State)
        action (str)

    Returns
        (State)

    Summary:
        Moves 0.01 along the axis named by @action ("up"/"down" change y,
        "right"/"left" change x); any other action keeps the position. The
        result is marked terminal when its exact (x, y) is in self.goal_locs
        and self.is_goal_terminal is set.
    '''
    new_x, new_y = state.x, state.y
    if action == "up":
        new_y = state.y + .01
    elif action == "down":
        new_y = state.y - .01
    elif action == "right":
        new_x = state.x + .01
    elif action == "left":
        new_x = state.x - .01

    next_state = GridWorldState(new_x, new_y)

    reached_goal = (next_state.x, next_state.y) in self.goal_locs
    if reached_goal and self.is_goal_terminal:
        next_state.set_terminal(True)

    return next_state
def transition(self, s, a):
    """
    Joint transition method.

    With probability self.slip_prob the action is replaced by a
    perpendicular one. The agent then moves one cell unless the move would
    leave the grid or enter a wall. Rewards:
      - goal cell: goal_rewards[i] - step_cost, where i is the index of the
        first matching entry in goal_locs
      - lava cell: -lava_cost
      - otherwise: -step_cost
    A terminal state yields (0., s) unchanged.

    :param s: (GridWorldState) state
    :param a: (str) action
    :return: reward and resulting state (r, s_p)
    """
    if s.is_terminal():
        return 0., s

    if self.slip_prob > random.random():
        # Flip direction.
        # NOTE(review): when slip_unidirectional is truthy the slip picks
        # randomly between BOTH perpendicular directions, and when falsy it
        # always uses one fixed direction. That looks inverted relative to
        # the flag's name -- confirm the intended meaning.
        slip_table = {
            "up": (["left", "right"], "right"),
            "down": (["left", "right"], "left"),
            "left": (["up", "down"], "up"),
            "right": (["up", "down"], "down"),
        }
        if a in slip_table:
            either_side, fixed_side = slip_table[a]
            a = random.choice(either_side) if self.slip_unidirectional else fixed_side

    new_x, new_y = s.x, s.y
    if a == "up" and s.y < self.height and not self.is_wall(s.x, s.y + 1):
        new_y = s.y + 1
    elif a == "down" and s.y > 1 and not self.is_wall(s.x, s.y - 1):
        new_y = s.y - 1
    elif a == "right" and s.x < self.width and not self.is_wall(s.x + 1, s.y):
        new_x = s.x + 1
    elif a == "left" and s.x > 1 and not self.is_wall(s.x - 1, s.y):
        new_x = s.x - 1
    s_p = GridWorldState(new_x, new_y)

    landing = (s_p.x, s_p.y)
    on_goal = landing in self.goal_locs
    if on_goal and self.is_goal_terminal:
        s_p.set_terminal(True)

    if on_goal:
        # Reward of the first goal entry equal to the landing cell.
        r = -self.step_cost
        r += self.goal_rewards[self.goal_locs.index(landing)]
    elif landing in self.lava_locs:
        r = 0. - self.lava_cost
    else:
        r = 0. - self.step_cost

    return r, s_p
def _transition_func(self, state, action):
    '''
    Args:
        state (simple_rl.State)
        action (str)

    Returns:
        state (simple_rl.State)

    Summary:
        The step length is self.delta plus a standard normal draw divided by
        100. Moving "up" or "right" is capped at coordinate 1, moving "down"
        or "left" is floored at coordinate 0, and unrecognised actions keep
        the position. A terminal input state is returned untouched.
    '''
    if state.is_terminal():
        return state

    noise = np.random.randn(1)[0] / 100.0
    distance = self.delta + noise

    pos_x, pos_y = state.x, state.y
    if action == "up":
        pos_y = min(state.y + distance, 1)
    elif action == "down":
        pos_y = max(state.y - distance, 0)
    elif action == "right":
        pos_x = min(state.x + distance, 1)
    elif action == "left":
        pos_x = max(state.x - distance, 0)

    successor = GridWorldState(pos_x, pos_y)

    # The goal test uses the state/action the agent started from.
    if self._is_goal_state_action(state, action) and self.is_goal_terminal:
        successor.set_terminal(True)

    return successor
def __init__(self, x, y, q):
    """
    Build a grid-world state that carries an extra value q.

    Args:
        x: forwarded to GridWorldState.__init__
        y: forwarded to GridWorldState.__init__
        q: stored as self.q and appended to self.data
    """
    # The base initialiser runs first; self.data is presumably created
    # there (not visible from this block -- confirm in GridWorldState).
    GridWorldState.__init__(self,x,y)
    self.q = q
    self.data.append(q)
def get_init_state(self):
    """
    Draw a random initial state on a coarse lattice.

    x and y are each chosen independently and uniformly from
    {0.0, 0.2, 0.4, 0.6, 0.8, 1}.

    Returns:
        (GridWorldState)
    """
    lattice = [0.0, 0.2, 0.4, 0.6, 0.8, 1]
    start_x = random.choice(lattice)
    start_y = random.choice(lattice)
    return GridWorldState(start_x, start_y)
def parse_custom_q_table(q_dict, default_q):
    """
    Convert a string-keyed Q table into one keyed by GridWorldState.

    Args:
        q_dict (dict): maps a string that literal-evaluates to the
            GridWorldState constructor arguments (e.g. "(1, 2)") to a dict
            of action -> value.
        default_q: value returned for any (state, action) not in q_dict.

    Returns:
        (defaultdict of defaultdict): custom_q[state][action] -> value
    """
    def _missing_value():
        return default_q

    def _missing_state():
        return defaultdict(_missing_value)

    custom_q = defaultdict(_missing_state)

    entries = (
        (state_repr, action, value)
        for state_repr, action_values in q_dict.items()
        for action, value in action_values.items()
    )
    for state_repr, action, value in entries:
        key = GridWorldState(*ast.literal_eval(state_repr))
        custom_q[key][action] = value

    return custom_q
def _transition_func(self, state, action):
    '''
    Args:
        state (State)
        action (str)

    Returns
        (State)

    Summary:
        Unless (state, action) is a goal state-action, the action is
        replaced by a random perpendicular one with probability
        self.slip_prob. The agent then moves one cell, or stays put if the
        move would leave the grid or enter a wall. The next state is
        terminal when it lands on a goal and self.is_goal_terminal is set,
        or on lava and self.is_lava_terminal is set.

    Notes:
        An earlier version marked every lava cell terminal unconditionally,
        which made self.is_lava_terminal have no effect. Lava is now
        terminal only when that flag is True.
    '''
    if state.is_terminal():
        return state

    # The random draw is short-circuited for goal state-actions, so those
    # never slip.
    if not self._is_goal_state_action(state, action) and self.slip_prob > random.random():
        # Flip dir.
        if action in ("up", "down"):
            action = random.choice(["left", "right"])
        elif action in ("left", "right"):
            action = random.choice(["up", "down"])

    next_x, next_y = state.x, state.y
    if action == "up" and state.y < self.height and not self.is_wall(state.x, state.y + 1):
        next_y = state.y + 1
    elif action == "down" and state.y > 1 and not self.is_wall(state.x, state.y - 1):
        next_y = state.y - 1
    elif action == "right" and state.x < self.width and not self.is_wall(state.x + 1, state.y):
        next_x = state.x + 1
    elif action == "left" and state.x > 1 and not self.is_wall(state.x - 1, state.y):
        next_x = state.x - 1
    next_state = GridWorldState(next_x, next_y)

    landing = (next_state.x, next_state.y)
    landed_in_term_goal = landing in self.goal_locs and self.is_goal_terminal
    landed_in_term_lava = landing in self.lava_locs and self.is_lava_terminal
    if landed_in_term_goal or landed_in_term_lava:
        next_state.set_terminal(True)

    return next_state
def _transition_func(self, state, action):
    '''
    Args:
        state (State)
        action (str)

    Returns
        (State)

    Summary:
        Deterministic one-cell move. The agent stays in place when the move
        would leave the 1..width x 1..height grid, when the target cell is a
        wall, or when the action is not one of up/down/left/right. Landing
        on a goal marks the state terminal if self.is_goal_terminal is set.
    '''
    if state.is_terminal():
        return state

    col, row = state.x, state.y
    if action == "up" and state.y < self.height and not self.is_wall(state.x, state.y + 1):
        row = state.y + 1
    elif action == "down" and state.y > 1 and not self.is_wall(state.x, state.y - 1):
        row = state.y - 1
    elif action == "right" and state.x < self.width and not self.is_wall(state.x + 1, state.y):
        col = state.x + 1
    elif action == "left" and state.x > 1 and not self.is_wall(state.x - 1, state.y):
        col = state.x - 1

    next_state = GridWorldState(col, row)

    on_goal = (next_state.x, next_state.y) in self.goal_locs
    if on_goal and self.is_goal_terminal:
        next_state.set_terminal(True)

    return next_state
def reset(self):
    """
    Put the environment back at its starting state.

    With self.rand_init set, a fresh start cell is drawn uniformly from
    x in 1..self.width, y in 1..self.height and re-drawn while it is listed
    in self.walls, mirroring how the constructor picks a random start.
    Otherwise self.cur_state becomes a deep copy of self.init_state, so
    later changes to the current state cannot alter the stored initial one.
    """
    if self.rand_init:
        # The grid bounds live on the instance; the bare names num_cols /
        # num_rows used here before are not defined in this scope.
        init_loc = random.randint(1, self.width), random.randint(1, self.height)
        while init_loc in self.walls:
            init_loc = random.randint(1, self.width), random.randint(1, self.height)
        self.cur_state = GridWorldState(init_loc[0], init_loc[1])
    else:
        self.cur_state = copy.deepcopy(self.init_state)
def transition(self, s, a):
    """
    Joint transition method.

    With probability self.slip_prob the action is replaced by a random
    perpendicular one. The agent then moves one cell unless the move would
    leave the grid or enter a wall. Rewards:
      - goal cell: goal_reward - step_cost
      - lava cell: -lava_cost
      - otherwise: heat_reward - step_cost, where heat_reward is 0 unless
        reward_span > 0, in which case it is the sum over all goals g of
        goal_reward * exp(-squared_distance(s_p, g) / (2 * reward_span**2))
    A terminal state yields (0., s) unchanged.

    :param s: (GridWorldState) state
    :param a: (str) action
    :return: reward and resulting state (r, s_p)
    """
    if s.is_terminal():
        return 0., s

    if self.slip_prob > random.random():
        # Flip direction
        if a in ("up", "down"):
            a = random.choice(["left", "right"])
        elif a in ("left", "right"):
            a = random.choice(["up", "down"])

    new_x, new_y = s.x, s.y
    if a == "up" and s.y < self.height and not self.is_wall(s.x, s.y + 1):
        new_y = s.y + 1
    elif a == "down" and s.y > 1 and not self.is_wall(s.x, s.y - 1):
        new_y = s.y - 1
    elif a == "right" and s.x < self.width and not self.is_wall(s.x + 1, s.y):
        new_x = s.x + 1
    elif a == "left" and s.x > 1 and not self.is_wall(s.x - 1, s.y):
        new_x = s.x - 1
    s_p = GridWorldState(new_x, new_y)

    landing = (s_p.x, s_p.y)
    on_goal = landing in self.goal_locs
    if on_goal and self.is_goal_terminal:
        s_p.set_terminal(True)

    if on_goal:
        return self.goal_reward - self.step_cost, s_p
    if landing in self.lava_locs:
        return -self.lava_cost, s_p

    # Ordinary cell: optional Gaussian "heat" centred on every goal.
    heat_reward = 0.
    if self.reward_span > 0.:
        two_sigma_sq = 2. * self.reward_span**2
        for goal in self.goal_locs:
            dist_sq = (s_p.x - goal[0])**2 + (s_p.y - goal[1])**2
            heat_reward += self.goal_reward * np.exp(-dist_sq / two_sigma_sq)
    return heat_reward - self.step_cost, s_p