def get_all_states(self):
    """Returns all states.
    """
    states = []
    for x in range(1, self.width + 1):
        for y in range(1, self.height + 1):
            state = NavigationWorldState(x, y)
            if self.is_goal(x, y) and self.is_goal_terminal:
                state.set_terminal(True)
            states.append(state)
    return states
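# Usage sketch (not part of the original source), assuming a constructed
# NavigationWorldMDP instance named `mdp`: the method enumerates every grid
# cell, so the state count equals width * height.
#
#     states = mdp.get_all_states()
#     assert len(states) == mdp.width * mdp.height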
def get_value_grid(self):
    """Returns the value function over the state-space grid.
    """
    value_iter = self.run_value_iteration()
    V = np.zeros((self.height, self.width), dtype=np.float32)
    for row in range(self.height):
        for col in range(self.width):
            x, y = self._rowcol_to_xy(row, col)
            V[row, col] = value_iter.value_func[NavigationWorldState(x, y)]
    return V
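# Usage sketch (not part of the original source): the returned array has
# shape (height, width) and can be visualized directly, e.g. with
# matplotlib's imshow; row/column orientation follows _rowcol_to_xy.
# `mdp` is assumed to be a constructed NavigationWorldMDP instance.
#
#     import matplotlib.pyplot as plt
#     V = mdp.get_value_grid()
#     plt.imshow(V)
#     plt.colorbar()
#     plt.show()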
def sample_traj_init_state(self, idx=None):
    """Samples a trajectory init state (NavigationWorldState).

    The cell types to sample init states from can be specified using
    set_traj_init_cell_types().
    """
    if idx is None:
        rand_idx = np.random.randint(len(self.traj_init_cell_row_idxs))
    else:
        assert 0 <= idx < len(self.traj_init_cell_row_idxs)
        rand_idx = idx

    x, y = self._rowcol_to_xy(self.traj_init_cell_row_idxs[rand_idx],
                              self.traj_init_cell_col_idxs[rand_idx])
    return NavigationWorldState(x, y)
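# Usage sketch (not part of the original source): register which cell types
# init states may be drawn from, then sample one. "white" is one of the
# default navigation cell types; `mdp` is an assumed instance.
#
#     mdp.set_traj_init_cell_types(["white"])
#     s0 = mdp.sample_traj_init_state()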
def _transition_func(self, state, action):
    """
    Args:
        state (State)
        action (str)

    Returns:
        (State)
    """
    if state.is_terminal():
        return state

    r = random.random()
    if self.slip_prob > r:
        # Slip: move perpendicular to the intended direction.
        if action == "up":
            action = random.choice(["left", "right"])
        elif action == "down":
            action = random.choice(["left", "right"])
        elif action == "left":
            action = random.choice(["up", "down"])
        elif action == "right":
            action = random.choice(["up", "down"])

    if action == "up" and state.y < self.height \
            and not self.is_wall(state.x, state.y + 1):
        next_state = NavigationWorldState(state.x, state.y + 1)
    elif action == "down" and state.y > 1 \
            and not self.is_wall(state.x, state.y - 1):
        next_state = NavigationWorldState(state.x, state.y - 1)
    elif action == "right" and state.x < self.width \
            and not self.is_wall(state.x + 1, state.y):
        next_state = NavigationWorldState(state.x + 1, state.y)
    elif action == "left" and state.x > 1 \
            and not self.is_wall(state.x - 1, state.y):
        next_state = NavigationWorldState(state.x - 1, state.y)
    else:
        # Blocked or out-of-bounds move: stay in place.
        next_state = NavigationWorldState(state.x, state.y)

    if self.is_goal(next_state.x, next_state.y) and self.is_goal_terminal:
        next_state.set_terminal(True)

    return next_state
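# Note on slip dynamics (illustrative sketch, not part of the original
# source): with slip_prob = p, an intended "up" move succeeds with
# probability 1 - p and slips to "left" or "right" with probability p/2
# each; blocked moves leave the agent in place. A rough empirical check,
# assuming a constructed instance `mdp`:
#
#     from collections import Counter
#     s = NavigationWorldState(5, 5)
#     counts = Counter(mdp._transition_func(s, "up") for _ in range(10000))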
def __init__(self,
             width=30,
             height=30,
             nav_cell_types=["white", "yellow", "red", "green", "purple"],
             nav_cell_rewards=[0, 0, -10, -10, -10],
             nav_cell_p_or_locs=[0.68, 0.17, 0.05, 0.05, 0.05],
             wall_cell_types=[],
             wall_cell_rewards=[],
             wall_cell_locs=[],
             goal_cell_types=["blue"],
             goal_cell_rewards=[1.],
             goal_cell_locs=[[(21, 21)]],
             init_loc=(1, 1),
             rand_init=False,
             gamma=0.99,
             slip_prob=0.00,
             step_cost=0.5,
             is_goal_terminal=True,
             name="Navigation MDP"):
    """Navigation World MDP constructor.

    Args:
        height (int): Navigation grid height (in no. of cells).
        width (int): Navigation grid width (in no. of cells).
        nav_cell_types (list of <str / int>): Navigation cell types.
        nav_cell_rewards (list of float): Rewards associated with
            @nav_cell_types.
        nav_cell_p_or_locs (list of <float / list of tuples (int x, int y)>):
            Probabilities defining the distribution over @nav_cell_types, or
            lists of fixed/forced locations [(x, y), ...]. (Default values
            are chosen arbitrarily, slightly above the percolation threshold
            for a square lattice, to approximate the cell distribution in
            the paper.)
        wall_cell_types (list of <str / int>): Wall cell types.
        wall_cell_rewards (list of float): Rewards associated with
            @wall_cell_types.
        wall_cell_locs (list of list of tuples (int, int)): Wall cell
            locations [(x, y), ...] associated with @wall_cell_types.
        goal_cell_types (list of <str / int>): Goal cell types.
        goal_cell_rewards (list of float): Rewards associated with
            @goal_cell_types.
        goal_cell_locs (list of list of tuples (int, int)): Goal cell
            locations [(x, y), ...] associated with @goal_cell_types.
        init_loc (tuple (int x, int y)): Initial cell, used to compute
            state-space reachability and for value iteration.
        rand_init (bool): If True, a random non-wall, non-goal cell is
            chosen as the init location instead of @init_loc.
        gamma (float): MDP discount factor.
        slip_prob (float): Probability of slipping to the left or right of
            the intended cell instead of moving in the intended direction.
        step_cost (float): Living penalty.
        is_goal_terminal (bool): True to make goal states terminal.
        name (str): MDP name.

    Note: Locations are specified in (x, y) format, but the (row, col)
        convention is used internally while storing in memory.
    """
    assert len(nav_cell_types) == len(nav_cell_rewards) \
        == len(nav_cell_p_or_locs)
    assert len(wall_cell_types) == len(wall_cell_rewards) \
        == len(wall_cell_locs)
    assert len(goal_cell_types) == len(goal_cell_rewards) \
        == len(goal_cell_locs)

    self.width = width
    self.height = height
    self.init_loc = init_loc
    self.rand_init = rand_init
    self.gamma = gamma
    self.slip_prob = slip_prob
    self.step_cost = step_cost
    self.is_goal_terminal = is_goal_terminal
    self.name = name
    self.goal_cell_locs = goal_cell_locs

    # Setup init location (re-sample if rand_init is requested).
    if rand_init:
        init_loc = random.randint(1, width), random.randint(1, height)
        while self.is_wall(*init_loc) or self.is_goal(*init_loc):
            init_loc = random.randint(1, width), random.randint(1, height)
    self.init_loc = init_loc

    # Construct base class.
    MDP.__init__(self, NavigationWorldMDP.ACTIONS, self._transition_func,
                 self._reward_func,
                 init_state=NavigationWorldState(*init_loc),
                 gamma=gamma)

    # Navigation MDP setup.
    self.__reset_nav_mdp()
    self.__register_cell_types(nav_cell_types, wall_cell_types,
                               goal_cell_types)
    self.__add_cells_by_locs(self.goal_cell_ids, goal_cell_locs,
                             NavigationWorldMDP.CELL_KIND_GOAL)
    self.__add_cells_by_locs(self.wall_cell_ids, wall_cell_locs,
                             NavigationWorldMDP.CELL_KIND_WALL)
    self.__add_nav_cells(self.nav_cell_ids, nav_cell_p_or_locs,
                         NavigationWorldMDP.CELL_KIND_NAV)
    self.__check_state_map()
    self.__register_cell_rewards(nav_cell_rewards, wall_cell_rewards,
                                 goal_cell_rewards)

    # Initialize value iteration object (computes reachable states).
    self.value_iter = ValueIteration(self, sample_rate=1)
    self._policy_invalidated = True
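# Construction sketch (not part of the original source): a small 10x10 world
# with a single goal in the far corner, using the default navigation cell
# types and rewards. The default goal location (21, 21) lies outside a 10x10
# grid, so goal_cell_locs is overridden here.
#
#     mdp = NavigationWorldMDP(width=10, height=10,
#                              goal_cell_types=["blue"],
#                              goal_cell_rewards=[1.],
#                              goal_cell_locs=[[(10, 10)]],
#                              slip_prob=0.05,
#                              step_cost=0.5)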
def sample_trajectories(self, n_traj, horizon, init_states=None,
                        init_cell_types=None, init_unique=False,
                        policy=None, rand_init_to_match_n_traj=True):
    """Samples trajectories.

    Args:
        n_traj (int): Number of trajectories to sample.
        horizon (int): Planning horizon (max trajectory length).
        init_states: None to sample random init states, or a list of
            specific init states [NavigationWorldState(x, y), ...]
            (tuples/lists of (x, y) are also accepted).
        init_cell_types (list of <str / int>): Cell types to sample init
            states from when @init_states is None.
        init_unique (bool): When True, tries not to repeat init states
            unless @n_traj > @self.num_traj_init_states.
        policy (fn): S -> A. If None, the optimal value-iteration policy
            is used.
        rand_init_to_match_n_traj (bool): If True, always returns @n_traj
            trajectories. If the number of unique init states is less than
            @n_traj, this overrides the effect of @init_unique and samples
            repeated init states.

    Returns:
        (traj_states_list, traj_actions_list) where
            traj_states: [s1, s2, ..., sT]
            traj_actions: [a1, a2, ..., aT]
    """
    assert len(init_cell_types) >= 1

    self.set_traj_init_cell_types(init_cell_types)
    traj_states_list = []
    traj_action_list = []
    traj_init_states = []

    if init_states is None:
        # If no init_states are provided, sample n_traj many init states.
        traj_init_states = self.sample_init_states(n_traj,
                                                   init_unique=init_unique)
    else:
        if type(init_states[0]) == tuple or type(init_states[0]) == list:
            init_states = [NavigationWorldState(*s) for s in init_states]

        traj_init_states = copy.deepcopy(init_states)
        if len(traj_init_states) >= n_traj:
            traj_init_states = traj_init_states[:n_traj]
        elif rand_init_to_match_n_traj:
            # If # init_states < n_traj, sample the remaining ones.
            if len(traj_init_states) < n_traj:
                traj_init_states += self.sample_init_states(
                    n_traj - len(traj_init_states), init_unique=init_unique,
                    skip_states=traj_init_states)

    # If rand_init_to_match_n_traj is set to True, sample more init_states
    # if needed (these may not be unique).
    if rand_init_to_match_n_traj and len(traj_init_states) < n_traj:
        traj_init_states += self.sample_init_states(n_traj,
                                                    init_unique=False)

    if policy is None:
        if len(self.goal_cell_locs) == 0:
            print("Running value iteration with no goals assigned..")
        policy = self.run_value_iteration().policy

    for init_state in traj_init_states:
        action_seq, state_seq = self.policy_propagate(init_state,
                                                      policy=policy,
                                                      horizon=horizon)
        traj_states_list.append(state_seq)
        traj_action_list.append(action_seq)

    return traj_states_list, traj_action_list
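# Usage sketch (not part of the original source): sampling demonstrations
# under the optimal value-iteration policy (used when no policy is passed).
# `mdp` is an assumed constructed instance with goals defined.
#
#     traj_states, traj_actions = mdp.sample_trajectories(
#         n_traj=20, horizon=50,
#         init_cell_types=["white"],
#         init_unique=False)
#     # traj_states[i] is the state sequence of the i-th trajectory and
#     # traj_actions[i] the corresponding action sequence.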