def get_all_possible_states(self):
    """
    Returns a list containing all the possible states in the MDP
    :return: List of GridWorldState
    """
    state_list = []
    walls = self.compute_walls()
    for col_idx, column in enumerate(range(1, self.height + 1, 1)):
        for row_idx, row in enumerate(range(self.width, 0, -1)):
            if (column, row) not in walls:
                state = GridWorldState(column, row)
                if (column, row) in self.goal_location:
                    state._is_terminal = True
                state_list.append(state)
    return state_list
def __init__(self, height=11, width=11, init_state=(1, 1), gamma=0.99, slip_prob=0.0,
             goal_location=None, goal_value=1.0, build_walls=True):
    super().__init__(actions=list(Dir),
                     init_state=GridWorldState(init_state[0], init_state[1]),
                     gamma=gamma)
    self.height = height
    self.width = width
    self.slip_prob = slip_prob
    if goal_location is None:
        self.goal_location = [(width, height)]
    else:
        self.goal_location = goal_location
    self.goal_value = goal_value
    self.walls = []
    if build_walls:
        self.walls = self.compute_walls()
    self.hallway_states = [(3, 6), (6, 3), (6, 8), (8, 5)]
    self.int_rewards_received = []
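# Minimal usage sketch (an assumption for illustration, not part of the source).
# The import path for GridWorldMDP is assumed here; it is not shown in this section.
from GridWorld.GridWorldMDPClass import GridWorldMDP

mdp = GridWorldMDP()                     # 11x11 grid; goal defaults to [(width, height)] = [(11, 11)]
states = mdp.get_all_possible_states()   # every non-wall cell, with goal cells flagged terminal
print(len(states), mdp.goal_location)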
def visualizeLearnedPolicy(self, agent):
    """
    Shows best action learned at each state of the MDP
    :return:
    """
    screen = pygame.display.set_mode([self.screen_width, self.screen_height])
    mdp_env = self.createAbstractGridWorldMDP()
    WIDTH_DIM = self.abstr_mdp.mdp.get_width()
    HEIGHT_DIM = self.abstr_mdp.mdp.get_height()
    walls = self.abstr_mdp.mdp.compute_walls()
    pygame.init()

    complete_viz = False
    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                sys.exit()
        if not complete_viz:
            for col_idx, column in enumerate(range(1, HEIGHT_DIM + 1, 1)):
                for row_idx, row in enumerate(range(WIDTH_DIM, 0, -1)):
                    if (column, row) not in walls:
                        ground_state = GridWorldState(column, row)
                        abs_state = self.abstr_mdp.get_abstr_from_ground(ground_state)
                        print("abs_state", abs_state)
                        best_action = agent.get_best_action(abs_state)
                        print(best_action)
                        action_img = self.createAction(best_action)
                        mdp_and_action = self.placeAction(action_img, ground_state, mdp_env)
                        screen.blit(mdp_and_action, (0, 0))
                        pygame.display.flip()
            complete_viz = True
def test_rollout_adjustment(key):
    """
    Train the agent on a state abstraction with fatal errors. Then generate a roll-out, detach the
    first state that's part of a cycle, and restart learning.
    """
    # Load a poorly-performing abstraction
    names = ['AbstrType', 'AbstrEps', 'CorrType', 'CorrProp', 'Batch', 'Dict']
    df = pd.read_csv('../abstr_exp/corrupted/corrupted_abstractions.csv', names=names)
    abstr_string = df.loc[(df['AbstrType'] == str(key[0]))
                          & (df['AbstrEps'] == key[1])
                          & (df['CorrType'] == str(key[2]))
                          & (df['CorrProp'] == key[3])
                          & (df['Batch'] == key[4])]['Dict'].values[0]
    abstr_list = ast.literal_eval(abstr_string)
    abstr_dict = {}
    for el in abstr_list:
        is_term = el[0][0] == 11 and el[0][1] == 11
        state = GridWorldState(el[0][0], el[0][1], is_terminal=is_term)
        abstr_dict[state] = el[1]

    # Create an agent with this abstraction, plus a copy used for the detachment comparison below
    s_a = StateAbstraction(abstr_dict, abstr_type=Abstr_type.PI_STAR)
    mdp = GridWorldMDP()
    agent = AbstractionAgent(mdp, s_a=s_a)
    agent2 = copy.deepcopy(agent)

    # Generate roll-outs from the unadjusted agent after 5,000 and 10,000 steps
    for i in range(5000):
        agent.explore()
    rollout = agent.generate_rollout()
    print('Roll-out for model with no adjustment, 5,000 steps')
    for state in rollout:
        print(state, end=', ')
    for i in range(5000):
        agent.explore()
    rollout = agent.generate_rollout()
    print('Roll-out for model with no adjustment, 10,000 steps')
    for state in rollout:
        print(state, end=', ')
    print('\n')

    # Train the second agent for 5,000 steps, detach the first state in the cycle, and train for
    # another 5,000 steps. The hope is that this gets further than the unadjusted 10,000-step agent.
    for i in range(5000):
        agent2.explore()
    rollout = agent2.generate_rollout()
    print('Roll-out for model pre-adjustment, 5,000 steps')
    for state in rollout:
        print(state, end=', ')
    print()
    print('Detaching state', rollout[-1])
    agent2.detach_state(rollout[-1])
    for i in range(5000):
        agent2.explore()
    rollout = agent2.generate_rollout()
    print('Roll-out for model post-adjustment, 10,000 steps')
    for state in rollout:
        print(state, end=', ')
def __call__(self, state, action, mdp):
    '''
    This needs access to the MDP parameters
    Parameters:
        state: GridWorldState
        action: Enum
        mdp: GridWorldMDP
    Returns:
        state: GridWorldState
    '''
    next_state = state

    # If terminal, do nothing
    if state.is_terminal():
        return next_state

    # Apply slip probability and change action if applicable
    if random.random() < self.slip_prob:
        if action in [Dir.UP, Dir.DOWN]:
            action = random.choice([Dir.LEFT, Dir.RIGHT])
        elif action in [Dir.LEFT, Dir.RIGHT]:
            action = random.choice([Dir.UP, Dir.DOWN])

    # Calculate next state based on action
    if action == Dir.UP and state.y < mdp.height and (state.x, state.y + 1) not in mdp.walls:
        next_state = GridWorldState(state.x, state.y + 1)
    if action == Dir.DOWN and state.y > 1 and (state.x, state.y - 1) not in mdp.walls:
        next_state = GridWorldState(state.x, state.y - 1)
    if action == Dir.LEFT and state.x > 1 and (state.x - 1, state.y) not in mdp.walls:
        next_state = GridWorldState(state.x - 1, state.y)
    if action == Dir.RIGHT and state.x < mdp.width and (state.x + 1, state.y) not in mdp.walls:
        next_state = GridWorldState(state.x + 1, state.y)

    if (next_state.x, next_state.y) in mdp.goal_location:
        next_state.set_terminal(True)

    return next_state
def get_all_possible_states(self):
    """
    Create a list of all possible states in the MDP
    """
    state_list = []
    for x in range(1, self.total_width + 1):
        for y in range(1, self.total_height + 1):
            #print('Checking if', x, y, 'is a state')
            state = GridWorldState(x, y)
            if self.is_inside_rooms(state):
                state_list.append(state)
        #print()
    return state_list
def transition(self, state, action):
    '''
    Parameters:
        state: GridWorldState
        action: Enum
    Returns:
        state: GridWorldState
    '''
    next_state = state

    # If the MDP is already in the goal state, no actions should be available
    if self.is_goal_state(state):
        return state

    # Apply slip probability and change action if applicable
    if random.random() < self.slip_prob:
        if action in [Dir.UP, Dir.DOWN]:
            action = random.choice([Dir.LEFT, Dir.RIGHT])
        elif action in [Dir.LEFT, Dir.RIGHT]:
            action = random.choice([Dir.UP, Dir.DOWN])

    # Calculate next state based on action
    if action == Dir.UP and state.y < self.height and (state.x, state.y + 1) not in self.walls:
        next_state = GridWorldState(state.x, state.y + 1)
    if action == Dir.DOWN and state.y > 1 and (state.x, state.y - 1) not in self.walls:
        next_state = GridWorldState(state.x, state.y - 1)
    if action == Dir.LEFT and state.x > 1 and (state.x - 1, state.y) not in self.walls:
        next_state = GridWorldState(state.x - 1, state.y)
    if action == Dir.RIGHT and state.x < self.width and (state.x + 1, state.y) not in self.walls:
        next_state = GridWorldState(state.x + 1, state.y)

    # If the next state is a goal location, mark it as terminal
    if (next_state.x, next_state.y) in self.goal_location:
        next_state.set_terminal(True)

    return next_state
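# Minimal sketch (an assumption for illustration, not part of the source): with slip_prob=0 the
# transition above is deterministic, and stepping into a goal cell returns a terminal state.
# The import path for GridWorldMDP is assumed; the RIGHT action is looked up from mdp.actions
# to avoid assuming where the Dir enum lives.
from GridWorld.GridWorldMDPClass import GridWorldMDP
from GridWorld.GridWorldStateClass import GridWorldState

mdp = GridWorldMDP(slip_prob=0.0)
right = next(a for a in mdp.actions if a.name == 'RIGHT')
s = GridWorldState(10, 11)                  # one cell left of the default goal at (11, 11)
s_next = mdp.transition(s, right)
print(s_next, s_next.is_terminal())         # expected: the goal state, flagged terminal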
def createAbstractGridWorldMDP(self):
    """
    Creates and returns a Pygame Surface from the Abstract MDP this class is initialized with.
    All cells that belong to the same abstract class are shown in the same color
    :return:
    """
    WIDTH_DIM = self.abstr_mdp.mdp.get_width()
    HEIGHT_DIM = self.abstr_mdp.mdp.get_height()
    rand_color = randomcolor.RandomColor()
    # dictionary of abstract state to colors
    abs_to_color = {}
    WINDOW_WIDTH = (self.cell_size + self.margin) * WIDTH_DIM + self.margin
    WINDOW_HEIGHT = (self.cell_size + self.margin) * HEIGHT_DIM + self.margin
    screen = pygame.Surface((WINDOW_WIDTH, WINDOW_HEIGHT))
    window = pygame.Rect(0, 0, WINDOW_WIDTH, WINDOW_HEIGHT)
    walls = self.abstr_mdp.mdp.compute_walls()

    # draw background
    pygame.draw.rect(screen, BLACK, window)

    # draw cells
    for col_idx, column in enumerate(range(1, HEIGHT_DIM + 1, 1)):
        for row_idx, row in enumerate(range(WIDTH_DIM, 0, -1)):
            color = WHITE
            if (column, row) in walls:
                color = BLACK
            else:
                ground_state = GridWorldState(column, row)
                abs_state = self.abstr_mdp.get_abstr_from_ground(ground_state)
                print("ground state", ground_state)
                print("abstract state", abs_state)
                if abs_state in abs_to_color:
                    new_color = abs_to_color[abs_state]
                else:
                    new_color = rand_color.generate()
                    while new_color in abs_to_color.values():
                        new_color = rand_color.generate()
                    abs_to_color[abs_state] = new_color
                color = pygame.Color(new_color[0])
            pygame.draw.rect(screen, color,
                             [(self.margin + self.cell_size) * col_idx + self.margin,
                              (self.margin + self.cell_size) * row_idx + self.margin,
                              self.cell_size, self.cell_size])
    return screen
def test_detach_state(agent):
    # Test that detach_state both removes the state from the abstraction dictionary and resets the Q-table to 0
    # We select this state to remove since we are guaranteed to always interact with it
    state_to_remove = GridWorldState(1, 1)
    print('State and abstr state prior to detach:', state_to_remove, agent.s_a.abstr_dict[state_to_remove])
    print('Other states in this abstract state: ', end='')
    for temp_state in agent.mdp.get_all_possible_states():
        if agent.s_a.abstr_dict[temp_state] == agent.s_a.abstr_dict[state_to_remove]:
            print(temp_state, end=' ')
    print()

    for i in range(5000):
        agent.explore()
    print()
    print('Q-value of state after exploring: (should be non-zero)')
    for i in range(len(agent.mdp.actions)):
        print(agent.mdp.actions[i], agent.get_q_value(state_to_remove, agent.mdp.actions[i]))
    print()

    agent.detach_state(state_to_remove, reset_q_value=True)
    print('State and abstr state after detach:', state_to_remove, agent.s_a.abstr_dict[state_to_remove])
    print('Q-value of state after detaching: (should be zero)')
    for i in range(len(agent.mdp.actions)):
        print(agent.mdp.actions[i], agent.get_q_value(state_to_remove, agent.mdp.actions[i]))
    print()

    for i in range(5000):
        agent.explore()
    print('Q-value of state after exploring again: (should be non-zero)')
    for i in range(len(agent.mdp.actions)):
        print(agent.mdp.actions[i], agent.get_q_value(state_to_remove, agent.mdp.actions[i]))
    print('\n' * 3)

    print('Full Q-table:')
    for key, value in agent.get_q_table().items():
        print(key[0], key[1], value)

    # Check that the ground -> abstr and abstr -> ground mappings correspond
    for key in agent.group_dict.keys():
        for state in agent.all_possible_states:
            if agent.s_a.abstr_dict[state] == key and state not in agent.group_dict[key]:
                print('Mismatch between abstr_dict and group_dict:', key, state)
    print('Success!')
def test_check_for_optimal_action_and_value(states, num_steps):
    """
    Create a list of actions generated by following the greedy policy, starting at the given state
    """
    mdp = GridWorldMDP()
    abstr_mdp = mdp.make_abstr_mdp(Abstr_type.Q_STAR)
    agent = AbstractionAgent(mdp, s_a=abstr_mdp.state_abstr)
    for i in range(100000):
        if i % 1000 == 0:
            print('On step', i)
        agent.explore()
    # print(agent.get_learned_policy_as_string())
    policy = agent.get_learned_policy()
    #for key, value in agent.get_learned_policy_as_string().items():
    #    print(key, value, agent.get_q_value(key[0], key[1]))
    for s in agent.mdp.get_all_possible_states():
        #for a in agent.mdp.actions:
        print(s, agent.get_best_action_value(s))
    for state in states:
        mdp_state = GridWorldState(state[0], state[1])
        action, value = agent.check_for_optimal_action_value_next_state(mdp_state, verbose=True)
        print()
def transition(self, state, action):
    # If in goal state, no actions available
    if self.is_goal_state(state):
        return state

    # Apply slip probability
    if random.random() < self.slip_prob:
        if action in [Dir.UP, Dir.DOWN]:
            action = random.choice([Dir.LEFT, Dir.RIGHT])
        else:
            action = random.choice([Dir.UP, Dir.DOWN])

    # Start by assigning next_state to current_state. This way we only have to check for cases where action
    # successfully changes states below
    next_state = state

    # Check if state is outside of the two rooms; if so action should have no effect
    if not self.is_inside_rooms(state):
        return next_state

    # Calculate next state for cases where action changes state; add +1 to upper_height to account for
    # wall
    if action == Dir.UP:
        # If in lower room not against wall, or in lower room under hallway state, or in upper room
        # not against wall, or in hallway
        '''
        if state.y < self.lower_height \
                or (state.y == self.lower_height and state.x in self.hallway_states) \
                or (self.upper_start_height <= state.y < self.total_height) \
                or (self.lower_height < state.y < self.upper_start_height and state.x in self.hallway_states):
            next_state = GridWorldState(state.x, state.y + 1)
        '''
        next_state = GridWorldState(state.x, state.y + 1)
        if not self.is_inside_rooms(next_state):
            next_state = GridWorldState(state.x, state.y)
    elif action == Dir.DOWN:
        # In upper room not against wall, in upper room above hallway, or in lower room not against wall, or in
        # hallway
        '''
        if (state.y > self.upper_start_height) \
                or (state.y == self.upper_start_height and state.x in self.hallway_states) \
                or (1 < state.y <= self.lower_height) \
                or (self.lower_height < state.y < self.upper_start_height and state.x in self.hallway_states):
            next_state = GridWorldState(state.x, state.y - 1)
        '''
        next_state = GridWorldState(state.x, state.y - 1)
        if not self.is_inside_rooms(next_state):
            next_state = GridWorldState(state.x, state.y)
    elif action == Dir.LEFT:
        # In lower room not against wall, or upper room not against wall
        '''
        if (state.y <= self.lower_height and state.x > max(self.lower_offset + 1, 1)) \
                or (state.y >= self.upper_start_height and state.x > max(self.upper_offset + 1, 1)):
            next_state = GridWorldState(state.x - 1, state.y)
        '''
        next_state = GridWorldState(state.x - 1, state.y)
        if not self.is_inside_rooms(next_state):
            next_state = GridWorldState(state.x, state.y)
    elif action == Dir.RIGHT:
        # In lower room not against wall, or upper room not against wall
        '''
        if (state.y <= self.lower_height and state.x < self.lower_width + self.lower_offset) \
                or (state.y >= self.upper_start_height and state.x < self.upper_width + self.upper_offset):
            next_state = GridWorldState(state.x + 1, state.y)
        '''
        next_state = GridWorldState(state.x + 1, state.y)
        if not self.is_inside_rooms(next_state):
            next_state = GridWorldState(state.x, state.y)

    # If agent enters goal state, make next state terminal
    if (next_state.x, next_state.y) in self.goal_location:
        next_state.set_terminal(True)

    return next_state
def __init__(self, upper_width=5, upper_height=5, lower_width=5, lower_height=5,
             upper_offset=0, lower_offset=0, init_state=(1, 1), goal_location=None,
             slip_prob=0.0, goal_value=1.0, hallway_states=[3], hallway_height=1,
             gamma=0.99):
    """
    :param upper_width: width (x-coordinate) of upper room
    :param upper_height: height (y-coordinate) of upper room
    :param lower_width: width of lower room
    :param lower_height: height of lower room
    :param upper_offset: shift upper room to the right by this value
    :param lower_offset: shift lower room to the right by this value
    :param init_state: starting state (x,y)
    :param goal_location: goal state (x,y)
    :param slip_prob: probability of taking a random action instead of the selected action
    :param goal_value: reward on reaching goal
    :param hallway_states: tuple of states through which the agent can move to get from one room to another
    :param hallway_height: length of the hallway states
    :param gamma: discount factor
    """
    super().__init__(actions=list(Dir),
                     init_state=GridWorldState(init_state[0], init_state[1]),
                     gamma=gamma)
    lower_bound = min(upper_offset, lower_offset)
    upper_offset = upper_offset - lower_bound
    lower_offset = lower_offset - lower_bound

    self.upper_width = upper_width
    self.upper_height = upper_height
    self.lower_width = lower_width
    self.lower_height = lower_height
    self.upper_offset = upper_offset
    self.lower_offset = lower_offset
    self.goal_location = goal_location
    self.goal_value = goal_value
    self.slip_prob = slip_prob
    self.hallway_states = hallway_states
    self.hallway_height = hallway_height

    # Hallway states shouldn't be wider than either room
    #if max(self.hallway_states) > min(self.upper_width + self.upper_offset, self.lower_width + self.lower_offset) \
    #        or min(self.hallway_states) < min(self.upper_offset, self.lower_offset):
    #    raise ValueError('Hallway states extend beyond room widths ' + str(self.hallway_states))

    # Some useful values
    self.total_height = self.lower_height + self.upper_height + self.hallway_height
    #print(self.lower_height, self.upper_height, self.hallway_height)
    self.total_width = max(self.lower_offset + self.lower_width, self.upper_offset + self.upper_width)
    self.upper_start_height = self.lower_height + self.hallway_height + 1
    #print('In MDP. total_width, total_height =', self.total_width, self.total_height)

    # If no goal location given, make goal location be the upper right hand corner of the upper room; if there is
    # no upper room, make it the upper-right hand corner of the lower room
    if self.goal_location is None:
        if self.upper_width > 0 and self.upper_height > 0:
            self.goal_location = [(self.upper_width + self.upper_offset, self.total_height)]
        else:
            self.goal_location = [(self.lower_offset + self.lower_width, self.lower_height)]

    # If goal location is outside rooms, raise value error
    for loc in self.goal_location:
        if not self.is_inside_rooms(GridWorldState(loc[0], loc[1])):
            raise ValueError('Goal location is outside rooms ' + str([loc for loc in self.goal_location]))
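# Worked example (an assumption for illustration, not part of the source) of the geometry derived in
# the constructor above. The import path mirrors the test script elsewhere in this repo.
from GridWorld.TwoRoomsMDP import TwoRoomsMDP

mdp = TwoRoomsMDP(lower_width=3, upper_width=3, lower_height=3, upper_height=3,
                  hallway_states=[3], hallway_height=1)
# total_height = lower_height + hallway_height + upper_height = 3 + 1 + 3 = 7
# total_width  = max(lower_offset + lower_width, upper_offset + upper_width) = 3
# upper_start_height = lower_height + hallway_height + 1 = 5
# With no goal_location given, the goal defaults to the upper-right corner of the upper room: (3, 7)
print(mdp.total_height, mdp.total_width, mdp.upper_start_height, mdp.goal_location)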
from GridWorld.TwoRoomsMDP import TwoRoomsMDP
from GridWorld.GridWorldStateClass import GridWorldState
from MDP.ValueIterationClass import ValueIteration

if __name__ == '__main__':
    test_num = 8

    # (1) Check that each state-action combination on the default arguments yields the expected results
    if test_num == 1:
        mdp = TwoRoomsMDP()
        print('Checking all state-action combos')
        # 5 squares wide, 11 squares tall (including hallway)
        for x in range(1, 20):
            for y in range(1, 20):
                if mdp.is_inside_rooms(GridWorldState(x, y)):
                    for action in mdp.actions:
                        state = GridWorldState(x, y)
                        next_state = mdp.transition(state, action)
                        if state != next_state:
                            print(state, action, next_state)
        print()

    # (2) Upper offset
    elif test_num == 2:
        mdp = TwoRoomsMDP(upper_offset=1)
        for x in range(1, 20):
            for y in range(1, 20):
                if mdp.is_inside_rooms(GridWorldState(x, y)):
                    for action in mdp.actions:
                        state = GridWorldState(x, y)
# Add group dict (for detachment)
self.group_dict = self.reverse_abstr_dict(self.s_a.abstr_dict)


# Testing use only
if __name__ == '__main__':
    # Create environment
    mdp = TwoRoomsMDP(lower_width=3, upper_width=3, lower_height=3,
                      upper_height=3, hallway_states=[3], goal_location=[(1, 5)])
    error_dict = {
        GridWorldState(1, 2): GridWorldState(2, 5),
        GridWorldState(3, 3): GridWorldState(1, 6)
    }

    ABSTR_TYPE = Abstr_type.Q_STAR
    ERROR_NUM = 6

    mdp = GridWorldMDP()
    if ABSTR_TYPE == Abstr_type.Q_STAR:
        abstr_mdp = mdp.make_abstr_mdp(Abstr_type.Q_STAR)
        if ERROR_NUM == 1:
            error_dict = {
                GridWorldState(6, 3): GridWorldState(10, 9),
                GridWorldState(9, 10): GridWorldState(9, 3)
            }
        elif ERROR_NUM == 2:
def iterate_detachment(mdp_key, batch_size=5000):
    """
    Load an incorrect abstraction. Train the model, generate a roll-out, detach the first cycle state.
    Repeat until the roll-out reaches a terminal state. Save the adjusted abstraction and learned policy.
    Visualize the original incorrect abstraction with roll-outs from the original agents, and the adjusted
    abstraction with a roll-out from the new agent.
    :param mdp_key: key for the incorrect (poorly performing) abstraction
    :param batch_size: number of steps to train between state detachments
    """
    # Load a poorly-performing abstraction
    names = ['AbstrType', 'AbstrEps', 'CorrType', 'CorrProp', 'Batch', 'Dict']
    df = pd.read_csv('../abstr_exp/corrupted/corrupted_abstractions.csv', names=names)
    abstr_string = df.loc[(df['AbstrType'] == str(mdp_key[0]))
                          & (df['AbstrEps'] == mdp_key[1])
                          & (df['CorrType'] == str(mdp_key[2]))
                          & (df['CorrProp'] == mdp_key[3])
                          & (df['Batch'] == mdp_key[4])]['Dict'].values[0]
    abstr_list = ast.literal_eval(abstr_string)
    abstr_dict = {}
    for el in abstr_list:
        is_term = el[0][0] == 11 and el[0][1] == 11
        state = GridWorldState(el[0][0], el[0][1], is_terminal=is_term)
        abstr_dict[state] = el[1]

    # Create an agent with this abstraction
    s_a = StateAbstraction(abstr_dict, abstr_type=Abstr_type.PI_STAR)
    mdp = GridWorldMDP()
    agent = AbstractionAgent(mdp, s_a=s_a)

    # Generate a roll-out from the untrained model (should be random and short)
    rollout = agent.generate_rollout()
    print('Roll-out from untrained model')
    for state in rollout:
        print(state, end=', ')
    print()

    # Until the roll-out reaches a terminal state, explore and detach the last state of the roll-out.
    # Record each of the detached states so they can be visualized later
    detached_states = []
    step_counter = 0
    while not rollout[-1].is_terminal():
        for i in range(batch_size):
            agent.explore()
        step_counter += batch_size
        rollout = agent.generate_rollout()
        print('Roll-out after', step_counter, 'steps')
        for state in rollout:
            print(state, end=', ')
        print()
        print('State Q-value pre-detach:')
        for action in agent.mdp.actions:
            print(rollout[-1], action, agent.get_q_value(rollout[-1], action))
        detach_flag = agent.detach_state(rollout[-1])
        if detach_flag == 0:
            print('Detaching state', rollout[-1])
            detached_states.append(rollout[-1])
        elif detach_flag == 1:
            print(rollout[-1], 'already a singleton state. No change.')
        print('State Q-value post-detach:')
        for action in agent.mdp.actions:
            print(rollout[-1], action, agent.get_q_value(rollout[-1], action))
        print()
        for key, value in agent.get_q_table().items():
            print(key, value)

    # Save the resulting adapted state abstraction and learned policy
    s_a_file = open('../abstr_exp/adapted/adapted_abstraction.csv', 'w', newline='')
    s_a_writer = csv.writer(s_a_file)
    print(mdp_key)
    s_a_writer.writerow((mdp_key[0], mdp_key[1], mdp_key[2], mdp_key[3], mdp_key[4],
                         agent.get_abstraction_as_string()))
    s_a_file.close()

    policy_file = open('../abstr_exp/adapted/learned_policy.csv', 'w', newline='')
    policy_writer = csv.writer(policy_file)
    policy_writer.writerow((mdp_key[0], mdp_key[1], mdp_key[2], mdp_key[3], mdp_key[4],
                            agent.get_learned_policy_as_string()))
    policy_file.close()

    # Visualize the adapted state abstraction and learned policy, along with the original for comparison
    viz = GridWorldVisualizer()
    surface = viz.create_corruption_visualization(mdp_key,
                                                  '../abstr_exp/adapted/adapted_abstraction.csv',
                                                  error_file='../abstr_exp/corrupted/error_states.csv')
    # Draw small white circles over the states that were detached
    for state in detached_states:
        print(state, end=', ')
    #for d_state in viz.display_surface(surface)
'''
# True A-star with episode buffer
'''
test_udm(mdp, Abstr_type.A_STAR, EPISODE_COUNT, episode_buffer=10)
quit()

'''
# True Pi-Star with episode buffer
'''
test_udm(mdp, Abstr_type.PI_STAR, EPISODE_COUNT, episode_buffer=20)
quit()

'''
# Bad error 1
error_dict = {GridWorldState(1, 2): GridWorldState(2, 5)}

# Q-Star with bad error 1
'''
test_udm(mdp, Abstr_type.Q_STAR, EPISODE_COUNT, error_dict=error_dict, episode_buffer=10)
quit()

'''
# A-star with bad error 1
'''
test_udm(mdp, Abstr_type.A_STAR, EPISODE_COUNT, error_dict=error_dict)
quit()
def get_next_possible_states(self, state, action):
    """
    Get a dictionary (States -> floats), mapping states to the probability that that state is reached
    by the given (state, action) pair
    """
    next_state_probs = {}
    if self.is_goal_state(state):
        next_state_probs[state] = 1
        return next_state_probs

    up_state = GridWorldState(state.x, state.y + 1)
    down_state = GridWorldState(state.x, state.y - 1)
    left_state = GridWorldState(state.x - 1, state.y)
    right_state = GridWorldState(state.x + 1, state.y)

    # can the agent move left?
    left_cond = self.is_inside_rooms(GridWorldState(state.x - 1, state.y))
    # can the agent move right?
    right_cond = self.is_inside_rooms(GridWorldState(state.x + 1, state.y))
    # can the agent move down?
    down_cond = self.is_inside_rooms(GridWorldState(state.x, state.y - 1))
    # can the agent move up?
    up_cond = self.is_inside_rooms(GridWorldState(state.x, state.y + 1))

    # Set next_state_probs for current state so it can be incremented later
    next_state_probs[state] = 0

    # I'm sure there's a cleaner way to do this but what the hell
    if action == Dir.UP:
        if up_cond:
            next_state_probs[up_state] = 1 - self.slip_prob
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip right or left
        if left_cond:
            next_state_probs[left_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if right_cond:
            next_state_probs[right_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.DOWN:
        if down_cond:
            next_state_probs[down_state] = (1 - self.slip_prob)
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip right or left
        if left_cond:
            next_state_probs[left_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if right_cond:
            next_state_probs[right_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.LEFT:
        if left_cond:
            next_state_probs[left_state] = (1 - self.slip_prob)
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip up or down
        if up_cond:
            next_state_probs[up_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if down_cond:
            next_state_probs[down_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.RIGHT:
        if right_cond:
            next_state_probs[right_state] = 1 - self.slip_prob
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip up or down
        if up_cond:
            next_state_probs[up_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if down_cond:
            next_state_probs[down_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2

    # In the end remove keys whose value is 0
    next_state_probs = {k: v for k, v in next_state_probs.items() if v}
    return next_state_probs
def get_next_possible_states(self, state, action):
    """
    For value iteration, part of the model: given a state and an action, outputs a dictionary of
    State -> probability giving each state the agent can end up in from the given state if it takes
    the given action, and with what probability
    :param state: State
    :param action: ActionEnum
    :return: dictionary of State -> Float (the probabilities sum to one)
    """
    next_state_probs = {}

    # if we are in the goal state, every action will take us back to the goal state
    if self.is_goal_state(state):
        next_state_probs[state] = 1
        return next_state_probs

    # set the probability of ending back at the current state to 0, so it can be incremented later
    next_state_probs[state] = 0

    up_state = GridWorldState(state.x, state.y + 1)
    down_state = GridWorldState(state.x, state.y - 1)
    left_state = GridWorldState(state.x - 1, state.y)
    right_state = GridWorldState(state.x + 1, state.y)

    # can the agent move left?
    left_cond = (state.x > 1 and (state.x - 1, state.y) not in self.walls)
    # can the agent move right?
    right_cond = (state.x < self.width and (state.x + 1, state.y) not in self.walls)
    # can the agent move down?
    down_cond = (state.y > 1 and (state.x, state.y - 1) not in self.walls)
    # can the agent move up?
    up_cond = (state.y < self.height and (state.x, state.y + 1) not in self.walls)

    if action == Dir.UP:
        if up_cond:
            next_state_probs[up_state] = 1 - self.slip_prob
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip right or left
        if left_cond:
            next_state_probs[left_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if right_cond:
            next_state_probs[right_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.DOWN:
        if down_cond:
            next_state_probs[down_state] = (1 - self.slip_prob)
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip right or left
        if left_cond:
            next_state_probs[left_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if right_cond:
            next_state_probs[right_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.LEFT:
        if left_cond:
            next_state_probs[left_state] = (1 - self.slip_prob)
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip up or down
        if up_cond:
            next_state_probs[up_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if down_cond:
            next_state_probs[down_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
    elif action == Dir.RIGHT:
        if right_cond:
            next_state_probs[right_state] = 1 - self.slip_prob
        else:
            next_state_probs[state] += (1 - self.slip_prob)
        # what if it slips?: it would either slip up or down
        if up_cond:
            next_state_probs[up_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2
        if down_cond:
            next_state_probs[down_state] = self.slip_prob / 2
        else:
            next_state_probs[state] += self.slip_prob / 2

    # In the end remove keys whose value is 0
    next_state_probs = {k: v for k, v in next_state_probs.items() if v}
    return next_state_probs
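# Minimal sanity-check sketch (an assumption for illustration, not part of the source): since every
# branch above either assigns mass to a reachable neighbor or adds it back to the current state, the
# returned probabilities should sum to 1 for every state-action pair. The import path for
# GridWorldMDP is assumed; it is not shown in this section.
from GridWorld.GridWorldMDPClass import GridWorldMDP

mdp = GridWorldMDP(slip_prob=0.1)
for state in mdp.get_all_possible_states():
    for action in mdp.actions:
        probs = mdp.get_next_possible_states(state, action)
        assert abs(sum(probs.values()) - 1.0) < 1e-9, (state, action, probs)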