def update(self, action, measurements, reward):
    """Update the model after taking an action and observing new measurements.

    Args:
        action (tuple): The action that was taken
        measurements (dict): The measurements observed after the action
        reward (double): The reward received for the transition

    Raises:
        StateNotSetError: If no current state has been set yet.
    """
    if self._current_state is None:
        raise StateNotSetError()
    self._current_state.visit()
    taken_q_state = self._current_state.get_q_state(action)
    # Nothing to learn if the action has no q-state entry.
    if taken_q_state is None:
        return
    successor = self._get_state(measurements)
    taken_q_state.update(successor, reward)
    # Propagate the new information with the configured algorithm.
    if self._update_algorithm == SINGLE_UPDATE:
        self._q_update(taken_q_state)
        self._current_state.update_value()
    elif self._update_algorithm == VALUE_ITERATION:
        self.value_iteration()
    elif self._update_algorithm == PRIORITIZED_SWEEPING:
        self.prioritized_sweeping()
    self._current_state = successor
def suggest_action(self):
    """Suggest the optimal action to take from the current state.

    Returns:
        action (tuple): The optimal action from the current state

    Raises:
        StateNotSetError: If no current state has been set yet.
    """
    current = self._current_state
    if current is None:
        raise StateNotSetError(logger)
    return current.get_optimal_action()
def get_legal_actions(self):
    """Return all the legal actions available from the current state.

    Returns:
        legal_actions (list(tuple)): Every legal action from the current state

    Raises:
        StateNotSetError: If no current state has been set yet.
    """
    state = self._current_state
    if state is None:
        raise StateNotSetError()
    return state.get_legal_actions()
def suggest_action(self):
    """Suggest the next action based on the greedy criterion.

    Returns:
        optimal_action (tuple(str, int)): The suggested optimal action

    Raises:
        StateNotSetError: If no current state has been set yet.
    """
    if self._current_state is not None:
        return self._current_state.get_optimal_action()
    raise StateNotSetError()
def prioritized_sweeping(self, initial_state=None, error=None, max_updates=None, debug=False): """Runs prioritized sweeping starting from the given state.""" if self._current_state is None and initial_state is None: raise StateNotSetError(logger) if initial_state is None: initial_state = self._current_state if error is None: error = self._update_error if max_updates is None: max_updates = self._max_updates # transition probabilities have changed for the initial state reverse_transitions = [{} for _ in self._states] for state in self._states: for state_num, t in state.get_max_transitions().items(): reverse_transitions[state_num][state.state_num] = t state = initial_state for i in range(max_updates): # update the state value old_value = state.get_value() self._v_update(state) new_value = state.get_value() delta = abs(new_value - old_value) # update the priorities of the predecessors rev_transitions = reverse_transitions[state.state_num] for state_num, t in rev_transitions.items(): self._priorities[state_num] = max(t * delta, self._priorities[state_num]) # zero the updated state's priority self._priorities[state.state_num] = 0 # Choose the next max priority state # TODO with Priority Queue - but needs to support item removal max_index, max_priority = 0, 0 for j in range(len(self._priorities)): if self._priorities[j] > max_priority: max_priority = self._priorities[j] max_index = j # stop if the priority gets below the supplied limit if max_priority <= error: break state = self._states[max_index]
def update(self, action, measurements, reward):
    """Updates model after taking given action and ending up in the state
    corresponding to the measurements.

    Args:
        action (tuple): The recent taken action
        measurements (dict): The measurements collected after the action
        reward (double): The reward acquired through the specific action

    Raises:
        StateNotSetError: If no measurements have been recorded yet.
    """
    if self._current_measurements is None:
        raise StateNotSetError(logger)
    # Re-resolve the current state from the stored measurements: a split may
    # have changed which leaf state those measurements map to.
    # TODO move this where the splitting is decided
    self._current_state = self._root.get_state(self._current_measurements)
    # determine the new state
    new_state = self._root.get_state(measurements)
    new_num = new_state.state_num
    # store the transition information — presumably consumed later by
    # split() when deciding whether to split this state; verify.
    trans_data = (self._current_measurements, measurements, action, reward)
    self._current_state.store_transition(trans_data, new_num)
    # update the qstate
    q_state = self._current_state.get_q_state(action)
    q_state.update(new_state, reward)
    # update the model values according to the chosen algorithm
    if self._update_algorithm == SINGLE_UPDATE:
        self._q_update(q_state)
        self._current_state.update_value()
    elif self._update_algorithm == VALUE_ITERATION:
        self.value_iteration()
    elif self._update_algorithm == PRIORITIZED_SWEEPING:
        self.prioritized_sweeping()
    # consider splitting the initial_state
    if self._allow_splitting:
        self.split()
    # update the current state and store the last measurements
    self._current_state = new_state
    self._current_measurements = measurements
def get_legal_actions(self):
    """Return all the legal actions from the current state.

    Raises:
        StateNotSetError: If no current state has been set yet.
    """
    if self._current_state is not None:
        return self._current_state.get_legal_actions()
    raise StateNotSetError(logger)
def prioritized_sweeping(self, initial_state=None, error=None, max_updates=None): """Runs prioritized sweeping starting from the given state. Args: initial_state (State): The initial state in the prioritized sweeping process error (double): The updating error max_updates (int): The max number of updates """ if self._current_state is None and initial_state is None: raise StateNotSetError() if initial_state is None: initial_state = self._current_state if error is None: error = self._update_error if max_updates is None: max_updates = self._max_updates # transition probabilities have changed for the initial state max_transitions = initial_state.get_max_transitions() initial_s_num = initial_state.state_num for state_num, t in max_transitions.items(): self._reverse_transitions[state_num][initial_s_num] = t state, num_updates = initial_state, 0 for i in range(max_updates): num_updates += 1 # Update the state value old_value = state.value self._v_update(state) new_value = state.value delta = abs(new_value - old_value) # Update the priorities of the predecessors rev_transitions = self._reverse_transitions[state.state_num] for state_num, t in rev_transitions.items(): self._priorities[state_num] = max(t * delta, self._priorities[state_num]) # zero the updated state's priority self._priorities[state.state_num] = 0 # choose the next max priority state # TODO with Priority Queue - but needs to support item removal max_index, max_priority = 0, 0 for j in range(len(self._priorities)): if self._priorities[j] > max_priority: max_priority = self._priorities[j] max_index = j # stop if the priority gets below the supplied limit if max_priority <= error: break state = self._states[max_index]