def compute_single_policy_backup(self, policy: Policy, gamma: float) -> Tuple[ValueFunction, float]:
    '''
    Perform a single policy backup of this value function under ``policy``.

    Every element of the domain's observation space is visited. Terminal
    states are assigned the value 0; every other state is assigned
    ``self.q_value(state, policy[state], gamma)``. This method does not
    modify the current value function; the results are written into a
    newly created ``ValueFunction`` instead.

    Args:
        policy: Indexed by state to obtain the action that is backed up.
        gamma: Forwarded unchanged to ``q_value`` (presumably the discount
            factor -- confirm against ``q_value``).

    Returns:
        A ``(new_value_function, error)`` tuple, where ``error`` is the
        largest absolute difference between the backed-up value and the
        current value over all non-terminal states (``0.0`` if there are
        none).
    '''
    new_value_function = ValueFunction(self._domain)
    error = 0.0

    for state in self._domain.get_observation_space().get_elements():
        if self._domain.is_terminal(state):
            # Terminal states are pinned to 0 and, as in the original
            # implementation, do not contribute to the reported error.
            new_value_function._values[state] = 0
            continue

        # Evaluate the Q-value once and reuse it for both the new value and
        # the error term (assumes q_value is deterministic).
        backed_up_value = self.q_value(state, policy[state], gamma)
        new_value_function._values[state] = backed_up_value
        error = max(error, abs(backed_up_value - self[state]))

    return new_value_function, error