def look_one_step_ahead(self, source, key_scenario, action):
    planner = self.dict_planner[key_scenario]
    # Simulate one transition of the scenario's domain from `source`.
    next_state = planner._domain.get_next_state(Memory([source]), action)
    cost = planner._domain.get_transition_value(Memory([source]), action,
                                                next_state).cost
    # Plan from the resulting state to get the cost-to-go and the plan.
    cost_f, path_f = planner.solve(
        from_observation=Memory([next_state], maxlen=1),
        verbose=self.verbose,
        render=False)
    self.q_values_scenar[source][action][key_scenario] = cost + cost_f
    action_next_state = planner.get_next_action(Memory([next_state]))
    self.q_values_scenar[next_state][action_next_state][key_scenario] = cost_f
    if next_state not in self.q_values:
        self.q_values[next_state] = {}
    if self.reuse_plans:
        # Cache the plan, the first action and the Q-value contribution so
        # they can be reused if next_state is visited later on.
        self.plan_by_scenario[next_state][key_scenario] = path_f
        self.action_by_scenario[next_state][key_scenario] = action_next_state
        if action_next_state not in self.q_values[next_state]:
            self.q_values[next_state][action_next_state] = 0.
        self.q_values[next_state][action_next_state] += (
            cost_f * self.weight_scenario[key_scenario])
        self.planned[next_state][key_scenario] = True
    # Weighted contribution of this scenario to the Q-value of (source, action).
    self.q_values[source][action] += (
        cost + cost_f) * self.weight_scenario[key_scenario]

def first_pass(self, source):
    missing = list(self.dict_planner.keys())
    if self.reuse_plans:
        if self.verbose:
            print("reuse plans")
            print(self.planned)
            print(source in self.planned)
        if source in self.planned:
            missing = [
                k for k in self.dict_planner.keys()
                if k not in self.planned[source]
            ]
    if self.verbose:
        print("Missing, first pass", missing)
    # Solve every scenario that has not been planned from `source` yet.
    list_results = self.launch_things(
        lambda x: (x,
                   self.dict_planner[x].solve(
                       from_observation=Memory([source], maxlen=1),
                       verbose=self.verbose,
                       render=False)), missing)
    for key_scenario, (cost, plan) in list_results:
        action = self.dict_planner[key_scenario].get_next_action(
            Memory([source]))
        self.action_by_scenario[source][key_scenario] = action
        self.q_values_scenar[source][action][key_scenario] = cost
        self.planned[source][key_scenario] = True
        if source not in self.q_values:
            self.q_values[source] = {}
        if action not in self.q_values[source]:
            self.q_values[source][action] = 0.
        self.q_values[source][action] += (
            cost * self.weight_scenario[key_scenario])
        self.plan_by_scenario[source][key_scenario] = plan

def _get_next_action(
    self, observation: D.T_agent[D.T_observation]
) -> D.T_agent[D.T_concurrency[D.T_event]]:
    # This solver greedily selects the action with the highest expected
    # immediate reward.
    domain = self._domain
    # Note: observation == state because the domain is FullyObservable.
    memory = Memory([observation])
    applicable_actions = domain.get_applicable_actions(memory)
    if domain.is_transition_value_dependent_on_next_state():
        values = []
        for a in applicable_actions.get_elements():
            next_state_prob = domain.get_next_state_distribution(
                memory, [a]).get_values()
            expected_value = sum(
                p * domain.get_transition_value(memory, [a], s).reward
                for s, p in next_state_prob)
            values.append(expected_value)
    else:
        values = [
            domain.get_transition_value(memory, [a]).reward
            for a in applicable_actions.get_elements()
        ]
    argmax = max(range(len(values)), key=lambda i: values[i])
    # Return a list of actions because parallel domains are handled.
    return [applicable_actions.get_elements()[argmax]]

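# A standalone sanity check of the greedy rule above, with made-up
# probabilities and rewards (no domain involved, names are purely
# illustrative): the expected immediate reward of each action is the
# probability-weighted sum of rewards over its successors.
toy_next_state_prob = {
    "left": [("s1", 0.8), ("s2", 0.2)],   # action -> [(next state, probability)]
    "right": [("s3", 1.0)],
}
toy_reward = {("left", "s1"): 1.0, ("left", "s2"): -5.0, ("right", "s3"): 0.5}
toy_values = {
    a: sum(p * toy_reward[(a, s)] for s, p in outcomes)
    for a, outcomes in toy_next_state_prob.items()
}
# 'right' wins: 0.5 > 0.8 * 1.0 + 0.2 * (-5.0) = -0.2
toy_best = max(toy_values, key=toy_values.get)
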
def _tree_search(self, state, h_act, h_obs, depth):
    """UCT search from a given state with act/obs history.

    This corresponds to the Simulate function in the POMCP paper.
    """
    # This must be a history that ends on an observation.
    assert len(h_act) == len(h_obs)
    if depth > self._max_depth:
        return self._VLV
    if (h_act, h_obs) not in self._tree:
        # Generate new child nodes.
        for action in self._domain.get_applicable_actions(
                Memory([state])).get_elements():
            assert action is not None
            self._tree[(h_act + (action, ), h_obs)] = [0, 0, []]
        cost = self._rollout(state, h_act, h_obs, depth)
        # We must also store this node, or we'll never get out of this case!
        self._tree[(h_act, h_obs)] = [1, cost, [state]]
        return cost
    else:
        # Pick a successor node according to the UCT formula.
        action = self._get_best_action(h_act, h_obs, w=self._max_depth)
        assert action is not None
        # Simulate the outcome of this action.
        new_state = self._domain.get_next_state_distribution(
            Memory([state]), action).sample()
        TV = self._domain.get_transition_value(Memory([state]), action,
                                               new_state)
        new_obs = self._domain.get_observation_distribution(state,
                                                            action).sample()
        if self._domain.is_goal(new_obs):
            s_cost = TV.cost
        else:
            s_cost = TV.cost + self._tree_search(
                new_state, h_act + (action, ), h_obs + (new_obs, ), depth + 1)
        s_cost = min(s_cost, self._VLV)
        this_node = self._tree[(h_act, h_obs)]
        succ_node = self._tree[(h_act + (action, ), h_obs)]
        # Update the average cost of the successor node.
        succ_node[1] = ((succ_node[1] * succ_node[0]) + s_cost) / (
            succ_node[0] + 1)
        # Increment visit counters for both this node and the successor node.
        this_node[0] = this_node[0] + 1
        succ_node[0] = succ_node[0] + 1
        return s_cost

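# NOTE: `_get_best_action` is referenced above but not shown in this listing.
# The sketch below is a hypothetical illustration of a UCT-style selection
# that is compatible with the [visit_count, average_cost, particle_states]
# node layout stored in self._tree; the exploration weight `w`, the handling
# of unvisited children and the tie-breaking are assumptions, not the
# solver's actual implementation.
import math

def _uct_best_action_sketch(tree, h_act, h_obs, w):
    parent_visits = tree[(h_act, h_obs)][0]
    best_action, best_score = None, float("inf")
    for (acts, obs), (n, avg_cost, _) in tree.items():
        # Child action nodes extend h_act by exactly one action and share h_obs.
        if (obs != h_obs or len(acts) != len(h_act) + 1
                or acts[:len(h_act)] != h_act):
            continue
        if n == 0:
            return acts[-1]  # expand unvisited children first
        # Cost-minimizing UCT: low average cost minus an exploration bonus.
        score = avg_cost - w * math.sqrt(math.log(parent_visits) / n)
        if score < best_score:
            best_action, best_score = acts[-1], score
    return best_action
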
def _update_belief_state(self, belief, action):
    # Propagate each particle of the belief through the transition model.
    new_belief = []
    for state in belief:
        d = self._domain.get_next_state_distribution(Memory([state]), action)
        new_belief.append(d.sample())
    return new_belief

def _update_belief_state(self, belief, action):
    # Propagate each particle of the belief through the transition model;
    # when no action has been taken yet, resample from the initial state
    # distribution instead.
    new_belief = []
    for state in belief:
        d = (self._domain.get_next_state_distribution(Memory([state]), action)
             if action is not None
             else self._domain.get_initial_state_distribution())
        new_belief.append(d.sample())
    return new_belief

def _rollout(self, state, h_act, h_obs, depth):
    if depth > self._max_depth:
        return self._VLV
    action = self._get_random_action(state, h_act, h_obs, depth)
    assert action is not None
    new_state = self._domain.get_next_state_distribution(Memory([state]),
                                                         action).sample()
    TV = self._domain.get_transition_value(Memory([state]), action, new_state)
    new_obs = self._domain.get_observation_distribution(state,
                                                        action).sample()
    if self._domain.is_goal(new_obs):
        s_cost = TV.cost
    else:
        s_cost = TV.cost + self._rollout(new_state, h_act + (action, ),
                                         h_obs + (new_obs, ), depth + 1)
    s_cost = min(s_cost, self._VLV)
    return s_cost

def build_graph_domain(self,
                       init_state: Any = None,
                       transition_extractor=None,
                       verbose=True) -> GraphDomain:
    if transition_extractor is None:
        transition_extractor = lambda s, a, s_prime: {
            "cost": self.domain.get_transition_value(s, a, s_prime).cost
        }
    next_state_map = {}
    next_state_attributes = {}
    if init_state is None:
        init_state = self.domain.get_initial_state()
    stack = [(init_state, [init_state])]
    nb_nodes = 1
    nb_edges = 0
    nb_path = 0
    next_state_map[init_state] = {}
    next_state_attributes[init_state] = {}
    paths_dict = {}
    while stack:
        (vertex, path) = stack.pop()
        actions = self.domain.get_applicable_actions(vertex).get_elements()
        for action in actions:
            next_state = self.domain.get_next_state(Memory([vertex]), action)
            if action in next_state_map[vertex]:
                # Edge already explored.
                continue
            nb_edges += 1
            next_state_map[vertex][action] = next_state
            next_state_attributes[vertex][action] = transition_extractor(
                vertex, action, next_state)
            if self.domain.is_goal(next_state):
                nb_path += 1
                if verbose:
                    print(nb_path, " / ", self.max_path)
                    print("nodes ", nb_nodes, " / ", self.max_nodes)
                    print("edges ", nb_edges, " / ", self.max_edges)
            else:
                if next_state not in next_state_map:
                    stack.append((next_state, path + [next_state]))
                    paths_dict[next_state] = {tuple(path + [next_state])}
                # else:
                #     # Alternative: revisit a known state through a new path.
                #     if tuple(path + [next_state]) not in paths_dict[next_state]:
                #         stack.append((next_state, path + [next_state]))
                #         paths_dict[next_state].add(tuple(path + [next_state]))
            if next_state not in next_state_map:
                next_state_map[next_state] = {}
                next_state_attributes[next_state] = {}
                nb_nodes += 1
        # Stop once enough goal paths have been found or the graph budget
        # (nodes/edges) is exceeded with at least one goal path.
        if (nb_path > self.max_path
                or (nb_nodes > self.max_nodes and nb_path >= 1)
                or (nb_edges > self.max_edges and nb_path >= 1)):
            break
    return GraphDomain(next_state_map, next_state_attributes, None, None)

def build_graph_domain(self, init_state: Any = None) -> GraphDomain:
    next_state_map = {}
    next_state_attributes = {}
    if init_state is None:
        init_state = self.domain.get_initial_state()
    stack = [(init_state, [init_state])]
    nb_nodes = 1
    nb_edges = 0
    nb_path = 0
    next_state_map[init_state] = {}
    next_state_attributes[init_state] = {}
    while stack:
        (vertex, path) = stack.pop()
        actions = self.domain.get_applicable_actions(vertex).get_elements()
        for action in actions:
            next_state = self.domain.get_next_state(vertex, action)
            # Remember whether this state is seen for the first time before
            # registering it, so that it can still be pushed for exploration.
            unexplored = next_state not in next_state_map
            if unexplored:
                next_state_map[next_state] = {}
                next_state_attributes[next_state] = {}
                nb_nodes += 1
            if action not in next_state_map[vertex]:
                nb_edges += 1
            next_state_map[vertex][action] = next_state
            transition_value = self.domain.get_transition_value(
                Memory([vertex]), action, next_state)
            next_state_attributes[vertex][action] = {
                "cost": transition_value.cost,
                "reward": transition_value.reward,
            }
            if self.domain.is_goal(next_state):
                nb_path += 1
            elif unexplored:
                stack.append((next_state, path + [next_state]))
        # Stop once enough goal paths have been found or the graph budget
        # (nodes/edges) is exceeded with at least one goal path.
        if (nb_path > self.max_path
                or (nb_nodes > self.max_nodes and nb_path >= 1)
                or (nb_edges > self.max_edges and nb_path >= 1)):
            break
    return GraphDomain(next_state_map, next_state_attributes, None, None)

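# For orientation, a toy example of the structures both explorers above hand
# to GraphDomain. The state/action names and numbers are made up for
# illustration; only the shape (state -> action -> successor / attributes)
# mirrors the code above. Assumes GraphDomain is imported as in the
# surrounding module.
toy_next_state_map = {
    "s0": {"go": "s1"},                               # state -> {action: successor}
    "s1": {},                                         # explored state, no recorded edges
}
toy_next_state_attributes = {
    "s0": {"go": {"cost": 1.0, "reward": -1.0}},      # per-edge transition attributes
    "s1": {},
}
toy_graph = GraphDomain(toy_next_state_map, toy_next_state_attributes, None, None)
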
def _get_random_action(self, state, h_act, h_obs, depth):
    return self._domain.get_applicable_actions(Memory([state])).sample()