Example #1
 def look_one_step_ahead(self, source, key_scenario, action):
     """Expand (source, action) in one scenario and update the Q-value tables.

     Simulates the action in the scenario's domain, solves from the resulting
     state, and adds the scenario-weighted cost to the aggregated q_values.
     """
     planner = self.dict_planner[key_scenario]
     next_state = planner._domain.get_next_state(Memory([source]), action)
     cost = planner._domain.get_transition_value(
         Memory([source]), action, next_state).cost
     # Solve the scenario from the successor state to get its cost-to-go.
     cost_f, path_f = planner.solve(
         from_observation=Memory([next_state], maxlen=1),
         verbose=self.verbose,
         render=False)
     self.q_values_scenar[source][action][key_scenario] = cost + cost_f
     action_next_state = planner.get_next_action(Memory([next_state]))
     self.q_values_scenar[next_state][action_next_state][
         key_scenario] = cost_f
     if next_state not in self.q_values:
         self.q_values[next_state] = {}
     if self.reuse_plans:
         # Cache the plan, first action and Q-value found for next_state so
         # later passes can skip re-solving this scenario from that state.
         self.plan_by_scenario[next_state][key_scenario] = path_f
         self.action_by_scenario[next_state][key_scenario] = action_next_state
         if action_next_state not in self.q_values[next_state]:
             self.q_values[next_state][action_next_state] = 0.
         self.q_values[next_state][
             action_next_state] += cost_f * self.weight_scenario[key_scenario]
         self.planned[next_state][key_scenario] = True
     self.q_values[source][action] += (
         cost + cost_f) * self.weight_scenario[key_scenario]
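The method above accumulates, for every scenario, weight_scenario[key_scenario] times the scenario's cost into a shared q_values table. As a standalone illustration of that aggregation (toy dictionaries and a hypothetical greedy read-out, not taken from the original class):

# Hypothetical, self-contained sketch of the scenario-weighted aggregation.
q_values = {"s0": {"a0": 0.0, "a1": 0.0}}
weight_scenario = {"scn_a": 0.7, "scn_b": 0.3}
cost_per_scenario = {"scn_a": {"a0": 5.0, "a1": 3.0},
                     "scn_b": {"a0": 2.0, "a1": 9.0}}
for key, weight in weight_scenario.items():
    for action, cost in cost_per_scenario[key].items():
        q_values["s0"][action] += weight * cost
best_action = min(q_values["s0"], key=q_values["s0"].get)  # lowest expected cost
assert best_action == "a0"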
Example #2
 def first_pass(self, source):
     """Solve each scenario from `source` (skipping scenarios already planned
     when reuse_plans is set) and initialise the Q-value tables."""
     # Scenarios that still need to be solved from this state.
     missing = list(self.dict_planner.keys())
     if self.reuse_plans:
         if self.verbose:
             print("reuse plans")
             print(self.planned)
             print(source in self.planned)
         if source in self.planned:
             missing = [
                 k for k in self.dict_planner.keys()
                 if k not in self.planned[source]
             ]
     if self.verbose:
         print("Missing, first pass", missing)
     list_results = self.launch_things(
         lambda x:
         (x, self.dict_planner[x].solve(from_observation=Memory([source],
                                                                maxlen=1),
                                        verbose=self.verbose,
                                        render=False)), missing)
     for key_scenario, (cost, path) in list_results:
         action = self.dict_planner[key_scenario].get_next_action(
             Memory([source]))
         self.action_by_scenario[source][key_scenario] = action
         self.q_values_scenar[source][action][key_scenario] = cost
         self.planned[source][key_scenario] = True
         if source not in self.q_values:
             self.q_values[source] = {}
         if action not in self.q_values[source]:
             self.q_values[source][action] = 0.
         # Accumulate the scenario-weighted cost for (source, action).
         self.q_values[source][action] += cost * self.weight_scenario[
             key_scenario]
         self.plan_by_scenario[source][key_scenario] = path
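first_pass relies on a launch_things helper that is not shown in these excerpts. A plausible minimal version, assuming it only has to map a function over the scenario keys (serially, or with a thread pool controlled by a hypothetical nb_workers attribute) and return the (key, result) pairs, could be:

from concurrent.futures import ThreadPoolExecutor

 def launch_things(self, fun, keys):
     # Hypothetical helper (not part of the excerpts above): apply `fun` to
     # every scenario key, optionally in parallel, and collect the results.
     nb_workers = getattr(self, "nb_workers", 1)
     if nb_workers <= 1:
         return [fun(k) for k in keys]
     with ThreadPoolExecutor(max_workers=nb_workers) as executor:
         return list(executor.map(fun, keys))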
Example #3
 def _get_next_action(
     self, observation: D.T_agent[D.T_observation]
 ) -> D.T_agent[D.T_concurrency[D.T_event]]:
     # This solver selects the first action with the highest expected immediate reward (greedy)
     domain = self._domain
     memory = Memory([
         observation
     ])  # note: observation == state (because FullyObservable)
     applicable_actions = domain.get_applicable_actions(memory)
     if domain.is_transition_value_dependent_on_next_state():
         values = []
         for a in applicable_actions.get_elements():
             next_state_prob = domain.get_next_state_distribution(
                 memory, [a]).get_values()
             expected_value = sum(
                 p * domain.get_transition_value(memory, [a], s).reward
                 for s, p in next_state_prob)
             values.append(expected_value)
     else:
         # Transition value does not depend on the next state, so it can be
         # queried directly for each applicable action.
         values = [
             domain.get_transition_value(memory, [a]).reward
             for a in applicable_actions.get_elements()
         ]
     argmax = max(range(len(values)), key=lambda i: values[i])
     return [applicable_actions.get_elements()[argmax]
             ]  # list of action here because we handle Parallel domains
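The expectation/argmax pattern in the first branch can be checked in isolation with plain Python (toy numbers, unrelated to any particular domain):

# Toy illustration of the expected-immediate-reward greedy choice.
applicable_actions = ["left", "right"]
next_state_prob = {
    "left": [("s1", 0.8), ("s2", 0.2)],
    "right": [("s1", 0.1), ("s2", 0.9)],
}
reward = {"s1": 1.0, "s2": -1.0}
values = [sum(p * reward[s] for s, p in next_state_prob[a])
          for a in applicable_actions]
argmax = max(range(len(values)), key=lambda i: values[i])
assert applicable_actions[argmax] == "left"   # 0.6 beats -0.8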
Example #4
    def _tree_search(self, state, h_act, h_obs, depth):
        """UCT search from a given state with act/obs history.

        This corresponds to the Simulate function in the POMCP paper.
        """
        # This must be a history that ends on an observation
        assert len(h_act) == len(h_obs)
        if depth > self._max_depth:
            return self._VLV
        if (h_act, h_obs) not in self._tree:
            # generate new child nodes
            for action in self._domain.get_applicable_actions(Memory(
                [state])).get_elements():
                assert action is not None
                self._tree[(h_act + (action, ), h_obs)] = [0, 0, []]
            # but we must also store this node, or we'll never get out of this case!
            cost = self._rollout(state, h_act, h_obs, depth)
            self._tree[(h_act, h_obs)] = [1, cost, [state]]
            return cost
        else:
            # pick a successor node according to the UCT formula
            action = self._get_best_action(h_act, h_obs, w=self._max_depth)
            assert action is not None
            # simulate outcome of this action:
            new_state = self._domain.get_next_state_distribution(
                Memory([state]), action).sample()
            TV = self._domain.get_transition_value(Memory([state]), action,
                                                   new_state)
            # The observation is emitted by the state reached after the
            # transition, not the state we started from.
            new_obs = self._domain.get_observation_distribution(
                new_state, action).sample()
            if self._domain.is_goal(new_obs):
                s_cost = TV.cost
            else:
                s_cost = TV.cost + self._tree_search(new_state, h_act +
                                                     (action, ), h_obs +
                                                     (new_obs, ), depth + 1)
                s_cost = min(s_cost, self._VLV)
            this_node = self._tree[(h_act, h_obs)]
            succ_node = self._tree[(h_act + (action, ), h_obs)]
            # update average cost for succ node:
            succ_node[1] = (
                (succ_node[1] * succ_node[0]) + s_cost) / (succ_node[0] + 1)
            # increment visit counters for both this node and succ node:
            this_node[0] = this_node[0] + 1
            succ_node[0] = succ_node[0] + 1
            return s_cost
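Example #4 calls a _get_best_action helper that is not among these excerpts. A possible sketch, assuming the [visit_count, mean_cost, particles] node layout created above and a cost-minimising UCB rule (the exploration bonus is therefore subtracted):

import math

 def _get_best_action(self, h_act, h_obs, w=1.0):
     # Hypothetical sketch of the UCT selection referenced above, not the
     # library's actual implementation.
     parent_visits = max(self._tree[(h_act, h_obs)][0], 1)
     best_action, best_score = None, float("inf")
     for (acts, obs), (visits, mean_cost, _) in self._tree.items():
         if obs != h_obs or len(acts) != len(h_act) + 1 or acts[:-1] != h_act:
             continue  # not a direct child of the current node
         if visits == 0:
             return acts[-1]  # expand unvisited children first
         score = mean_cost - w * math.sqrt(math.log(parent_visits) / visits)
         if score < best_score:
             best_action, best_score = acts[-1], score
     return best_action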
Example #5
 def _update_belief_state(self, belief, action):
     """Propagate each belief particle one step through the transition model."""
     new_belief = []
     for state in belief:
         d = self._domain.get_next_state_distribution(
             Memory([state]), action)
         new_state = d.sample()
         new_belief.append(new_state)
     return new_belief
Example #6
 def _update_belief_state(self, belief, action):
     """Propagate each belief particle one step.

     Before any action has been taken, particles are re-drawn from the
     initial state distribution instead of the transition model.
     """
     new_belief = []
     for state in belief:
         d = (self._domain.get_next_state_distribution(
             Memory([state]), action) if action is not None else
              self._domain.get_initial_state_distribution())
         new_state = d.sample()
         new_belief.append(new_state)
     return new_belief
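Both versions implement a simple particle-filter update: each particle in the belief is pushed through the (stochastic) transition model independently. A self-contained toy equivalent, with an integer state and a noisy step standing in for get_next_state_distribution:

import random

def noisy_step(state, action):
    # Toy transition: move by `action` plus unit noise.
    return state + action + random.choice([-1, 0, 1])

belief = [10, 10, 11, 9]                  # particles approximating the state
new_belief = [noisy_step(s, 2) for s in belief]
print(new_belief)                         # e.g. [12, 13, 11, 12]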
Example #7
 def _rollout(self, state, h_act, h_obs, depth):
     """Estimate cost-to-go from `state` with a random policy (POMCP rollout)."""
     if depth > self._max_depth:
         return self._VLV
     action = self._get_random_action(state, h_act, h_obs, depth)
     assert action is not None
     new_state = self._domain.get_next_state_distribution(
         Memory([state]), action).sample()
     TV = self._domain.get_transition_value(Memory([state]), action,
                                            new_state)
     # As in the tree search, the observation is sampled for the successor
     # state reached by the action.
     new_obs = self._domain.get_observation_distribution(
         new_state, action).sample()
     if self._domain.is_goal(new_obs):
         s_cost = TV.cost
     else:
         s_cost = TV.cost + self._rollout(new_state, h_act +
                                          (action, ), h_obs +
                                          (new_obs, ), depth + 1)
         s_cost = min(s_cost, self._VLV)
     return s_cost
Example #8
 def build_graph_domain(self,
                        init_state: Any = None,
                        transition_extractor=None,
                        verbose=True) -> GraphDomain:
     """Explore the domain depth-first from init_state and build a GraphDomain.

     Exploration stops once more than max_path goal paths have been found, or
     once the node/edge budgets are exceeded with at least one goal path found.
     """
     if transition_extractor is None:
         transition_extractor = lambda s, a, s_prime: {
             "cost": self.domain.get_transition_value(s, a, s_prime).cost
         }
     next_state_map = {}
     next_state_attributes = {}
     if init_state is None:
         init_state = self.domain.get_initial_state()
     stack = [(init_state, [init_state])]
     nb_nodes = 1
     nb_edges = 0
     nb_path = 0
     next_state_map[init_state] = {}
     next_state_attributes[init_state] = {}
     paths_dict = {}
     while stack:
         (vertex, path) = stack.pop()
         actions = self.domain.get_applicable_actions(vertex).get_elements()
         for action in actions:
             next = self.domain.get_next_state(Memory([vertex]), action)
             if action not in next_state_map[vertex]:
                 nb_edges += 1
             else:
                 continue
             next_state_map[vertex][action] = next
             next_state_attributes[vertex][action] = transition_extractor(
                 vertex, action, next)
             if self.domain.is_goal(next):
                 nb_path += 1
                 if verbose:
                     print(nb_path, " / ", self.max_path)
                     print("nodes  ", nb_nodes, " / ", self.max_nodes)
                     print("edges  ", nb_edges, " / ", self.max_edges)
             else:
                 if next not in next_state_map:
                     stack.append((next, path + [next]))
                     paths_dict[next] = {tuple(path + [next])}
                 # else:
                 #     if tuple(path+[next]) not in paths_dict[next]:
                 #        stack.append((next, path + [next]))
                 #        paths_dict[next].add(tuple(path + [next]))
             if next not in next_state_map:
                 next_state_map[next] = {}
                 next_state_attributes[next] = {}
                 nb_nodes += 1
         if (nb_path > self.max_path
                 or (nb_nodes > self.max_nodes and nb_path >= 1)
                 or (nb_edges > self.max_edges and nb_path >= 1)):
             break
     return GraphDomain(next_state_map, next_state_attributes, None, None)
Example #9
 def build_graph_domain(self, init_state: Any = None) -> GraphDomain:
     """Variant of the graph builder with fixed cost/reward edge attributes."""
     next_state_map = {}
     next_state_attributes = {}
     if init_state is None:
         init_state = self.domain.get_initial_state()
     stack = [(init_state, [init_state])]
     nb_nodes = 1
     nb_edges = 0
     nb_path = 0
     next_state_map[init_state] = {}
     next_state_attributes[init_state] = {}
     while stack:
         (vertex, path) = stack.pop()
         actions = self.domain.get_applicable_actions(vertex).get_elements()
         for action in actions:
             next = self.domain.get_next_state(vertex, action)
             is_new = next not in next_state_map
             if is_new:
                 next_state_map[next] = {}
                 next_state_attributes[next] = {}
                 nb_nodes += 1
             if action not in next_state_map[vertex]:
                 nb_edges += 1
             next_state_map[vertex][action] = next
             # Query the transition value once and store both cost and reward.
             value = self.domain.get_transition_value(Memory([vertex]), action,
                                                      next)
             next_state_attributes[vertex][action] = {
                 "cost": value.cost,
                 "reward": value.reward,
             }
             if self.domain.is_goal(next):
                 nb_path += 1
             elif is_new:
                 # The successor was registered in next_state_map above, so a
                 # second membership test would never push it; use the flag
                 # computed before registration.
                 stack.append((next, path + [next]))
         if (nb_path > self.max_path
                 or (nb_nodes > self.max_nodes and nb_path >= 1)
                 or (nb_edges > self.max_edges and nb_path >= 1)):
             break
     return GraphDomain(next_state_map, next_state_attributes, None, None)
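Both graph-building variants ultimately hand two nested dictionaries to GraphDomain: state -> action -> successor, and state -> action -> edge attributes. For a tiny hand-built deterministic domain the payload would look like this (toy values, not produced by the code above):

# Shape of the data passed to GraphDomain(next_state_map, next_state_attributes, None, None).
next_state_map = {
    "s0": {"go": "s1"},
    "s1": {"finish": "goal"},
    "goal": {},
}
next_state_attributes = {
    "s0": {"go": {"cost": 1.0, "reward": -1.0}},
    "s1": {"finish": {"cost": 2.0, "reward": -2.0}},
    "goal": {},
}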
Example #10
 def _get_random_action(self, state, h_act, h_obs, depth):
     # Uniform rollout policy: sample any applicable action in the current
     # state (the history and depth arguments are unused here).
     return self._domain.get_applicable_actions(Memory([state])).sample()