Example #1
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        state = belief_node.sample_particle()

        # Time expired
        if time.time() - start_time > self.model.action_selection_timeout:
            console(4, module, "action selection timeout")
            return 0

        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        step_result, is_legal = self.model.generate_step(state, action)

        child_belief_node = belief_node.child(action, step_result.observation)
        if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            tree_depth += 1
            if child_belief_node is not None:
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(
                        step_result.next_state)
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                delayed_reward = self.rollout(belief_node)
            tree_depth -= 1
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value
        # Also update the p value, which tracks obstacle observations and illegal steps for this action
        p_value = action_mapping_entry.mean_p_value
        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)
        p_value += 1 if step_result.observation.is_obstacle or not is_legal else 0
        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)
        action_mapping_entry.update_p_value(p_value)

        # Add RAVE ?
        return q_value
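
Note that the "off-policy Q learning update rule" above collapses to q_value = step_result.reward + self.model.discount * delayed_reward, because the q_value read from mean_q_value cancels out; the averaging into mean_q_value is delegated to the action mapping entry. That entry's implementation is not shown on this page. The stand-in below (class name and fields are placeholders, not the actual pomdpy API) sketches one plausible way update_visit_count and update_q_value could maintain a running mean consistent with how they are called here.

class ActionStatsSketch:
    """Hypothetical stand-in for an action mapping entry (illustration only)."""

    def __init__(self):
        self.visit_count = 0
        self.total_q_value = 0.0
        self.mean_q_value = 0.0

    def update_visit_count(self, delta):
        # Called with 1 once per simulation that takes this action.
        self.visit_count += delta

    def update_q_value(self, q_sample):
        # Accumulate the sampled one-step return and recompute the running mean.
        self.total_q_value += q_sample
        if self.visit_count > 0:
            self.mean_q_value = self.total_q_value / self.visit_count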
Example #2
    def select_action(self):
        """
        Return an action given the current belief, as marked by the belief tree iterator, using an epsilon-greedy policy.

        If necessary, first carry out a rollout_search to expand the episode
        :return:
        """
        if self.disable_tree:
            self.rollout_search(self.policy_iterator)
        return ucb_action(None, self.policy_iterator, greedy=True)
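
ucb_action is imported from elsewhere in the library and is not reproduced in these examples. As an illustration only, a UCB1-style selector with a greedy flag might look like the sketch below; entry.get_action(), entry.visit_count and mcts.model.ucb_coefficient are assumed names for this sketch, not the library's confirmed API (action_map.entries and mean_q_value do appear in Example #6 further down).

import math


def ucb_action_sketch(mcts, belief_node, greedy=False):
    """Illustrative UCB1 selector, not the library's actual ucb_action."""
    best_action, best_value = None, -float('inf')
    total_visits = belief_node.action_map.total_visit_count
    for entry in belief_node.action_map.entries.values():
        if not greedy and entry.visit_count == 0:
            # Unvisited actions are tried first during the search phase.
            return entry.get_action()
        value = entry.mean_q_value
        if not greedy:
            # UCB1 exploration bonus: c * sqrt(ln(N) / n_a).
            # mcts may be None in greedy mode, as in Example #2, so it is only used here.
            value += mcts.model.ucb_coefficient * math.sqrt(
                math.log(total_visits + 1) / entry.visit_count)
        if value > best_value:
            best_action, best_value = entry.get_action(), value
    return best_action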
Example #3
 def select_eps_greedy_action(self, eps, start_time):
     """
     Starts off the Monte-Carlo Tree Search and returns the selected action. If the belief tree
             data structure is disabled, random rollout is used.
     """
     if self.disable_tree:
         self.rollout_search(self.belief_tree_index)
     else:
         self.monte_carlo_approx(eps, start_time)
     return ucb_action(self, self.belief_tree_index, True)
Example #4
 def select_eps_greedy_action(self, eps, start_time):
     """
     Starts off the Monte-Carlo Tree Search and returns the selected action. If the belief tree
             data structure is disabled, random rollout is used.
     """
     if self.disable_tree:
         self.rollout_search(self.belief_tree_index)
     else:
         # Simulation phase: monte_carlo_approx uses UCB action selection to update Q values
         self.monte_carlo_approx(eps, start_time)
     # After the simulations, choose the best action greedily by its mean_q_value
     return ucb_action(self, self.belief_tree_index, True)
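
monte_carlo_approx itself is not shown in these examples. Assuming it simply runs traverse from the current root belief until the action-selection timeout seen in Examples #1 and #6 expires (the loop below is a guess for illustration, not the confirmed implementation), it could look roughly like this:

import time


def monte_carlo_approx_sketch(self, eps, start_time):
    """Illustrative simulation loop, not the library's actual monte_carlo_approx."""
    # eps is accepted for an epsilon-greedy variant; unused in this sketch.
    while time.time() - start_time < self.model.action_selection_timeout:
        # Each iteration runs one simulation from the root belief node,
        # backing Q-value updates up the path of visited belief nodes.
        self.traverse(self.belief_tree_index, 0, start_time)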
Example #5
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        # s ~ B: sample a state particle from the belief; the belief tree itself does not change
        state = belief_node.sample_particle()

        # Sample an action via UCB argmax; inside the simulation, Q-values are computed to expand the tree
        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        # Black-box generative model produces (next state, observation, reward)
        step_result, is_legal = self.model.generate_step(state, action)

        # h' <- (h, a, o): look up the child belief node, which may be None
        child_belief_node = belief_node.child(action, step_result.observation)

        if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            tree_depth += 1
            if child_belief_node is not None:
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(
                        step_result.next_state)
                # Recurse deeper into the tree
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                # No child belief node yet: estimate the return with a rollout
                delayed_reward = self.rollout(belief_node)
            tree_depth -= 1
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value

        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)

        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)

        # Add RAVE ?
        return q_value
Example #6
    def traverse(self, belief_node, tree_depth, start_time):
        delayed_reward = 0

        state = belief_node.sample_particle()

        # Time expired
        if time.time() - start_time > self.model.action_selection_timeout:
            console(4, module, "action selection timeout")
            return 0

        action = ucb_action(self, belief_node, False)

        # Search horizon reached
        if tree_depth >= self.model.max_depth:
            console(4, module, "Search horizon reached")
            return 0

        step_result, is_legal = self.model.generate_step(state, action)

        # Look up the (action, observation) child belief node if it already exists in the tree
        # print "simulate: action=", action.bin_number, " obs=", step_result.observation.is_good, "total visit=", belief_node.action_map.total_visit_count, "depth=", belief_node.depth
        child_belief_node = belief_node.child(action, step_result.observation)

        # grow the belief tree by constructing the new child_belief_node
        if child_belief_node is None and not step_result.is_terminal and belief_node.visited:
            child_belief_node, added = belief_node.create_or_get_child(
                action, step_result.observation)

        if not step_result.is_terminal or not is_legal:
            if child_belief_node is not None:
                tree_depth += 1
                # Add S' to the new belief node
                # Add a state particle with the new state
                if len(child_belief_node.state_particles) < self.model.max_particle_count:
                    child_belief_node.state_particles.append(
                        step_result.next_state)
                delayed_reward = self.traverse(child_belief_node, tree_depth,
                                               start_time)
            else:
                delayed_reward = self.rollout_from_state(state)
                belief_node.visited = True
                # total_reward = step_result.reward + (self.model.discount * delayed_reward)
                # return total_reward
        else:
            console(4, module, "Reached terminal state.")

        # delayed_reward is "Q maximal"
        # q_value is the Q value of the current belief-action pair
        action_mapping_entry = belief_node.action_map.get_entry(
            action.bin_number)

        q_value = action_mapping_entry.mean_q_value

        # off-policy Q learning update rule
        q_value += (step_result.reward +
                    (self.model.discount * delayed_reward) - q_value)

        action_mapping_entry.update_visit_count(1)
        action_mapping_entry.update_q_value(q_value)

        # Off-policy (Q-learning) backup: return the maximum Q value over all actions
        # at this belief, so the parent backs up max_a Q(b', a) rather than the sampled action's Q
        max_q_value = -np.inf
        for action_entry in belief_node.action_map.entries.values():
            if action_entry.mean_q_value > max_q_value:
                max_q_value = action_entry.mean_q_value

        # Add RAVE ?
        return max_q_value
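
The main difference from Examples #1 and #5 is the value backed up to the parent call: Example #6 returns max_a Q(b, a), a Q-learning-style off-policy backup, while the earlier variants return the Q value of the action that was actually simulated. The helper below (illustration only, not library code) spells out what the parent's update target amounts to once traverse() returns that maximum from the child belief node.

def parent_backup_target_sketch(parent_reward, child_belief_node, discount):
    """Illustration only: the target used by the parent's Q update in Example #6
    once traverse() has returned max_a Q(b', a) from the child belief node b'."""
    best_child_q = max(
        (entry.mean_q_value
         for entry in child_belief_node.action_map.entries.values()),
        default=0.0)
    return parent_reward + discount * best_child_q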