def traverse(self, belief_node, tree_depth, start_time):
    delayed_reward = 0
    state = belief_node.sample_particle()

    # Time expired
    if time.time() - start_time > self.model.action_selection_timeout:
        console(4, module, "action selection timeout")
        return 0

    action = ucb_action(self, belief_node, False)

    # Search horizon reached
    if tree_depth >= self.model.max_depth:
        console(4, module, "Search horizon reached")
        return 0

    step_result, is_legal = self.model.generate_step(state, action)

    child_belief_node = belief_node.child(action, step_result.observation)
    if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
        child_belief_node, added = belief_node.create_or_get_child(action, step_result.observation)

    if not step_result.is_terminal or not is_legal:
        tree_depth += 1
        if child_belief_node is not None:
            # Add S' to the new belief node
            # Add a state particle with the new state
            if len(child_belief_node.state_particles) < self.model.max_particle_count:
                child_belief_node.state_particles.append(step_result.next_state)
            delayed_reward = self.traverse(child_belief_node, tree_depth, start_time)
        else:
            delayed_reward = self.rollout(belief_node)
        tree_depth -= 1
    else:
        console(4, module, "Reached terminal state.")

    # delayed_reward is "Q maximal"
    # current_q_value is the Q value of the current belief-action pair
    action_mapping_entry = belief_node.action_map.get_entry(action.bin_number)

    q_value = action_mapping_entry.mean_q_value
    # STEP: update the p value as well
    p_value = action_mapping_entry.mean_p_value

    # off-policy Q learning update rule
    q_value += (step_result.reward + (self.model.discount * delayed_reward) - q_value)
    p_value += 1 if step_result.observation.is_obstacle or not is_legal else 0

    action_mapping_entry.update_visit_count(1)
    action_mapping_entry.update_q_value(q_value)
    action_mapping_entry.update_p_value(p_value)

    # Add RAVE ?
    return q_value
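# The p-value bookkeeping above relies on `mean_p_value` / `update_p_value` on the
# action-mapping entry, which is not defined in the code shown here. Below is a minimal
# sketch of how an entry could maintain both running statistics; the class name,
# attribute names, and the running-mean scheme are assumptions for illustration only.
class PenalizedActionMappingEntry(object):
    """Tracks a running-mean Q value and a running-mean obstacle penalty (P value)."""

    def __init__(self):
        self.visit_count = 0
        self.total_q_value = 0.0
        self.total_p_value = 0.0
        self.mean_q_value = 0.0
        self.mean_p_value = 0.0

    def update_visit_count(self, delta):
        self.visit_count += delta

    def update_q_value(self, new_q):
        # Accumulate the latest sampled backup target and recompute the mean.
        self.total_q_value += new_q
        if self.visit_count > 0:
            self.mean_q_value = self.total_q_value / self.visit_count

    def update_p_value(self, new_p):
        # Same running-mean scheme for the obstacle/illegal-move penalty.
        self.total_p_value += new_p
        if self.visit_count > 0:
            self.mean_p_value = self.total_p_value / self.visit_count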
def select_action(self):
    """
    Return an action given the current belief, as marked by the belief tree iterator,
    using an epsilon-greedy policy. If necessary, first carry out a rollout_search to
    expand the episode.
    :return:
    """
    if self.disable_tree:
        self.rollout_search(self.policy_iterator)
    return ucb_action(None, self.policy_iterator, greedy=True)
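# For context: `ucb_action` selects among the child actions of a belief node by a UCB1-style
# score. The sketch below shows the selection rule under the assumption that each action entry
# exposes `mean_q_value` and `visit_count` (both used elsewhere in this code); the default
# exploration coefficient, the infinite bonus for untried actions, and returning the entry
# rather than the action object are assumptions, not necessarily the behavior of the real
# `ucb_action`. With greedy=True the exploration bonus is dropped, so the call above simply
# picks the action with the highest mean Q value under the current belief.
import math
import random

def ucb_select(belief_node, ucb_coefficient=3.0, greedy=False):
    """Pick the action entry maximizing mean Q (plus a UCB1 exploration bonus unless greedy)."""
    entries = list(belief_node.action_map.entries.values())
    total_visits = max(1, belief_node.action_map.total_visit_count)
    best_value = -float('inf')
    best_entries = []
    for entry in entries:
        value = entry.mean_q_value
        if not greedy:
            if entry.visit_count == 0:
                value = float('inf')  # expand each untried action at least once
            else:
                value += ucb_coefficient * math.sqrt(math.log(total_visits) / entry.visit_count)
        if value > best_value:
            best_value = value
            best_entries = [entry]
        elif value == best_value:
            best_entries.append(entry)  # break ties uniformly at random
    return random.choice(best_entries)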
def select_eps_greedy_action(self, eps, start_time):
    """
    Start the Monte-Carlo Tree Search and return the selected action. If the belief tree
    data structure is disabled, a random rollout is used instead.
    """
    if self.disable_tree:
        self.rollout_search(self.belief_tree_index)
    else:
        # Simulation phase: UCB action selection inside the tree traversal estimates
        # Q values and expands the tree.
        self.monte_carlo_approx(eps, start_time)
    # After the simulations, choose the best action greedily by mean_q_value.
    return ucb_action(self, self.belief_tree_index, True)
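# For context, `monte_carlo_approx` is the simulation loop that repeatedly calls the
# `traverse` routines below starting from the current root belief node. A minimal sketch,
# assuming the model exposes a simulation budget `n_sims` (the attribute name is an
# assumption, not taken from the code above):
def monte_carlo_approx_sketch(self, eps, start_time):
    for _ in range(self.model.n_sims):
        # Each simulation samples a state particle from the root belief and walks the tree
        # until the depth limit, the time budget, or a terminal state is reached.
        # `eps` would be threaded through by epsilon-greedy variants of the tree policy.
        self.traverse(self.belief_tree_index, 0, start_time)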
def traverse(self, belief_node, tree_depth, start_time):
    delayed_reward = 0

    # s ~ B(h): the belief tree itself does not change here, only the sampled state.
    state = belief_node.sample_particle()

    # Sample an action: argmax of the UCB score; inside the simulation this drives
    # Q-value estimation and tree expansion.
    action = ucb_action(self, belief_node, False)

    # Search horizon reached
    if tree_depth >= self.model.max_depth:
        console(4, module, "Search horizon reached")
        return 0

    # Black-box generative model: (s', o, r) from (s, a)
    step_result, is_legal = self.model.generate_step(state, action)

    # h' <- (h, a, o); the child belief node may not exist yet
    child_belief_node = belief_node.child(action, step_result.observation)
    if child_belief_node is None and not step_result.is_terminal and belief_node.action_map.total_visit_count > 0:
        child_belief_node, added = belief_node.create_or_get_child(action, step_result.observation)

    if not step_result.is_terminal or not is_legal:
        tree_depth += 1
        if child_belief_node is not None:
            # Add s' to the new belief node as a state particle
            if len(child_belief_node.state_particles) < self.model.max_particle_count:
                child_belief_node.state_particles.append(step_result.next_state)
            delayed_reward = self.traverse(child_belief_node, tree_depth, start_time)  # recursion
        else:
            delayed_reward = self.rollout(belief_node)  # if child_belief_node is None
        tree_depth -= 1
    else:
        console(4, module, "Reached terminal state.")

    # delayed_reward is "Q maximal"
    # current_q_value is the Q value of the current belief-action pair
    action_mapping_entry = belief_node.action_map.get_entry(action.bin_number)

    q_value = action_mapping_entry.mean_q_value

    # off-policy Q learning update rule
    q_value += (step_result.reward + (self.model.discount * delayed_reward) - q_value)

    action_mapping_entry.update_visit_count(1)
    action_mapping_entry.update_q_value(q_value)

    # Add RAVE ?
    return q_value
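# When no child belief node exists, the variant above falls back to `rollout`, i.e. a
# random-policy playout used to estimate the value of a leaf. A minimal sketch is given
# below; `sample_an_action` is an assumed random-action helper and not part of the code
# shown here, while `generate_step`, `discount`, and `max_depth` are used above.
def rollout_sketch(self, belief_node):
    """Estimate a leaf value with a random playout of discounted rewards."""
    state = belief_node.sample_particle()
    total_reward, discount, depth = 0.0, 1.0, 0
    while depth < self.model.max_depth:
        action = self.model.sample_an_action()  # assumed helper returning a random legal action
        step_result, is_legal = self.model.generate_step(state, action)
        total_reward += discount * step_result.reward
        discount *= self.model.discount
        state = step_result.next_state
        depth += 1
        if step_result.is_terminal or not is_legal:
            break
    return total_reward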
def traverse(self, belief_node, tree_depth, start_time):
    delayed_reward = 0
    state = belief_node.sample_particle()

    # Time expired
    if time.time() - start_time > self.model.action_selection_timeout:
        console(4, module, "action selection timeout")
        return 0

    action = ucb_action(self, belief_node, False)

    # Search horizon reached
    if tree_depth >= self.model.max_depth:
        console(4, module, "Search horizon reached")
        return 0

    step_result, is_legal = self.model.generate_step(state, action)

    # If the belief_node -> action child -> (a, o) belief child already exists, reuse it
    child_belief_node = belief_node.child(action, step_result.observation)

    # Grow the belief tree by constructing the new child_belief_node
    if child_belief_node is None and not step_result.is_terminal and belief_node.visited:
        child_belief_node, added = belief_node.create_or_get_child(action, step_result.observation)

    if not step_result.is_terminal or not is_legal:
        if child_belief_node is not None:
            tree_depth += 1
            # Add s' to the new belief node as a state particle
            if len(child_belief_node.state_particles) < self.model.max_particle_count:
                child_belief_node.state_particles.append(step_result.next_state)
            delayed_reward = self.traverse(child_belief_node, tree_depth, start_time)
        else:
            delayed_reward = self.rollout_from_state(state)
        belief_node.visited = True
        # total_reward = step_result.reward + (self.model.discount * delayed_reward)
        # return total_reward
    else:
        console(4, module, "Reached terminal state.")

    # delayed_reward is "Q maximal"
    # current_q_value is the Q value of the current belief-action pair
    action_mapping_entry = belief_node.action_map.get_entry(action.bin_number)

    q_value = action_mapping_entry.mean_q_value

    # off-policy Q learning update rule
    q_value += (step_result.reward + (self.model.discount * delayed_reward) - q_value)

    action_mapping_entry.update_visit_count(1)
    action_mapping_entry.update_q_value(q_value)

    # Off-policy Q-learning backup: propagate the best mean Q value over all actions
    # (requires `import numpy as np` at module level)
    max_q_value = -np.inf
    for action_entry in belief_node.action_map.entries.values():
        if action_entry.mean_q_value > max_q_value:
            max_q_value = action_entry.mean_q_value

    # Add RAVE ?
    return max_q_value
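# The variant above differs from the first `traverse` in its backup rule: instead of
# returning the freshly sampled q_value (a POMCP-style, on-policy backup of the sampled
# return), it propagates the best mean Q value over all actions at this belief node, a
# Q-learning-style max backup. The sketch below only illustrates the two backup targets;
# the helper name and the `use_max_backup` flag are assumptions for illustration.
def backed_up_value(belief_node, sampled_q_value, use_max_backup=True):
    """Return the value propagated to the parent node under either backup rule."""
    if not use_max_backup:
        return sampled_q_value  # POMCP-style: propagate the sampled return
    # Q-learning-style: propagate max over mean Q values of all actions at this node
    return max(entry.mean_q_value for entry in belief_node.action_map.entries.values())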