Code Example #1
File: mcts.py  Project: sungbinlim/voot
    def __init__(self, widening_parameter, exploration_parameters,
                 sampling_strategy, sampling_strategy_exploration_parameter,
                 c1, n_feasibility_checks, environment, domain_name):
        self.c1 = c1
        self.progressive_widening_parameter = widening_parameter
        self.exploration_parameters = exploration_parameters
        self.time_limit = np.inf
        if domain_name == 'namo':
            self.discount_rate = 0.9
        else:
            self.discount_rate = 1
        self.environment = environment
        self.sampling_strategy = sampling_strategy
        self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter
        self.depth_limit = 300

        self.env = self.environment.env
        self.robot = self.environment.robot
        self.s0_node = self.create_node(
            None,
            depth=0,
            reward=0,
            objects_not_in_goal=self.environment.objects_not_in_goal,
            is_init_node=True)

        self.original_s0_node = self.s0_node
        self.tree = MCTSTree(self.s0_node, self.exploration_parameters)
        self.found_solution = False
        if self.environment.name == 'namo':
            self.goal_reward = 2
        else:
            self.goal_reward = 0
        self.n_feasibility_checks = n_feasibility_checks
        """
        if domain_name == 'convbelt':
            self.depth_limit = 10
            self.is_satisficing_problem = True
        elif domain_name == 'namo':
            self.depth_limit = np.inf
            self.is_satisficing_problem = False
        """
Code Example #2
File: gomoku_main.py  Project: Jazhang01/GomokuAI
    def onClick(self, event):
        x,y = self.get_intersection(event.x, event.y)
        if (x != -1 and y != -1):
            board_coords = self.get_board_coordinates(x, y)
            if not self.first_move: board = self.board_state.get_board()
            if self.first_move or board[board_coords[1]][board_coords[0]] == 0: # player is able to place piece
                if self.first_move:
                    self.first_move = False
                    self.placed_pieces.append(Piece(board_coords[1],
                                                    board_coords[0],
                                                    self.player_turn,
                                                    self.placePiece(x, y),
                                                    self))
                    new_board = [[0] * BOARD_SIZE for _ in range(BOARD_SIZE)]
                    new_board[board_coords[1]][board_coords[0]] = BLACK
                    self.board_state = BoardState(grid=new_board,
                                                  recent_move=(board_coords[1], board_coords[0]), turn=BLACK, search_breadth=1)
                    self.past_board_states.append(self.board_state)
                    self.player_turn = (-1)*self.player_turn

                else:
                    self.placed_pieces.append(Piece(board_coords[1],
                                                    board_coords[0],
                                                    self.player_turn,
                                                    self.placePiece(x, y),
                                                    self))
                    self.past_board_states.append(self.board_state)
                    self.board_state = self.board_state.play(board_coords[1],board_coords[0])
                    self.player_turn = (-1)*self.player_turn

                possible_winner = self.board_state.get_winner()
                if possible_winner != 0:
                    self.winner(possible_winner)
                else:
                    ai_mcts_node = MCTSNode(self.board_state)
                    ai_mcts_tree = MCTSTree(ai_mcts_node)

                    next_state = ai_mcts_tree.best_move(time_cutoff=WAIT_TIME)
                    self.board_state = next_state

                    ai_move = next_state.get_recent_move()
                    print("("+str(ai_move[0])+", "+str(ai_move[1])+")")
                    self.placed_pieces.append(Piece(ai_move[1],
                                                    ai_move[0],
                                                    self.player_turn,
                                                    self.placePiece((ai_move[1]+1)*self.grid_interval, (ai_move[0]+1)*self.grid_interval),
                                                    self))
                    self.past_board_states.append(self.board_state)
                    self.player_turn = (-1) * self.player_turn

                    possible_winner = self.board_state.get_winner()
                    if possible_winner != 0:
                        self.winner(possible_winner)
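
After the human move, the handler above hands the updated position to MCTSTree.best_move(time_cutoff=WAIT_TIME) and plays whatever state comes back. The tree internals are not shown in this example, so the following is a minimal standalone sketch of a time-budgeted best_move loop; best_move, run_one_simulation and the visit-count dictionary here are illustrative stand-ins, not the project's API.

import random
import time

def best_move(visit_counts, run_one_simulation, time_cutoff):
    # Keep simulating until the time budget is spent, then return the
    # most-visited move at the root.
    deadline = time.time() + time_cutoff
    while time.time() < deadline:
        run_one_simulation()
    return max(visit_counts, key=visit_counts.get)

# Toy illustration: each "simulation" just increments a random move's count.
visits = {(9, 8): 0, (9, 10): 0, (10, 9): 0}

def fake_simulation():
    move = random.choice(list(visits))
    visits[move] += 1

print(best_move(visits, fake_simulation, time_cutoff=0.1))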
Code Example #3
    def __init__(self, parameters, problem_env, goal_entities, v_fcn,
                 learned_q):
        # MCTS parameters
        self.widening_parameter = parameters.widening_parameter
        self.ucb_parameter = parameters.ucb_parameter
        self.time_limit = parameters.timelimit
        self.n_motion_plan_trials = parameters.n_motion_plan_trials
        self.use_ucb = parameters.use_ucb
        self.use_progressive_widening = parameters.pw
        self.n_feasibility_checks = parameters.n_feasibility_checks
        self.use_v_fcn = parameters.use_learned_q
        self.use_shaped_reward = parameters.use_shaped_reward
        self.planning_horizon = parameters.planning_horizon
        self.sampling_strategy = parameters.sampling_strategy
        self.explr_p = parameters.explr_p
        self.switch_frequency = parameters.switch_frequency
        self.parameters = parameters

        self.v_fcn = v_fcn
        self.learned_q = learned_q
        # Hard-coded params
        self.check_reachability = True
        self.discount_rate = 1.0

        # Environment setup
        self.problem_env = problem_env
        self.env = self.problem_env.env
        self.robot = self.problem_env.robot

        # MCTS initialization
        self.s0_node = None
        self.tree = MCTSTree(self.ucb_parameter)
        self.best_leaf_node = None
        self.goal_entities = goal_entities

        # Logging purpose
        self.search_time_to_reward = []
        self.reward_lists = []
        self.progress_list = []

        self.found_solution = False
        self.swept_volume_constraint = None
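
This constructor reads everything from a single parameters object. Below is a hedged sketch of such an object built with argparse.Namespace; the field names come from the attributes read above (plus pidx and planner_seed, which compute_state in Code Example #7 reads), while the values are placeholders rather than the project's defaults.

from argparse import Namespace

parameters = Namespace(
    widening_parameter=0.5,       # continuous-action widening rate
    ucb_parameter=1.0,            # exploration constant for UCB
    timelimit=300,                # planning time budget in seconds
    n_motion_plan_trials=3,
    use_ucb=True,
    pw=False,                     # progressive widening on/off
    n_feasibility_checks=50,
    use_learned_q=False,
    use_shaped_reward=False,
    planning_horizon=10,
    sampling_strategy='uniform',  # or 'voo'
    explr_p=0.3,
    switch_frequency=10,
    pidx=0,
    planner_seed=0,
)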
Code Example #4
    def __init__(self, widening_parameter, exploration_parameters,
                 sampling_strategy, sampling_strategy_exploration_parameter,
                 c1, n_feasibility_checks, environment,
                 use_progressive_widening, use_ucb, use_max_backup,
                 pick_switch, voo_sampling_mode, voo_counter_ratio, n_switch):
        self.c1 = c1
        self.widening_parameter = widening_parameter
        self.exploration_parameters = exploration_parameters
        self.time_limit = np.inf
        self.environment = environment
        if self.environment.name == 'convbelt':
            self.discount_rate = 0.99  # do we need this?
        else:
            self.discount_rate = 0.9
        if self.environment.name.find('synthetic') != -1:
            self.depth_limit = 20
        else:
            self.depth_limit = np.inf
        self.sampling_strategy = sampling_strategy
        self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter
        self.use_progressive_widening = use_progressive_widening
        self.voo_sampling_mode = voo_sampling_mode
        self.voo_counter_ratio = voo_counter_ratio
        self.use_ucb = use_ucb
        self.n_switch = n_switch
        self.n_switch_original = self.n_switch
        self.use_max_backup = use_max_backup
        self.pick_switch = pick_switch

        self.env = self.environment.env
        self.robot = self.environment.robot
        self.s0_node = self.create_node(None,
                                        depth=0,
                                        reward=0,
                                        is_init_node=True)

        self.original_s0_node = self.s0_node
        self.tree = MCTSTree(self.s0_node, self.exploration_parameters)
        self.found_solution = False
        self.goal_reward = 2
        self.infeasible_reward = -2
        self.n_feasibility_checks = n_feasibility_checks
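
The use_progressive_widening flag stored here controls when the planner samples a fresh continuous action instead of re-evaluating existing ones. The linear widening test that choose_action applies in Code Example #6 below can be stated on its own; the snippet is a standalone restatement with illustrative numbers, not project code.

def time_to_sample_new_action(n_actions, n_visits, widening_parameter):
    # Sample a new continuous action while the number of child actions is
    # still small relative to how often the node has been visited.
    return n_actions <= widening_parameter * n_visits

# With widening_parameter=0.5, a node visited 10 times supports up to 5 actions.
print(time_to_sample_new_action(n_actions=3, n_visits=10, widening_parameter=0.5))  # True
print(time_to_sample_new_action(n_actions=6, n_visits=10, widening_parameter=0.5))  # False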
Code Example #5
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0,-1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,-1, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,-1, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
                turn=-1, recent_move=(9, 8), search_breadth=2)

bs_toy = BoardState(grid=[[ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0,-1, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 1, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0]],
                turn=-1, recent_move=(3, 3), search_breadth=2)

n = MCTSNode(bs)
t = MCTSTree(n)

print("Best Move: \n", t.best_move(360))
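
In these board encodings, 1 and -1 mark the two players' stones and 0 marks an empty intersection. The project's get_winner is not shown on this page; the snippet below is an illustrative standalone five-in-a-row check over the same encoding, not the project's implementation.

def five_in_a_row_winner(grid):
    # Returns 1 or -1 if that player has five consecutive stones in any
    # direction, and 0 otherwise. grid is a list of rows holding 1/-1/0.
    n_rows, n_cols = len(grid), len(grid[0])
    directions = [(0, 1), (1, 0), (1, 1), (1, -1)]
    for r in range(n_rows):
        for c in range(n_cols):
            player = grid[r][c]
            if player == 0:
                continue
            for dr, dc in directions:
                cells = [(r + i * dr, c + i * dc) for i in range(5)]
                if all(0 <= rr < n_rows and 0 <= cc < n_cols and grid[rr][cc] == player
                       for rr, cc in cells):
                    return player
    return 0

toy = [[0] * 8 for _ in range(8)]
for i in range(5):
    toy[2][i + 1] = 1  # five in a row horizontally for player 1
print(five_in_a_row_winner(toy))  # 1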
Code Example #6
File: mcts.py  Project: sungbinlim/voot
class MCTS:
    def __init__(self, widening_parameter, exploration_parameters,
                 sampling_strategy, sampling_strategy_exploration_parameter,
                 c1, n_feasibility_checks, environment, domain_name):
        self.c1 = c1
        self.progressive_widening_parameter = widening_parameter
        self.exploration_parameters = exploration_parameters
        self.time_limit = np.inf
        if domain_name == 'namo':
            self.discount_rate = 0.9
        else:
            self.discount_rate = 1
        self.environment = environment
        self.sampling_strategy = sampling_strategy
        self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter
        self.depth_limit = 300

        self.env = self.environment.env
        self.robot = self.environment.robot
        self.s0_node = self.create_node(
            None,
            depth=0,
            reward=0,
            objects_not_in_goal=self.environment.objects_not_in_goal,
            is_init_node=True)

        self.original_s0_node = self.s0_node
        self.tree = MCTSTree(self.s0_node, self.exploration_parameters)
        self.found_solution = False
        if self.environment.name == 'namo':
            self.goal_reward = 2
        else:
            self.goal_reward = 0
        self.n_feasibility_checks = n_feasibility_checks
        """
        if domain_name == 'convbelt':
            self.depth_limit = 10
            self.is_satisficing_problem = True
        elif domain_name == 'namo':
            self.depth_limit = np.inf
            self.is_satisficing_problem = False
        """

    def create_sampling_agent(self, node, operator_name):
        if self.sampling_strategy == 'unif':
            return UniformGenerator(operator_name, self.environment)
        elif self.sampling_strategy == 'voo':
            return VOOGenerator(operator_name, self.environment,
                                self.sampling_strategy_exploration_parameter,
                                self.c1)
        elif self.sampling_strategy == 'gpucb':
            return GPUCBGenerator(operator_name, self.environment,
                                  self.sampling_strategy_exploration_parameter)
        elif self.sampling_strategy == 'doo':
            return DOOGenerator(node, self.environment,
                                self.sampling_strategy_exploration_parameter)
        elif self.sampling_strategy == 'randomized_doo':
            return RandomizedDOOGenerator(
                node, self.environment,
                self.sampling_strategy_exploration_parameter)
        else:
            print "Wrong sampling strategy"
            return -1

    def create_node(self, parent_action, depth, reward, objects_not_in_goal,
                    is_init_node):
        if self.environment.is_goal_reached():
            operator = None
        else:
            operator = self.environment.get_applicable_ops()

        state_saver = DynamicEnvironmentStateSaver(self.environment.env)
        node = TreeNode(operator, self.exploration_parameters, depth,
                        state_saver, self.sampling_strategy, is_init_node)

        if not self.environment.is_goal_reached():
            node.sampling_agent = self.create_sampling_agent(node, operator)

        node.objects_not_in_goal = objects_not_in_goal
        node.parent_action_reward = reward
        node.parent_action = parent_action
        return node

    @staticmethod
    def get_best_child_node(node):
        if len(node.children) == 0:
            return None
        else:
            # returns the most visited child
            # another option is to return the child with best value
            best_child_action_idx = np.argmax(node.N.values())
            best_child_action = node.N.keys()[best_child_action_idx]
            return node.children[best_child_action]

    def retrace_best_plan(self, best_node):
        plan = []
        #curr_node = self.get_best_goal_node()
        curr_node = best_node
        #while not curr_node.is_init_node:
        while not curr_node.parent is None:
            action = curr_node.parent_action
            path = curr_node.parent_motion
            obj = curr_node.parent.obj
            obj_name = obj.GetName()
            operator = curr_node.parent.operator
            objs_in_collision = curr_node.objs_in_collision
            plan.insert(
                0, {
                    'action':
                    action,
                    'path':
                    path,
                    'obj_name':
                    obj_name,
                    'operator':
                    operator,
                    'obj_names_in_collision':
                    [obj.GetName() for obj in objs_in_collision]
                })
            curr_node = curr_node.parent
        return plan

    def get_best_goal_node(self):
        leaves = self.tree.get_leaf_nodes()
        goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node]
        if len(goal_nodes) > 1:
            best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node(
                self.discount_rate)
            #curr_node = [leaf for leaf in goal_nodes if leaf.sum_ancestor_action_rewards == best_traj_reward][0]
        else:
            curr_node = goal_nodes[0]
        return curr_node

    def switch_init_node(self, node):
        self.environment.reset_to_init_state(node)
        self.s0_node.is_init_node = False

        self.s0_node = node
        self.s0_node.is_init_node = True
        self.found_solution = False
        """
        if self.environment.is_solving_namo:
            self.environment.set_init_namo_object_names([o.GetName() for o in node.objs_in_collision])

            # We need to keep track of what the objs in collision are for each action, so that
            # when we switch the root node, we can set that
            # More generally, we need to keep track of what objects are left to be manipulated
            # How can we do so?
            # Ideas:
            #   1) each node keeps it
            #   2) statesaver keeps it, and so the wrapper around it, when the restore is called, updates the objects
            # I guess it does not matter. What matters  is its name.
            # It should be called the objects_not_in_goal?
            self.high_level_planner.task_plan[0]['objects'] = node.objs_in_collision
            self.environment.reset_to_init_state(node)

        elif self.environment.is_solving_packing:
            curr_obj_plan = self.high_level_planner.task_plan[0]['objects']
            for obj_idx, curr_obj in enumerate(curr_obj_plan):
                if not self.environment.regions['object_region'].contains(curr_obj.ComputeAABB()):
                    break
            self.high_level_planner.set_object_index(obj_idx)
            self.environment.reset_to_init_state(node)
        """

    def choose_next_node_to_descend_to(self):
        n_visits_to_each_action = self.s0_node.N.values()
        if len(np.unique(n_visits_to_each_action)) == 1:
            # todo descend to the one that has the highest FEASIBLE Q
            best_action = self.s0_node.Q.keys()[np.argmax(
                self.s0_node.Q.values())]
        else:
            best_action = self.s0_node.N.keys()[np.argmax(
                n_visits_to_each_action)]
        # Note: this line overrides the branch above, so the max-Q action is always chosen.
        best_action = self.s0_node.Q.keys()[np.argmax(self.s0_node.Q.values())]
        best_node = self.s0_node.children[best_action]
        return best_node, best_action

    def search(self, n_iter=100, n_optimal_iter=0, max_time=np.inf):
        # n_optimal_iter: additional number of iterations you are allowed to run after finding a solution
        depth = 0
        time_to_search = 0
        search_time_to_reward = []
        optimal_iter = 0
        n_node_switch = 0
        switch_counter = 0
        found_solution_permanent = False
        reward_lists = []
        for iteration in range(n_iter):
            print '*****SIMULATION ITERATION %d' % iteration
            """
            if self.environment.is_solving_namo or self.environment.is_solving_packing:
                is_pick_node = self.s0_node.operator.find('two_arm_pick') != -1
                we_have_feasible_action = False if len(self.s0_node.Q) == 0 \
                    else np.max(self.s0_node.reward_history.values()) != self.environment.infeasible_reward
                # it will actually never switch.
                if is_pick_node:
                    if self.environment.is_solving_packing:
                        we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 30
                    else:
                        we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 10

                    #if switch_counter > 10 and not we_have_feasible_action:
                    #    print 'Going back to s0 node'
                    #    self.switch_init_node(self.original_s0_node)
                else:
                    we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 30
                    #if switch_counter > 30 and not we_have_feasible_action:
                    #    print 'Going back to s0 node'
                    #    self.switch_init_node(self.original_s0_node)

                if is_pick_node and we_evaluated_the_node_enough:
                    print "Node switching from pick node"
                    best_node, best_action = self.choose_next_node_to_descend_to()
                    self.switch_init_node(best_node)
                    switch_counter = 0
                elif (not is_pick_node) and we_evaluated_the_node_enough:
                    print "Node switching from place node"
                    best_node, best_action = self.choose_next_node_to_descend_to()
                    self.switch_init_node(best_node)
                    print 'best child reward', best_node.parent_action_reward
                    print 'best child Q', self.s0_node.parent.Q[best_action]
                    print 'best child N', self.s0_node.parent.N[best_action]
                    print 'Other Q values', self.s0_node.Q.values()
                    switch_counter = 0
            switch_counter += 1
            """

            self.environment.reset_to_init_state(self.s0_node)

            stime = time.time()
            self.simulate(self.s0_node, depth)
            time_to_search += time.time() - stime
            """
            ### logging results
            if socket.gethostname() == 'dell-XPS-15-9560':
                if self.environment.is_solving_namo:
                    pass
                    #write_dot_file(self.tree, iteration, 'solving_namo')
                elif self.environment.is_solving_packing:
                    write_dot_file(self.tree, iteration, 'solving_packing')
                elif self.environment.is_solving_fetching:
                    write_dot_file(self.tree, iteration, 'solving_fetching')
            """

            best_traj_rwd, best_node, reward_list = self.tree.get_best_trajectory_sum_rewards_and_node(
                self.discount_rate)
            search_time_to_reward.append([
                time_to_search, iteration, best_traj_rwd, self.found_solution
            ])
            reward_lists.append(reward_list)
            plan = [
                self.retrace_best_plan(best_node), best_traj_rwd,
                self.found_solution
            ]
            """
            if (iteration % 100 == 0 and iteration != 0) or (iteration == n_iter):
                self.high_level_planner.save_results(search_time_to_reward, plan, reward_lists, iteration)

            print np.array(search_time_to_reward)[:, -2], np.max(np.array(search_time_to_reward)[:, -2]), reward_list, found_solution_permanent

            if self.found_solution:
                found_solution_permanent = True
                optimal_iter += 1
                goal_node = best_node
                if self.optimal_score_achieved(best_traj_rwd):
                    print "Optimal score found"
                    break
                if self.environment.is_solving_namo and len(self.s0_node.objs_in_collision) == 0:
                    self.switch_init_node(self.original_s0_node)
                else:
                    self.switch_init_node(self.s0_node)

                self.found_solution = False
            else:
                plan = None
                goal_node = None
            """
            goal_node = None

            if time_to_search > max_time:
                break

        self.environment.reset_to_init_state(self.s0_node)
        return search_time_to_reward, plan, goal_node, reward_lists

    """
    def optimal_score_achieved(self, best_traj_rwd):
        # return best_traj_rwd == self.environment.optimal_score
        # in the case of namo, this is the length of the object
        # in the case of original problem, this is the number of objects to be packed
        return self.high_level_planner.is_optimal_score_achieved(best_traj_rwd)
    """

    def choose_action(self, curr_node):
        n_actions = len(curr_node.A)
        is_time_to_sample = n_actions <= self.progressive_widening_parameter * curr_node.Nvisited
        if len(curr_node.Q.values()) > 0:
            best_Q = np.max(curr_node.Q.values())
            is_next_node_goal = np.all(
                [child.is_goal_node for child in curr_node.children.values()])
            is_time_to_sample = is_time_to_sample or (
                best_Q
                == self.environment.infeasible_reward) or is_next_node_goal

        if is_time_to_sample:
            new_continuous_parameters = self.sample_action(curr_node)
            curr_node.add_actions(new_continuous_parameters)

        action = curr_node.perform_ucb_over_actions()

        return action

    @staticmethod
    def update_node_statistics(curr_node, action, sum_rewards, reward):
        curr_node.Nvisited += 1
        if is_action_hashable(action):
            hashable_action = action
        else:
            hashable_action = make_action_hashable(action)
        is_action_new = not (hashable_action in curr_node.A)
        if is_action_new:
            curr_node.A.append(hashable_action)
            curr_node.N[hashable_action] = 1
            curr_node.Q[hashable_action] = sum_rewards
            curr_node.reward_history[hashable_action] = [reward]
        else:
            curr_node.N[hashable_action] += 1
            curr_node.reward_history[hashable_action].append(reward)
            curr_node.Q[hashable_action] += (sum_rewards - curr_node.Q[hashable_action]) / \
                                            float(curr_node.N[hashable_action])

    def simulate(self, curr_node, depth):
        if self.environment.is_goal_reached():
            # arrived at the goal state
            if not curr_node.is_goal_and_already_visited:
                self.found_solution = True
                curr_node.is_goal_node = True
                print "Solution found, returning the goal reward", self.goal_reward
                self.update_node_statistics(curr_node, curr_node.parent_action,
                                            self.goal_reward, self.goal_reward)
            return self.goal_reward

        if depth == self.depth_limit:
            return 0

        if DEBUG:
            print "At depth ", depth
            print "Is it time to pick?", self.environment.is_pick_time()

        action = self.choose_action(curr_node)

        parent_motion = None
        if curr_node.is_action_tried(action):
            if DEBUG:
                print "Executing tree policy, taking action ", action
            next_node = curr_node.get_child_node(action)
            if next_node.parent_motion is None:
                check_feasibility = True
            else:
                parent_motion = next_node.parent_motion
                check_feasibility = False
        else:
            check_feasibility = True  # todo: store path?

        if DEBUG:
            print 'Is pick time? ', self.environment.is_pick_time()
            print "Executing action ", action

        next_state, reward, parent_motion, objs_in_collision = self.environment.apply_action(
            curr_node, action, check_feasibility, parent_motion)
        # Debugging breakpoint; re-enable only when stepping through the result of apply_action.
        # import pdb
        # pdb.set_trace()
        if DEBUG:
            print 'Reward ', reward
        #self.high_level_planner.update_task_plan_indices(reward, action['operator_name']) # create the next node based on the updated task plan progress

        if not curr_node.is_action_tried(action):
            next_node = self.create_node(action,
                                         depth + 1,
                                         reward,
                                         objs_in_collision,
                                         is_init_node=False)
            self.tree.add_node(next_node, action, curr_node)
            next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward

        if next_node.parent_motion is None and reward != self.environment.infeasible_reward:
            next_node.parent_motion = parent_motion

        is_infeasible_action = reward == self.environment.infeasible_reward
        if is_infeasible_action:
            # this (s,a) is a dead-end
            sum_rewards = reward
        else:
            sum_rewards = reward + self.discount_rate * self.simulate(
                next_node, depth + 1)

        self.update_node_statistics(curr_node, action, sum_rewards, reward)

        if curr_node.is_init_node and curr_node.parent is not None:
            self.update_ancestor_node_statistics(curr_node.parent,
                                                 curr_node.parent_action,
                                                 sum_rewards)

        return sum_rewards

    def update_ancestor_node_statistics(self, node, action, child_sum_rewards):
        if node is None:
            return

        parent_reward_to_node = node.reward_history[make_action_hashable(
            action)][0]
        parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards
        self.update_node_statistics(node, action, parent_sum_rewards,
                                    parent_reward_to_node)

        self.update_ancestor_node_statistics(node.parent, node.parent_action,
                                             parent_sum_rewards)

    """
    def apply_action(self, node, action, check_feasibility, parent_motion):
        if action is None:
            return None, self.environment.infeasible_reward, None
        which_operator = self.environment.which_operator(node.obj)
        if which_operator == 'two_arm_pick':
            if self.environment.name == 'convbelt':
                self.environment.disable_objects()
                node.obj.Enable(True)
            next_state, reward, path, objs_in_collision = self.environment.apply_two_arm_pick_action(action, node, check_feasibility, parent_motion)
            if self.environment.name == 'convbelt':
                self.environment.enable_objects()
                node.obj.Enable(True)
        elif which_operator == 'two_arm_place':
            next_state, reward, path, objs_in_collision = self.environment.apply_two_arm_place_action(action, node, check_feasibility, parent_motion)
        elif which_operator == 'one_arm_pick':
            next_state, reward, path, objs_in_collision = self.environment.apply_one_arm_pick_action(action, node.obj, node.region, check_feasibility, parent_motion)
        elif which_operator == 'one_arm_place':
            next_state, reward, path, objs_in_collision = self.environment.apply_one_arm_place_action(action, node.obj, node.region, check_feasibility, parent_motion)
        elif which_operator == 'next_base_pose':
            next_state, reward, path, objs_in_collision = self.environment.apply_next_base_pose(action, node, check_feasibility, parent_motion)

        return next_state, reward, path, objs_in_collision
    """

    def sample_action(self, node):
        action = node.sampling_agent.sample_next_point(
            node, self.n_feasibility_checks)
        return action
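
The update_node_statistics method above keeps Q as a running average of the backed-up returns via Q += (sum_rewards - Q) / N. Isolated from the tree bookkeeping, the same incremental-mean update looks like this (standalone sketch):

def incremental_mean(old_mean, new_sample, new_count):
    # Equivalent to recomputing the mean of all new_count samples,
    # but without storing them.
    return old_mean + (new_sample - old_mean) / float(new_count)

samples = [2.0, -2.0, 1.0, 1.0]
mean, count = 0.0, 0
for s in samples:
    count += 1
    mean = incremental_mean(mean, s, count)
print(mean, sum(samples) / len(samples))  # both 0.5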
Code Example #7
class MCTS:
    def __init__(self, parameters, problem_env, goal_entities, v_fcn,
                 learned_q):
        # MCTS parameters
        self.widening_parameter = parameters.widening_parameter
        self.ucb_parameter = parameters.ucb_parameter
        self.time_limit = parameters.timelimit
        self.n_motion_plan_trials = parameters.n_motion_plan_trials
        self.use_ucb = parameters.use_ucb
        self.use_progressive_widening = parameters.pw
        self.n_feasibility_checks = parameters.n_feasibility_checks
        self.use_v_fcn = parameters.use_learned_q
        self.use_shaped_reward = parameters.use_shaped_reward
        self.planning_horizon = parameters.planning_horizon
        self.sampling_strategy = parameters.sampling_strategy
        self.explr_p = parameters.explr_p
        self.switch_frequency = parameters.switch_frequency
        self.parameters = parameters

        self.v_fcn = v_fcn
        self.learned_q = learned_q
        # Hard-coded params
        self.check_reachability = True
        self.discount_rate = 1.0

        # Environment setup
        self.problem_env = problem_env
        self.env = self.problem_env.env
        self.robot = self.problem_env.robot

        # MCTS initialization
        self.s0_node = None
        self.tree = MCTSTree(self.ucb_parameter)
        self.best_leaf_node = None
        self.goal_entities = goal_entities

        # Logging purpose
        self.search_time_to_reward = []
        self.reward_lists = []
        self.progress_list = []

        self.found_solution = False
        self.swept_volume_constraint = None

    def load_pickled_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree = pickle.load(open(fname, 'r'))

    def visit_all_nodes(self, curr_node):
        children = curr_node.children.values()
        print curr_node in self.tree.nodes
        for c in children:
            self.visit_all_nodes(c)

    def save_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree.make_tree_picklable()
        pickle.dump(self.tree, open(fname, 'wb'))

    def load_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree = pickle.load(open(fname, 'r'))

    def get_node_at_idx(self, idx):
        for n in self.tree.nodes:
            if n.idx == idx:
                return n
        return None

    def create_sampling_agent(self, node):
        operator_skeleton = node.operator_skeleton
        if 'one_arm' in self.problem_env.name:
            dont_check_motion_existence = True
        else:
            dont_check_motion_existence = False

        abstract_state = node.state
        abstract_action = node.operator_skeleton
        place_region = self.problem_env.regions[
            abstract_action.discrete_parameters['place_region']]
        if self.sampling_strategy == 'uniform':
            sampler = UniformSampler(place_region)
            generator = TwoArmPaPGenerator(
                abstract_state,
                abstract_action,
                sampler,
                n_parameters_to_try_motion_planning=self.n_motion_plan_trials,
                n_iter_limit=self.n_feasibility_checks,
                problem_env=self.problem_env,
                pick_action_mode='ir_parameters',
                place_action_mode='object_pose')
        elif self.sampling_strategy == 'voo':
            target_obj = abstract_action.discrete_parameters['object']
            sampler = VOOSampler(
                target_obj, place_region, self.explr_p,
                self.problem_env.reward_function.worst_potential_value)
            generator = TwoArmVOOGenerator(
                abstract_state,
                abstract_action,
                sampler,
                n_parameters_to_try_motion_planning=self.n_motion_plan_trials,
                n_iter_limit=self.n_feasibility_checks,
                problem_env=self.problem_env,
                pick_action_mode='ir_parameters',
                place_action_mode='object_pose')
        else:
            raise NotImplementedError
        return generator

    def compute_state(self, parent_node, parent_action):
        print "Computing state..."
        if self.problem_env.is_goal_reached():
            state = parent_node.state
        else:
            if parent_node is None:
                parent_state = None
            else:
                parent_state = parent_node.state

            if self.problem_env.name.find('one_arm') != -1:
                state = OneArmPaPState(self.problem_env,
                                       parent_state=parent_state,
                                       parent_action=parent_action,
                                       goal_entities=self.goal_entities)
            else:
                cache_file_name_for_debugging = './planners/mcts_cache_for_debug/pidx_{}_seed_{}.pkl'.format(
                    self.parameters.pidx, self.parameters.planner_seed)
                is_root_node = parent_state is None
                cache_file_exists = os.path.isfile(
                    cache_file_name_for_debugging)
                is_beomjoon_local_machine = False  #socket.gethostname() == 'lab'
                if cache_file_exists and is_root_node and is_beomjoon_local_machine:
                    state = pickle.load(
                        open(cache_file_name_for_debugging, 'r'))
                    state.make_plannable(self.problem_env)
                else:
                    state = ShortestPathPaPState(
                        self.problem_env,
                        parent_state=parent_state,
                        parent_action=parent_action,
                        goal_entities=self.goal_entities,
                        planner='mcts')
                    if is_root_node and is_beomjoon_local_machine:
                        state.make_pklable()
                        pickle.dump(state,
                                    open(cache_file_name_for_debugging, 'wb'))
                        state.make_plannable(self.problem_env)
        return state

    def get_current_state(self, parent_node, parent_action,
                          is_parent_action_infeasible):
        # this needs to be factored
        # why do I need a parent node? Can I just get away with parent state?
        is_operator_skeleton_node = (parent_node is None) or (
            not parent_node.is_operator_skeleton_node)
        if is_parent_action_infeasible:
            state = None
        elif is_operator_skeleton_node:
            state = self.compute_state(parent_node, parent_action)
        else:
            state = parent_node.state

        return state

    def create_node(self,
                    parent_action,
                    depth,
                    parent_node,
                    is_parent_action_infeasible,
                    is_init_node=False):
        state_saver = utils.CustomStateSaver(self.problem_env.env)
        is_operator_skeleton_node = (parent_node is None) or (
            not parent_node.is_operator_skeleton_node)
        state = self.get_current_state(parent_node, parent_action,
                                       is_parent_action_infeasible)

        if is_operator_skeleton_node:
            applicable_op_skeletons = self.problem_env.get_applicable_ops(
                parent_action)
            node = DiscreteTreeNodeWithPriorQ(state, self.ucb_parameter, depth,
                                              state_saver,
                                              is_operator_skeleton_node,
                                              is_init_node,
                                              applicable_op_skeletons,
                                              self.learned_q)
        else:
            node = ContinuousTreeNode(state, parent_action, self.ucb_parameter,
                                      depth, state_saver,
                                      is_operator_skeleton_node, is_init_node)
            node.sampling_agent = self.create_sampling_agent(node)

        node.parent = parent_node
        node.parent_action = parent_action
        return node

    @staticmethod
    def get_best_child_node(node):
        if len(node.children) == 0:
            return None
        else:
            best_child_action_idx = np.argmax(node.Q.values())
            best_child_action = node.Q.keys()[best_child_action_idx]
            return node.children[best_child_action]

    def retrace_best_plan(self):
        plan = []
        _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node(
            self.discount_rate)
        curr_node = best_leaf_node

        while not curr_node.parent is None:
            plan.append(curr_node.parent_action)
            curr_node = curr_node.parent

        plan = plan[::-1]
        return plan, best_leaf_node

    def get_best_goal_node(self):
        leaves = self.tree.get_leaf_nodes()
        goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node]
        if len(goal_nodes) > 1:
            best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node(
                self.discount_rate)
        else:
            curr_node = goal_nodes[0]
        return curr_node

    def switch_init_node(self, node):
        self.s0_node.is_init_node = False
        self.s0_node = node
        self.s0_node.is_init_node = True
        self.problem_env.reset_to_init_state(node)
        self.found_solution = False

    @staticmethod
    def choose_child_node_to_descend_to(node):
        # todo: implement the one with highest visitation
        if node.is_operator_skeleton_node and len(node.A) == 1:
            # descend to grand-child
            only_child_node = node.children.values()[0]
            best_action = only_child_node.Q.keys()[np.argmax(
                only_child_node.Q.values())]
            best_node = only_child_node.children[best_action]
        else:
            best_action = node.Q.keys()[np.argmax(node.Q.values())]
            best_node = node.children[best_action]
        return best_node

    def log_current_tree_to_dot_file(self, iteration, node_to_search_from):
        if socket.gethostname() == 'dell-XPS-15-9560':
            write_dot_file(self.tree, iteration, '', node_to_search_from)

    def log_performance(self, elapsed_time, history_n_objs_in_goal,
                        n_feasibility_checks, iteration):
        self.search_time_to_reward.append([
            elapsed_time, iteration, n_feasibility_checks['ik'],
            n_feasibility_checks['mp'],
            max(history_n_objs_in_goal)
        ])

    def get_total_n_feasibility_checks(self):
        total_ik_checks = 0
        total_mp_checks = 0
        for n in self.tree.nodes:
            if n.sampling_agent is not None:
                total_ik_checks += n.sampling_agent.n_ik_checks
                total_mp_checks += n.sampling_agent.n_mp_checks
        return {'mp': total_mp_checks, 'ik': total_ik_checks}

    def is_time_to_switch(self, root_node):
        reached_frequency_limit = max(
            root_node.N.values()) >= self.switch_frequency
        has_enough_actions = True  #root_node.is_operator_skeleton_node #or len(root_node.A) > 3
        return reached_frequency_limit and has_enough_actions

    def get_node_to_switch_to(self, node_to_search_from):
        is_time_to_switch_node = self.is_time_to_switch(node_to_search_from)
        if not is_time_to_switch_node:
            return node_to_search_from

        if node_to_search_from.is_operator_skeleton_node:
            node_to_search_from = node_to_search_from.get_child_with_max_value(
            )
        else:
            max_child = node_to_search_from.get_child_with_max_value()
            if max_child.parent_action.continuous_parameters['is_feasible']:
                node_to_search_from = node_to_search_from.get_child_with_max_value(
                )

        return self.get_node_to_switch_to(node_to_search_from)

    def search(self,
               n_iter=np.inf,
               iteration_for_tree_logging=0,
               node_to_search_from=None,
               max_time=np.inf):
        depth = 0
        elapsed_time = 0

        if node_to_search_from is None:
            self.s0_node = self.create_node(None,
                                            depth=0,
                                            parent_node=None,
                                            is_parent_action_infeasible=False,
                                            is_init_node=True)
            self.tree.set_root_node(self.s0_node)
            node_to_search_from = self.s0_node

        if n_iter == np.inf:
            n_iter = 999999

        new_trajs = []
        plan = []
        history_of_n_objs_in_goal = []
        for iteration in range(1, n_iter):
            print '*****SIMULATION ITERATION %d' % iteration
            self.problem_env.reset_to_init_state(node_to_search_from)

            new_traj = []
            stime = time.time()
            self.simulate(node_to_search_from, node_to_search_from, depth,
                          new_traj)
            elapsed_time += time.time() - stime

            n_feasibility_checks = self.get_total_n_feasibility_checks()
            n_objs_in_goal = len(
                self.problem_env.get_objs_in_region('home_region'))
            history_of_n_objs_in_goal.append(n_objs_in_goal)
            self.log_performance(elapsed_time, history_of_n_objs_in_goal,
                                 n_feasibility_checks, iteration)
            print "Time {} n_feasible_checks {} max progress {}".format(
                elapsed_time, n_feasibility_checks['ik'],
                max(history_of_n_objs_in_goal))

            #is_time_to_switch_node = self.is_time_to_switch(node_to_search_from)
            #if is_time_to_switch_node:
            #    node_to_search_from = self.get_node_to_switch_to(node_to_search_from)

            if self.found_solution:
                print "Optimal score found"
                plan, _ = self.retrace_best_plan()
                break

            if elapsed_time > max_time:
                print "Time is up"
                break

        self.problem_env.reset_to_init_state(node_to_search_from)
        return self.search_time_to_reward, n_feasibility_checks, plan

    def get_best_trajectory(self, node_to_search_from, trajectories):
        traj_rewards = []
        curr_node = node_to_search_from
        for trj in trajectories:
            traj_sum_reward = 0
            for aidx, a in enumerate(trj):
                traj_sum_reward += np.power(
                    self.discount_rate, aidx) * curr_node.reward_history[a][0]
                curr_node = curr_node.children[a]
            traj_rewards.append(traj_sum_reward)
        return trajectories[np.argmax(traj_rewards)], curr_node

    def choose_action(self, curr_node, depth):
        if curr_node.is_operator_skeleton_node:
            print "Skeleton node"
            # here, perform psa with the learned q
            action = curr_node.perform_ucb_over_actions()
        else:
            print 'Instance node'
            if curr_node.sampling_agent is None:  # this happens if the tree has been pickled
                curr_node.sampling_agent = self.create_sampling_agent(
                    curr_node)

            if not self.use_progressive_widening:
                w_param = self.widening_parameter
            else:
                w_param = self.widening_parameter * np.power(0.9, depth / 2)
            if not curr_node.is_reevaluation_step(
                    w_param,
                    self.problem_env.reward_function.infeasible_reward,
                    self.use_progressive_widening, self.use_ucb):
                print "Sampling new action"
                new_continuous_parameters = self.sample_continuous_parameters(
                    curr_node)
                curr_node.add_actions(new_continuous_parameters)
                action = curr_node.A[-1]
            else:
                print "Re-evaluation of actions"
                # todo I want UCB here
                if self.use_ucb:
                    action = curr_node.perform_ucb_over_actions()
                else:
                    action = curr_node.choose_new_arm()
        return action

    @staticmethod
    def update_goal_node_statistics(curr_node, reward):
        # todo rewrite this function
        curr_node.Nvisited += 1
        curr_node.reward = reward

    def simulate(self, curr_node, node_to_search_from, depth, new_traj):
        if self.problem_env.reward_function.is_goal_reached():
            if not curr_node.is_goal_and_already_visited:
                self.found_solution = True
                curr_node.is_goal_node = True
                print "Solution found, returning the goal reward", self.problem_env.reward_function.goal_reward
                self.update_goal_node_statistics(
                    curr_node, self.problem_env.reward_function.goal_reward)
            return self.problem_env.reward_function.goal_reward

        if depth == self.planning_horizon:
            print "Depth limit reached"
            return 0

        if DEBUG:
            print "At depth ", depth
            print "Is it time to pick?", self.problem_env.is_pick_time()

        action = self.choose_action(curr_node, depth)
        is_action_feasible = self.apply_action(curr_node, action)

        if not curr_node.is_action_tried(action):
            next_node = self.create_node(action, depth + 1, curr_node,
                                         not is_action_feasible)
            self.tree.add_node(next_node, action, curr_node)
            reward = self.problem_env.reward_function(curr_node.state,
                                                      next_node.state, action,
                                                      depth)
            next_node.parent_action_reward = reward
            next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward
        else:
            next_node = curr_node.children[action]
            reward = next_node.parent_action_reward

        print "Reward", reward

        if not is_action_feasible:
            # this (s,a) is a dead-end
            print "Infeasible action"
            if self.use_v_fcn:
                sum_rewards = reward + curr_node.parent.v_fcn[
                    curr_node.parent_action]
                print sum_rewards
            else:
                sum_rewards = reward
        else:
            sum_rewards = reward + self.discount_rate * self.simulate(
                next_node, node_to_search_from, depth + 1, new_traj)

        curr_node.update_node_statistics(action, sum_rewards, reward)
        if curr_node == node_to_search_from and curr_node.parent is not None:
            self.update_ancestor_node_statistics(curr_node.parent,
                                                 curr_node.parent_action,
                                                 sum_rewards)

        return sum_rewards

    def update_ancestor_node_statistics(self, node, action, child_sum_rewards):
        if node is None:
            return
        parent_reward_to_node = node.reward_history[action][0]
        parent_sum_rewards = parent_reward_to_node + child_sum_rewards  # rwd up to parent + rwd from child to leaf
        node.update_node_statistics(action, parent_sum_rewards,
                                    parent_reward_to_node)
        self.update_ancestor_node_statistics(node.parent, node.parent_action,
                                             parent_sum_rewards)

    def apply_action(self, node, action):
        if node.is_operator_skeleton_node:
            print "Applying skeleton", action.type, action.discrete_parameters['object'], \
                action.discrete_parameters['place_region']
            is_feasible = self.problem_env.apply_operator_skeleton(
                node.state, action)
        else:
            is_feasible = self.problem_env.apply_operator_instance(
                node.state, action, self.check_reachability)
            if is_feasible:
                print "Applying instance", action.discrete_parameters[
                    'object'], action.discrete_parameters[
                        'place_region'], action.continuous_parameters['place'][
                            'q_goal']
            else:
                print "Applying infeasible instance", action.discrete_parameters[
                    'object'], action.discrete_parameters['place_region']

        return is_feasible

    def sample_continuous_parameters(self, node):
        if self.problem_env.name.find('one_arm') != -1:
            raise NotImplementedError
        else:
            if self.sampling_strategy == 'voo':
                action_parameters = [
                    np.hstack([
                        a.continuous_parameters['pick']['action_parameters'],
                        a.continuous_parameters['place']['action_parameters']
                    ]) for a in node.Q.keys()
                ]
                q_values = node.Q.values()
                # todo
                #   save all the mp values that did not work out
                smpled_param = node.sampling_agent.sample_next_point(
                    action_parameters, q_values)
                if not smpled_param[
                        'is_feasible'] and self.sampling_strategy == 'voo':
                    node.sampling_agent.update_mp_infeasible_samples(
                        smpled_param['samples'])
            else:
                smpled_param = node.sampling_agent.sample_next_point()
                node.needs_to_sample = False
        return smpled_param
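
Both simulate and get_best_trajectory above fold rewards into a discounted sum of the form reward + discount_rate * (return of the remainder). A standalone restatement over a plain list of rewards:

def discounted_return(rewards, discount_rate):
    # Matches sum(discount_rate**i * rewards[i]), accumulated back to front
    # the same way the recursive simulate() builds sum_rewards.
    total = 0.0
    for r in reversed(rewards):
        total = r + discount_rate * total
    return total

print(discounted_return([1.0, 0.0, 2.0], discount_rate=0.9))  # 1.0 + 0.9**2 * 2.0 = 2.62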
Code Example #8
class MCTS:
    def __init__(self, widening_parameter, exploration_parameters,
                 sampling_strategy, sampling_strategy_exploration_parameter,
                 c1, n_feasibility_checks, environment,
                 use_progressive_widening, use_ucb, use_max_backup,
                 pick_switch, voo_sampling_mode, voo_counter_ratio, n_switch):
        self.c1 = c1
        self.widening_parameter = widening_parameter
        self.exploration_parameters = exploration_parameters
        self.time_limit = np.inf
        self.environment = environment
        if self.environment.name == 'convbelt':
            self.discount_rate = 0.99  # do we need this?
        else:
            self.discount_rate = 0.9
        if self.environment.name.find('synthetic') != -1:
            self.depth_limit = 20
        else:
            self.depth_limit = np.inf
        self.sampling_strategy = sampling_strategy
        self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter
        self.use_progressive_widening = use_progressive_widening
        self.voo_sampling_mode = voo_sampling_mode
        self.voo_counter_ratio = voo_counter_ratio
        self.use_ucb = use_ucb
        self.n_switch = n_switch
        self.n_switch_original = self.n_switch
        self.use_max_backup = use_max_backup
        self.pick_switch = pick_switch

        self.env = self.environment.env
        self.robot = self.environment.robot
        self.s0_node = self.create_node(None,
                                        depth=0,
                                        reward=0,
                                        is_init_node=True)

        self.original_s0_node = self.s0_node
        self.tree = MCTSTree(self.s0_node, self.exploration_parameters)
        self.found_solution = False
        self.goal_reward = 2
        self.infeasible_reward = -2
        self.n_feasibility_checks = n_feasibility_checks

    def create_sampling_agent(self, node, operator_skeleton):
        operator_name = operator_skeleton.type

        if operator_name == 'two_arm_pick' and self.environment.name == 'convbelt':
            return PreSampledPickGenerator()
        if self.sampling_strategy == 'unif':
            return UniformGenerator(operator_name, self.environment)
        elif self.sampling_strategy == 'voo':
            return VOOGenerator(operator_name, self.environment,
                                self.sampling_strategy_exploration_parameter,
                                self.c1, self.voo_sampling_mode,
                                self.voo_counter_ratio)
        elif self.sampling_strategy == 'gpucb':
            return GPUCBGenerator(operator_name, self.environment,
                                  self.sampling_strategy_exploration_parameter)
        elif self.sampling_strategy == 'doo':
            return DOOGenerator(operator_skeleton, self.environment,
                                self.sampling_strategy_exploration_parameter)
        elif self.sampling_strategy == 'randomized_doo':
            return RandomizedDOOGenerator(
                operator_skeleton, self.environment,
                self.sampling_strategy_exploration_parameter)
        else:
            print "Wrong sampling strategy"
            return -1

    def create_node(self, parent_action, depth, reward, is_init_node):
        if self.environment.is_goal_reached():
            operator_skeleton = None
        else:
            operator_skeleton = self.environment.get_applicable_op_skeleton(
                parent_action)
        state_saver = CustomStateSaver(self.environment.env)
        node = TreeNode(operator_skeleton, self.exploration_parameters, depth,
                        state_saver, self.sampling_strategy, is_init_node,
                        self.depth_limit)
        if not self.environment.is_goal_reached():
            node.sampling_agent = self.create_sampling_agent(
                node, operator_skeleton)

        node.objects_not_in_goal = self.environment.objects_currently_not_in_goal
        node.parent_action_reward = reward
        node.parent_action = parent_action
        return node

    def retrace_best_plan(self):
        plan = []
        _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node(
            self.discount_rate)
        curr_node = best_leaf_node

        while not curr_node.parent is None:
            plan.append(curr_node.parent_action)
            curr_node = curr_node.parent

        plan = plan[::-1]
        return plan

    def get_best_goal_node(self):
        leaves = self.tree.get_leaf_nodes()
        goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node]
        if len(goal_nodes) > 1:
            best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node(
                self.discount_rate)
        else:
            curr_node = goal_nodes[0]
        return curr_node

    def switch_init_node(self, node):
        self.s0_node.is_init_node = False
        self.s0_node = node
        self.s0_node.is_init_node = True
        self.environment.reset_to_init_state(node)
        self.found_solution = False

    def log_current_tree_to_dot_file(self, iteration):
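        # Dot-file logging only runs on a specific development machine (hostname check).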
        if socket.gethostname() == 'dell-XPS-15-9560':
            write_dot_file(self.tree, iteration, '')

    def is_time_to_switch_initial_node(self):
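        # Decide whether to move the search root: synthetic domains switch once more than
        # n_switch feasible actions exist; otherwise switch at goal nodes, or once enough
        # actions with non-negative rewards have been evaluated.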
        if self.environment.name.find('synth') != -1:
            n_feasible_actions = np.sum([
                self.environment.is_action_feasible(a) for a in self.s0_node.A
            ])

            return n_feasible_actions > self.n_switch

        if self.s0_node.is_goal_node:
            return True

        is_pick_node = self.s0_node.operator_skeleton.type == 'two_arm_pick'

        if len(self.s0_node.Q) == 0:
            n_feasible_actions = 0
        else:
            root_node_reward_history = self.s0_node.reward_history.values()
            root_node_reward_history = np.array(
                [np.max(R) for R in root_node_reward_history])
            n_feasible_actions = np.sum(root_node_reward_history >= 0)

        if is_pick_node:
            if self.pick_switch:
                we_evaluated_the_node_enough = n_feasible_actions >= self.n_switch
            else:
                we_evaluated_the_node_enough = n_feasible_actions > 0
        else:
            we_evaluated_the_node_enough = n_feasible_actions >= self.n_switch

        return we_evaluated_the_node_enough

    def choose_child_node_to_descend_to(self):
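        # If some child of the root is already a goal node, return the tree root and double
        # n_switch; otherwise descend to the feasible child with the highest Q-value.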
        is_child_goal_node = np.any(
            [c.is_goal_node for c in self.s0_node.children.values()])
        if is_child_goal_node:
            best_node = self.tree.root
            self.n_switch += self.n_switch
        else:
            feasible_actions = [
                a for a in self.s0_node.A
                if np.max(self.s0_node.reward_history[a]) > -2
            ]
            feasible_q_values = [self.s0_node.Q[a] for a in feasible_actions]
            best_action = feasible_actions[np.argmax(feasible_q_values)]
            best_node = self.s0_node.children[best_action]
        return best_node

    def search(self, n_iter=100, max_time=np.inf):
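        # Main MCTS loop: reset the environment to the root, switch the root when warranted,
        # run one simulation, and log (search time, iteration, best trajectory reward,
        # progress) until n_iter iterations or max_time seconds have elapsed.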
        depth = 0
        time_to_search = 0
        search_time_to_reward = []
        plan = None

        self.n_iter = n_iter
        for iteration in range(n_iter):
            print '*****SIMULATION ITERATION %d' % iteration
            print '*****Root node idx %d' % self.s0_node.idx
            self.environment.reset_to_init_state(self.s0_node)

            if self.is_time_to_switch_initial_node():
                print "Switching root node!"
                # (debugging hook for two_arm_place roots; disabled)
                # if self.s0_node.A[0].type == 'two_arm_place':
                #     self.s0_node.store_node_information(self.environment.name)
                #     visualize_base_poses_and_q_values(self.s0_node.Q, self.environment)
                best_child_node = self.choose_child_node_to_descend_to()
                self.switch_init_node(best_child_node)
            stime = time.time()
            self.simulate(self.s0_node, depth)
            time_to_search += time.time() - stime
            """
            tmp = []
            if np.any([a.continuous_parameters['is_feasible'] for a in self.s0_node.A]):
                feasible_action = [a for a in self.s0_node.A if a.continuous_parameters['is_feasible']][0]
                #self.log_current_tree_to_dot_file(iteration)
                tmp.append(self.s0_node.Q[feasible_action])
                import pdb;pdb.set_trace()
            """

            # self.log_current_tree_to_dot_file(iteration)
            best_traj_rwd, progress, best_node = self.tree.get_best_trajectory_sum_rewards_and_node(
                self.discount_rate)
            search_time_to_reward.append(
                [time_to_search, iteration, best_traj_rwd,
                 len(progress)])
            plan = self.retrace_best_plan()
            # rewards = np.array([np.max(rlist) for rlist in self.s0_node.reward_history.values()])

            n_feasible = np.sum(
                [self.s0_node.is_action_feasible(a) for a in self.s0_node.A])
            print 'n feasible actions , n_switch ', n_feasible, self.n_switch
            print search_time_to_reward[-1], np.argmax(
                np.array(search_time_to_reward)[:, 2])

            if time_to_search > max_time:
                break

        self.environment.reset_to_init_state(self.s0_node)
        return search_time_to_reward, self.s0_node.best_v, plan

    def choose_action(self, curr_node, depth):
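        # Compute the (optionally depth-discounted) widening parameter, then either sample a
        # new continuous action or re-evaluate an existing one (UCB or new-arm selection).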
        if not self.use_progressive_widening:
            # Decay the widening parameter with depth; synthetic domains decay faster.
            is_synthetic = self.environment.name.find('synthetic') != -1
            if is_synthetic:
                w_param = self.widening_parameter * np.power(0.8, depth)
            else:
                # 'convbelt' and 'minimum_displacement_removal' share the 0.99 decay rate;
                # using it as the default also keeps w_param defined for other domains.
                w_param = self.widening_parameter * np.power(0.99, depth)
        else:
            w_param = self.widening_parameter
        print "Widening parameter ", w_param
        if not curr_node.is_reevaluation_step(
                w_param, self.environment.infeasible_reward,
                self.use_progressive_widening, self.use_ucb):
            print "Is time to sample new action? True"
            new_continuous_parameters = self.sample_continuous_parameters(
                curr_node)
            curr_node.add_actions(new_continuous_parameters)
            action = curr_node.A[-1]
        else:
            print "Re-evaluation? True"
            if self.use_ucb:
                action = curr_node.perform_ucb_over_actions()
            else:
                action = curr_node.choose_new_arm()

        return action

    def update_node_statistics(self, curr_node, action, sum_rewards, reward):
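        # Update visit counts and Q[action] using either a max backup or an incremental
        # running average of the discounted returns.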
        # todo rewrite this function
        curr_node.Nvisited += 1

        is_action_never_tried = curr_node.N[action] == 0
        if is_action_never_tried:
            curr_node.reward_history[action] = [reward]
            curr_node.N[action] += 1
            curr_node.Q[action] = sum_rewards
        else:
            curr_node.reward_history[action].append(reward)
            curr_node.N[action] += 1
            # The backup rule is the same for synthetic and non-synthetic domains.
            if self.use_max_backup:
                if sum_rewards > curr_node.Q[action]:
                    curr_node.Q[action] = sum_rewards
            else:
                # Incremental running average of discounted returns.
                curr_node.Q[action] += (sum_rewards - curr_node.Q[action]) / float(curr_node.N[action])

    @staticmethod
    def update_goal_node_statistics(curr_node, reward):
        # todo rewrite this function
        curr_node.Nvisited += 1
        curr_node.reward = reward

    def visualize_samples_from_sampling_agent(self, node):
        # Draw one feasible action from the node's sampling agent for inspection.
        action, status, doo_node, action_parameters = node.sampling_agent.sample_feasible_action(
            node, self.n_feasibility_checks)
        return action, status, doo_node, action_parameters

    def simulate(self, curr_node, depth):
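        # Recursive rollout: return the goal reward at goal states and 0 at the depth limit;
        # otherwise choose an action, apply it, recurse on the child, and back up the
        # discounted sum of rewards.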
        print "Curr node idx", curr_node.idx
        if self.environment.is_goal_reached():
            # arrived at the goal state
            if not curr_node.is_goal_and_already_visited:
                # todo mark the goal trajectory, and don't revisit the actions on the goal trajectory
                self.found_solution = True
                curr_node.is_goal_node = True
                print "Solution found, returning the goal reward", self.goal_reward
                self.update_goal_node_statistics(curr_node, self.goal_reward)
            return self.goal_reward

        if depth == self.depth_limit:
            if len(curr_node.parent.reward_history) > 0:
                print np.max(curr_node.parent.reward_history.values())
            return 0

        if DEBUG:
            print "At depth ", depth
            print "Is it time to pick?", self.environment.is_pick_time()
        action = self.choose_action(curr_node, depth)
        reward = self.environment.apply_operator_instance(action, curr_node)
        print "Executed ", action.type, action.continuous_parameters[
            'is_feasible'], action.discrete_parameters
        print "reward ", reward

        if not curr_node.is_action_tried(action):
            next_node = self.create_node(action,
                                         depth + 1,
                                         reward,
                                         is_init_node=False)
            self.tree.add_node(next_node, action, curr_node)
            next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward
        else:
            next_node = curr_node.children[action]
        is_infeasible_action = self.is_simulated_action_infeasible(
            reward, action)
        if is_infeasible_action:
            sum_rewards = reward
        else:
            sum_rewards = reward + self.discount_rate * self.simulate(
                next_node, depth + 1)

        self.update_node_statistics(curr_node, action, sum_rewards, reward)
        if curr_node.is_init_node and curr_node.parent is not None:
            self.update_ancestor_node_statistics(curr_node.parent,
                                                 curr_node.parent_action,
                                                 sum_rewards)

        return sum_rewards

    def is_simulated_action_infeasible(self, reward, action):
        if self.environment.name.find('synthetic') != -1:
            return not self.environment.is_action_feasible(action)
        else:
            return reward == self.environment.infeasible_reward

    def update_ancestor_node_statistics(self, node, action, child_sum_rewards):
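        # Propagate the discounted return up through the ancestors of the current root
        # (used when the search root has been switched below the original root).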
        if node is None:
            return

        parent_reward_to_node = node.reward_history[action][0]
        parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards
        self.update_node_statistics(node, action, parent_sum_rewards,
                                    parent_reward_to_node)
        self.update_ancestor_node_statistics(node.parent, node.parent_action,
                                             parent_sum_rewards)

    def sample_continuous_parameters(self, node):
        return node.sampling_agent.sample_next_point(node,
                                                     self.n_feasibility_checks)
Code example #9
class MCTS:
    def __init__(self, parameters, problem_env, goal_entities, v_fcn, learned_q):
        # MCTS parameters
        self.widening_parameter = parameters.widening_parameter
        self.ucb_parameter = parameters.ucb_parameter
        self.time_limit = parameters.timelimit
        self.n_motion_plan_trials = parameters.n_motion_plan_trials
        self.use_ucb = parameters.use_ucb
        self.use_progressive_widening = parameters.pw
        self.n_feasibility_checks = parameters.n_feasibility_checks
        self.use_v_fcn = parameters.use_learned_q
        self.v_fcn = v_fcn
        self.learned_q = learned_q
        self.use_shaped_reward = parameters.use_shaped_reward
        self.planning_horizon = parameters.planning_horizon
        self.sampling_strategy = parameters.sampling_strategy
        self.explr_p = parameters.explr_p
        self.switch_frequency = parameters.switch_frequency

        # Hard-coded params
        self.check_reachability = True
        self.discount_rate = 1.0

        # Environment setup
        self.problem_env = problem_env
        self.env = self.problem_env.env
        self.robot = self.problem_env.robot

        # MCTS initialization
        self.s0_node = None
        self.tree = MCTSTree(self.ucb_parameter)
        self.best_leaf_node = None
        self.goal_entities = goal_entities

        # Logging purpose
        self.search_time_to_reward = []
        self.reward_lists = []
        self.progress_list = []

        self.found_solution = False
        self.swept_volume_constraint = None

    def load_pickled_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree = pickle.load(open(fname, 'rb'))  # binary mode to match the 'wb' dump

    def visit_all_nodes(self, curr_node):
        children = curr_node.children.values()
        print curr_node in self.tree.nodes
        for c in children:
            self.visit_all_nodes(c)

    def save_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree.make_tree_picklable()
        pickle.dump(self.tree, open(fname, 'wb'))

    def load_tree(self, fname=None):
        if fname is None:
            fname = 'tmp_tree.pkl'
        self.tree = pickle.load(open(fname, 'rb'))  # binary mode to match the 'wb' dump

    def get_node_at_idx(self, idx):
        for n in self.tree.nodes:
            if n.idx == idx:
                return n
        return None

    def create_sampling_agent(self, node):
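        # Return a pick-and-place sampler (uniform or VOO); one-arm domains skip the
        # motion-existence check when generating candidate parameters.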
        operator_skeleton = node.operator_skeleton
        if 'one_arm' in self.problem_env.name:
            dont_check_motion_existence = True
        else:
            dont_check_motion_existence = False
        if self.sampling_strategy == 'uniform':
            generator = UniformPaPGenerator(node, operator_skeleton, self.problem_env, None,
                                            n_candidate_params_to_smpl=self.n_motion_plan_trials,
                                            total_number_of_feasibility_checks=self.n_feasibility_checks,
                                            dont_check_motion_existence=dont_check_motion_existence)

        elif self.sampling_strategy == 'voo':
            return PaPVOOGenerator(node,
                                   operator_skeleton,
                                   self.problem_env,
                                   None,
                                   self.n_feasibility_checks,
                                   self.n_motion_plan_trials,
                                   dont_check_motion_existence,
                                   explr_p=self.explr_p,
                                   c1=1,
                                   sampling_mode='gaussian',
                                   counter_ratio=1)
        else:
            raise NotImplementedError
        return generator

    def compute_state(self, parent_node, parent_action):
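        # Construct the node's abstract state (OneArmPaPState or ShortestPathPaPState); on
        # one development machine the state is cached to a per-node pickle to avoid recomputation.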
        if self.problem_env.is_goal_reached():
            state = parent_node.state
        else:
            if parent_node is None:
                parent_state = None
            else:
                parent_state = parent_node.state
            # where is the parent state?
            if self.problem_env.name.find('one_arm') != -1:
                state = OneArmPaPState(self.problem_env,
                                       parent_state=parent_state,
                                       parent_action=parent_action,
                                       goal_entities=self.goal_entities)
            else:
                if socket.gethostname() == 'dell-XPS-15-9560':
                    if parent_node is None:
                        idx = -1
                    else:
                        idx = parent_node.idx
                    fname = './tmp_%d.pkl' % idx
                    if os.path.isfile(fname):
                        state = pickle.load(open(fname, 'rb'))  # binary mode to match the 'wb' dump below
                        state.make_plannable(self.problem_env)
                    else:
                        state = ShortestPathPaPState(self.problem_env,  # what's this?
                                                     parent_state=parent_state,
                                                     parent_action=parent_action,
                                                     goal_entities=self.goal_entities, planner='mcts')
                        state.make_pklable()
                        pickle.dump(state, open(fname, 'wb'))
                        state.make_plannable(self.problem_env)
                else:
                    state = ShortestPathPaPState(self.problem_env,  # what's this?
                                                 parent_state=parent_state,
                                                 parent_action=parent_action,
                                                 goal_entities=self.goal_entities, planner='mcts')
        return state

    def get_current_state(self, parent_node, parent_action, is_parent_action_infeasible):
        # this needs to be factored
        # why do I need a parent node? Can I just get away with parent state?
        is_operator_skeleton_node = (parent_node is None) or (not parent_node.is_operator_skeleton_node)
        if is_parent_action_infeasible:
            state = None
        elif is_operator_skeleton_node:
            state = self.compute_state(parent_node, parent_action)
        else:
            state = parent_node.state

        return state

    def create_node(self, parent_action, depth, parent_node, is_parent_action_infeasible, is_init_node=False):
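        # Alternate node types: operator-skeleton nodes become discrete nodes with the
        # learned prior Q, operator-instance nodes become continuous nodes with a sampling agent.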
        state_saver = utils.CustomStateSaver(self.problem_env.env)
        is_operator_skeleton_node = (parent_node is None) or (not parent_node.is_operator_skeleton_node)
        state = self.get_current_state(parent_node, parent_action, is_parent_action_infeasible)

        if is_operator_skeleton_node:
            applicable_op_skeletons = self.problem_env.get_applicable_ops(parent_action)
            node = DiscreteTreeNodeWithPriorQ(state, self.ucb_parameter, depth, state_saver, is_operator_skeleton_node,
                                              is_init_node, applicable_op_skeletons, self.learned_q)
        else:
            node = ContinuousTreeNode(state, parent_action, self.ucb_parameter, depth, state_saver,
                                      is_operator_skeleton_node, is_init_node)
            node.sampling_agent = self.create_sampling_agent(node)

        node.parent = parent_node
        node.parent_action = parent_action
        return node

    @staticmethod
    def get_best_child_node(node):
        if len(node.children) == 0:
            return None
        else:
            best_child_action_idx = np.argmax(node.Q.values())
            best_child_action = node.Q.keys()[best_child_action_idx]
            return node.children[best_child_action]

    def retrace_best_plan(self):
        plan = []
        _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate)
        curr_node = best_leaf_node

        while not curr_node.parent is None:
            plan.append(curr_node.parent_action)
            curr_node = curr_node.parent

        plan = plan[::-1]
        return plan, best_leaf_node

    def get_best_goal_node(self):
        leaves = self.tree.get_leaf_nodes()
        goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node]
        if len(goal_nodes) > 1:
            best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate)
        else:
            curr_node = goal_nodes[0]
        return curr_node

    def switch_init_node(self, node):
        self.s0_node.is_init_node = False
        self.s0_node = node
        self.s0_node.is_init_node = True
        self.problem_env.reset_to_init_state(node)
        self.found_solution = False

    @staticmethod
    def choose_child_node_to_descend_to(node):
        # todo: implement the one with highest visitation
        if node.is_operator_skeleton_node and len(node.A) == 1:
            # descend to grand-child
            only_child_node = node.children.values()[0]
            best_action = only_child_node.Q.keys()[np.argmax(only_child_node.Q.values())]
            best_node = only_child_node.children[best_action]
        else:
            best_action = node.Q.keys()[np.argmax(node.Q.values())]
            best_node = node.children[best_action]
        return best_node

    def log_current_tree_to_dot_file(self, iteration, node_to_search_from):
        if socket.gethostname() == 'dell-XPS-15-9560':
            write_dot_file(self.tree, iteration, '', node_to_search_from)

    def log_performance(self, time_to_search, iteration):
        best_traj_rwd, progress, best_node = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate)
        self.search_time_to_reward.append([time_to_search, iteration, best_traj_rwd, progress, self.found_solution])
        self.progress_list.append(progress)
        self.best_leaf_node = best_node

    def is_optimal_solution_found(self):
        # Any found solution currently counts as optimal; the stricter check is left disabled.
        # best_traj_rwd, best_node, reward_list = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate)
        # if self.problem_env.reward_function.is_optimal_plan_found(best_traj_rwd):
        #     print "Optimal score found"
        #     return True
        # else:
        #     return False
        return self.found_solution

    def search(self, n_iter=np.inf, iteration_for_tree_logging=0, node_to_search_from=None, max_time=np.inf):
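        # Main loop: simulate from node_to_search_from, every switch_frequency iterations
        # descend to the child with the highest value (only if feasible for instance nodes),
        # log the tree and performance, and stop on a found solution or when max_time is exceeded.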
        depth = 0
        time_to_search = 0

        if node_to_search_from is None:
            self.s0_node = self.create_node(None,
                                            depth=0,
                                            parent_node=None,
                                            is_parent_action_infeasible=False,
                                            is_init_node=True)
            self.tree.set_root_node(self.s0_node)
            node_to_search_from = self.s0_node

        new_trajs = []
        plan = []
        if n_iter == np.inf:
            n_iter = 999999
        for iteration in range(1, n_iter):
            print '*****SIMULATION ITERATION %d' % iteration
            self.problem_env.reset_to_init_state(node_to_search_from)

            new_traj = []
            stime = time.time()
            self.simulate(node_to_search_from, node_to_search_from, depth, new_traj)
            time_to_search += time.time() - stime
            new_trajs.append(new_traj)

            # note that I need to evaluate all actions in a node to switch
            is_time_to_switch_node = iteration % self.switch_frequency == 0  # and the node should be feasible
            # I have to have a feasible action to switch if this is an instance node
            if is_time_to_switch_node:
                if node_to_search_from.is_operator_skeleton_node:
                    node_to_search_from = node_to_search_from.get_child_with_max_value()
                else:
                    max_child = node_to_search_from.get_child_with_max_value()
                    if max_child.parent_action.continuous_parameters['is_feasible']:
                        node_to_search_from = max_child

            if iteration % 10 == 0:
                self.log_current_tree_to_dot_file(iteration_for_tree_logging + iteration, node_to_search_from)

            self.log_performance(time_to_search, iteration)
            print self.search_time_to_reward[iteration_for_tree_logging:]

            # break if the solution is found
            if self.is_optimal_solution_found():
                print "Optimal score found"
                plan, _ = self.retrace_best_plan()
                break

            if time_to_search > max_time:
                print "Time is up"
                break

        self.problem_env.reset_to_init_state(node_to_search_from)
        return self.search_time_to_reward, plan

    def get_best_trajectory(self, node_to_search_from, trajectories):
        # Score each trajectory by its discounted sum of first-visit rewards and return the
        # best one together with the node it ends at.
        traj_rewards = []
        end_nodes = []
        for trj in trajectories:
            curr_node = node_to_search_from  # restart from the search root for every trajectory
            traj_sum_reward = 0
            for aidx, a in enumerate(trj):
                traj_sum_reward += np.power(self.discount_rate, aidx) * curr_node.reward_history[a][0]
                curr_node = curr_node.children[a]
            traj_rewards.append(traj_sum_reward)
            end_nodes.append(curr_node)
        best_idx = np.argmax(traj_rewards)
        return trajectories[best_idx], end_nodes[best_idx]

    def choose_action(self, curr_node):
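        # Skeleton nodes pick a discrete operator via UCB; instance nodes either sample a
        # new continuous action or re-evaluate existing ones, following progressive widening.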
        if curr_node.is_operator_skeleton_node:
            print "Skeleton node"
            # here, perform psa with the learned q
            action = curr_node.perform_ucb_over_actions()
        else:
            print 'Instance node'
            if curr_node.sampling_agent is None:  # this happens if the tree has been pickled
                curr_node.sampling_agent = self.create_sampling_agent(curr_node)
            if not curr_node.is_reevaluation_step(self.widening_parameter,
                                                  self.problem_env.reward_function.infeasible_reward,
                                                  self.use_progressive_widening,
                                                  self.use_ucb):
                print "Sampling new action"
                new_continuous_parameters = self.sample_continuous_parameters(curr_node)
                curr_node.add_actions(new_continuous_parameters)
                action = curr_node.A[-1]
            else:
                print "Re-evaluation of actions"
                if self.use_ucb:
                    action = curr_node.perform_ucb_over_actions()
                else:
                    action = curr_node.choose_new_arm()
        return action

    @staticmethod
    def update_goal_node_statistics(curr_node, reward):
        # todo rewrite this function
        curr_node.Nvisited += 1
        curr_node.reward = reward

    def simulate(self, curr_node, node_to_search_from, depth, new_traj):
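        # Recursive rollout: return the goal reward at goal states and 0 at the planning
        # horizon; otherwise choose and apply an action, recurse only if it was feasible,
        # and back up the discounted return.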
        if self.problem_env.reward_function.is_goal_reached():
            if not curr_node.is_goal_and_already_visited:
                self.found_solution = True
                curr_node.is_goal_node = True
                print "Solution found, returning the goal reward", self.problem_env.reward_function.goal_reward
                self.update_goal_node_statistics(curr_node, self.problem_env.reward_function.goal_reward)
            return self.problem_env.reward_function.goal_reward

        if depth == self.planning_horizon:
            # would it ever get here? why does it not satisfy the goal?
            print "Depth limit reached"
            return 0

        if DEBUG:
            print "At depth ", depth
            print "Is it time to pick?", self.problem_env.is_pick_time()

        action = self.choose_action(curr_node)

        is_action_feasible = self.apply_action(curr_node, action)

        if not curr_node.is_action_tried(action):
            next_node = self.create_node(action, depth + 1, curr_node, not is_action_feasible)
            self.tree.add_node(next_node, action, curr_node)
            reward = self.problem_env.reward_function(curr_node.state, next_node.state, action, depth)
            next_node.parent_action_reward = reward
            next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward
        else:
            next_node = curr_node.children[action]
            reward = next_node.parent_action_reward

        print "Reward", reward

        if not is_action_feasible:
            # this (s,a) is a dead-end
            print "Infeasible action"
            if self.use_v_fcn:
                sum_rewards = reward + curr_node.parent.v_fcn[curr_node.parent_action]
                print sum_rewards
            else:
                sum_rewards = reward
        else:
            sum_rewards = reward + self.discount_rate * self.simulate(next_node, node_to_search_from, depth + 1,
                                                                      new_traj)

        curr_node.update_node_statistics(action, sum_rewards, reward)
        if curr_node == node_to_search_from and curr_node.parent is not None:
            self.update_ancestor_node_statistics(curr_node.parent, curr_node.parent_action, sum_rewards)

        # todo return a plan
        return sum_rewards

    def update_ancestor_node_statistics(self, node, action, child_sum_rewards):
        if node is None:
            return
        parent_reward_to_node = node.reward_history[action][0]
        parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards
        node.update_node_statistics(action, parent_sum_rewards, parent_reward_to_node)
        self.update_ancestor_node_statistics(node.parent, node.parent_action, parent_sum_rewards)

    def apply_action(self, node, action):
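        # Apply either the operator skeleton (discrete step) or the operator instance
        # (continuous parameters, with reachability checking) and report feasibility.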
        if node.is_operator_skeleton_node:
            print "Applying skeleton", action.type, action.discrete_parameters['object'], \
                action.discrete_parameters['region']
            is_feasible = self.problem_env.apply_operator_skeleton(node.state, action)
        else:
            print "Applying instance", action.type, action.discrete_parameters['object'], action.discrete_parameters[
                'region']
            is_feasible = self.problem_env.apply_operator_instance(node.state, action, self.check_reachability)

        return is_feasible

    def sample_continuous_parameters(self, node):
        if self.problem_env.name.find('one_arm') != -1:
            raise NotImplementedError
        else:
            if isinstance(node.state, StateWithoutCspacePredicates):
                current_collides = None
            else:
                current_collides = node.state.collisions_at_all_obj_pose_pairs

            sampled_param = node.sampling_agent.sample_next_point(cached_collisions=current_collides,
                                                                  cached_holding_collisions=None)
        return sampled_param