def onClick(self, event):
    x, y = self.get_intersection(event.x, event.y)
    if (x != -1 and y != -1):
        # the click landed on a board intersection
        board_coords = self.get_board_coordinates(x, y)
        if not self.first_move:
            board = self.board_state.get_board()
        if self.first_move or board[board_coords[1]][board_coords[0]] == 0:
            # player is able to place piece
            if self.first_move:
                # opening move: place the stone and build the initial board state
                self.first_move = False
                self.placed_pieces.append(Piece(board_coords[1], board_coords[0], self.player_turn,
                                                self.placePiece(x, y), self))
                new_board = [[0] * BOARD_SIZE for _ in range(BOARD_SIZE)]
                new_board[board_coords[1]][board_coords[0]] = BLACK
                self.board_state = BoardState(grid=new_board,
                                              recent_move=(board_coords[1], board_coords[0]),
                                              turn=BLACK, search_breadth=1)
                self.past_board_states.append(self.board_state)
                self.player_turn = (-1) * self.player_turn
            else:
                # regular move: apply the human move, check for a win, then let the MCTS AI reply
                self.placed_pieces.append(Piece(board_coords[1], board_coords[0], self.player_turn,
                                                self.placePiece(x, y), self))
                self.past_board_states.append(self.board_state)
                self.board_state = self.board_state.play(board_coords[1], board_coords[0])
                self.player_turn = (-1) * self.player_turn
                possible_winner = self.board_state.get_winner()
                if possible_winner != 0:
                    self.winner(possible_winner)
                else:
                    ai_mcts_node = MCTSNode(self.board_state)
                    ai_mcts_tree = MCTSTree(ai_mcts_node)
                    next_state = ai_mcts_tree.best_move(time_cutoff=WAIT_TIME)
                    self.board_state = next_state
                    ai_move = next_state.get_recent_move()
                    print("(" + str(ai_move[0]) + ", " + str(ai_move[1]) + ")")
                    self.placed_pieces.append(Piece(ai_move[1], ai_move[0], self.player_turn,
                                                    self.placePiece((ai_move[1] + 1) * self.grid_interval,
                                                                    (ai_move[0] + 1) * self.grid_interval),
                                                    self))
                    self.past_board_states.append(self.board_state)
                    self.player_turn = (-1) * self.player_turn
                    possible_winner = self.board_state.get_winner()
                    if possible_winner != 0:
                        self.winner(possible_winner)
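# The handler above hands move selection to MCTSTree.best_move(time_cutoff=WAIT_TIME), whose
# internals are not shown in this file. A rough, hypothetical sketch of that kind of
# time-budgeted search driver; run_one_simulation and score are caller-supplied stand-ins for
# the select/expand/rollout/backup pass and the final child ranking (e.g. visit count), and
# root.children is assumed to be an iterable of child nodes:
import time


def timed_search_sketch(root, time_cutoff, run_one_simulation, score):
    # Keep running simulations from the root until the wall-clock budget is spent,
    # then return the root child that ranks best under `score`.
    deadline = time.time() + time_cutoff
    while time.time() < deadline:
        run_one_simulation(root)
    return max(root.children, key=score)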
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0,-1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,-1, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,-1, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    turn=-1, recent_move=(9, 8), search_breadth=2)

bs_toy = BoardState(grid=[[ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0,-1, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 1, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0],
                          [ 0, 0, 0, 0, 0, 0, 0, 0]],
                    turn=-1, recent_move=(3, 3), search_breadth=2)

n = MCTSNode(bs)
t = MCTSTree(n)
print("Best Move: \n", t.best_move(360))
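# The grids above encode the two players as 1 and -1 and empty points as 0, and the GUI treats a
# nonzero return from get_winner() as a win. A self-contained, illustrative sketch of such a
# winner check as a plain five-in-a-row scan over a square grid (not the repository's
# implementation):
def winner_sketch(grid):
    # Return 1 or -1 if that player has five stones in a row (any direction), else 0.
    n = len(grid)
    directions = [(0, 1), (1, 0), (1, 1), (1, -1)]
    for r in range(n):
        for c in range(n):
            player = grid[r][c]
            if player == 0:
                continue
            for dr, dc in directions:
                run = [(r + k * dr, c + k * dc) for k in range(5)]
                if all(0 <= rr < n and 0 <= cc < n and grid[rr][cc] == player
                       for rr, cc in run):
                    return player
    return 0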
class MCTS: def __init__(self, widening_parameter, exploration_parameters, sampling_strategy, sampling_strategy_exploration_parameter, c1, n_feasibility_checks, environment, domain_name): self.c1 = c1 self.progressive_widening_parameter = widening_parameter self.exploration_parameters = exploration_parameters self.time_limit = np.inf if domain_name == 'namo': self.discount_rate = 0.9 else: self.discount_rate = 1 self.environment = environment self.sampling_strategy = sampling_strategy self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter self.depth_limit = 300 self.env = self.environment.env self.robot = self.environment.robot self.s0_node = self.create_node( None, depth=0, reward=0, objects_not_in_goal=self.environment.objects_not_in_goal, is_init_node=True) self.original_s0_node = self.s0_node self.tree = MCTSTree(self.s0_node, self.exploration_parameters) self.found_solution = False if self.environment.name == 'namo': self.goal_reward = 2 else: self.goal_reward = 0 self.n_feasibility_checks = n_feasibility_checks """ if domain_name == 'convbelt': self.depth_limit = 10 self.is_satisficing_problem = True elif domain_name == 'namo': self.depth_limit = np.inf self.is_satisficing_problem = False """ def create_sampling_agent(self, node, operator_name): if self.sampling_strategy == 'unif': return UniformGenerator(operator_name, self.environment) elif self.sampling_strategy == 'voo': return VOOGenerator(operator_name, self.environment, self.sampling_strategy_exploration_parameter, self.c1) elif self.sampling_strategy == 'gpucb': return GPUCBGenerator(operator_name, self.environment, self.sampling_strategy_exploration_parameter) elif self.sampling_strategy == 'doo': return DOOGenerator(node, self.environment, self.sampling_strategy_exploration_parameter) elif self.sampling_strategy == 'randomized_doo': return RandomizedDOOGenerator( node, self.environment, self.sampling_strategy_exploration_parameter) else: print "Wrong sampling strategy" return -1 def create_node(self, parent_action, depth, reward, objects_not_in_goal, is_init_node): if self.environment.is_goal_reached(): operator = None else: operator = self.environment.get_applicable_ops() state_saver = DynamicEnvironmentStateSaver(self.environment.env) node = TreeNode(operator, self.exploration_parameters, depth, state_saver, self.sampling_strategy, is_init_node) if not self.environment.is_goal_reached(): node.sampling_agent = self.create_sampling_agent(node, operator) node.objects_not_in_goal = objects_not_in_goal node.parent_action_reward = reward node.parent_action = parent_action return node @staticmethod def get_best_child_node(node): if len(node.children) == 0: return None else: # returns the most visited chlid # another option is to return the child with best value best_child_action_idx = np.argmax(node.N.values()) best_child_action = node.N.keys()[best_child_action_idx] return node.children[best_child_action] def retrace_best_plan(self, best_node): plan = [] #curr_node = self.get_best_goal_node() curr_node = best_node #while not curr_node.is_init_node: while not curr_node.parent is None: action = curr_node.parent_action path = curr_node.parent_motion obj = curr_node.parent.obj obj_name = obj.GetName() operator = curr_node.parent.operator objs_in_collision = curr_node.objs_in_collision plan.insert( 0, { 'action': action, 'path': path, 'obj_name': obj_name, 'operator': operator, 'obj_names_in_collision': [obj.GetName() for obj in objs_in_collision] }) curr_node = curr_node.parent return plan def 
get_best_goal_node(self): leaves = self.tree.get_leaf_nodes() goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node] if len(goal_nodes) > 1: best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) #curr_node = [leaf for leaf in goal_nodes if leaf.sum_ancestor_action_rewards == best_traj_reward][0] else: curr_node = goal_nodes[0] return curr_node def switch_init_node(self, node): self.environment.reset_to_init_state(node) self.s0_node.is_init_node = False self.s0_node = node self.s0_node.is_init_node = True self.found_solution = False """ if self.environment.is_solving_namo: self.environment.set_init_namo_object_names([o.GetName() for o in node.objs_in_collision]) # We need to keep track of what the objs in collision are for each action, so that # when we switch the root node, we can set that # More generally, we need to keep track of what objects are left to be manipulated # How can we do so? # Ideas: # 1) each node keeps it # 2) statesaver keeps it, and so the wrapper around it, when the restore is called, updates the objects # I guess it does not matter. What matters is its name. # It should be called the objects_not_in_goal? self.high_level_planner.task_plan[0]['objects'] = node.objs_in_collision self.environment.reset_to_init_state(node) elif self.environment.is_solving_packing: curr_obj_plan = self.high_level_planner.task_plan[0]['objects'] for obj_idx, curr_obj in enumerate(curr_obj_plan): if not self.environment.regions['object_region'].contains(curr_obj.ComputeAABB()): break self.high_level_planner.set_object_index(obj_idx) self.environment.reset_to_init_state(node) """ def choose_next_node_to_descend_to(self): n_visits_to_each_action = self.s0_node.N.values() if len(np.unique(n_visits_to_each_action)) == 1: # todo descend to the one that has the highest FEASIBLE Q best_action = self.s0_node.Q.keys()[np.argmax( self.s0_node.Q.values())] else: best_action = self.s0_node.N.keys()[np.argmax( n_visits_to_each_action)] best_action = self.s0_node.Q.keys()[np.argmax(self.s0_node.Q.values())] best_node = self.s0_node.children[best_action] return best_node, best_action def search(self, n_iter=100, n_optimal_iter=0, max_time=np.inf): # n_optimal_iter: additional number of iterations you are allowed to run after finding a solution depth = 0 time_to_search = 0 search_time_to_reward = [] optimal_iter = 0 n_node_switch = 0 switch_counter = 0 found_solution_permanent = False reward_lists = [] for iteration in range(n_iter): print '*****SIMULATION ITERATION %d' % iteration """ if self.environment.is_solving_namo or self.environment.is_solving_packing: is_pick_node = self.s0_node.operator.find('two_arm_pick') != -1 we_have_feasible_action = False if len(self.s0_node.Q) == 0 \ else np.max(self.s0_node.reward_history.values()) != self.environment.infeasible_reward # it will actually never switch. 
if is_pick_node: if self.environment.is_solving_packing: we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 30 else: we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 10 #if switch_counter > 10 and not we_have_feasible_action: # print 'Going back to s0 node' # self.switch_init_node(self.original_s0_node) else: we_evaluated_the_node_enough = we_have_feasible_action and self.s0_node.Nvisited > 30 #if switch_counter > 30 and not we_have_feasible_action: # print 'Going back to s0 node' # self.switch_init_node(self.original_s0_node) if is_pick_node and we_evaluated_the_node_enough: print "Node switching from pick node" best_node, best_action = self.choose_next_node_to_descend_to() self.switch_init_node(best_node) switch_counter = 0 elif (not is_pick_node) and we_evaluated_the_node_enough: print "Node switching from place node" best_node, best_action = self.choose_next_node_to_descend_to() self.switch_init_node(best_node) print 'best child reward', best_node.parent_action_reward print 'best child Q', self.s0_node.parent.Q[best_action] print 'best child N', self.s0_node.parent.N[best_action] print 'Other Q values', self.s0_node.Q.values() switch_counter = 0 switch_counter += 1 """ self.environment.reset_to_init_state(self.s0_node) stime = time.time() self.simulate(self.s0_node, depth) time_to_search += time.time() - stime """ ### logging results if socket.gethostname() == 'dell-XPS-15-9560': if self.environment.is_solving_namo: pass #write_dot_file(self.tree, iteration, 'solving_namo') elif self.environment.is_solving_packing: write_dot_file(self.tree, iteration, 'solving_packing') elif self.environment.is_solving_fetching: write_dot_file(self.tree, iteration, 'solving_fetching') """ best_traj_rwd, best_node, reward_list = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) search_time_to_reward.append([ time_to_search, iteration, best_traj_rwd, self.found_solution ]) reward_lists.append(reward_list) plan = [ self.retrace_best_plan(best_node), best_traj_rwd, self.found_solution ] """ if (iteration % 100 == 0 and iteration != 0) or (iteration == n_iter): self.high_level_planner.save_results(search_time_to_reward, plan, reward_lists, iteration) print np.array(search_time_to_reward)[:, -2], np.max(np.array(search_time_to_reward)[:, -2]), reward_list, found_solution_permanent if self.found_solution: found_solution_permanent = True optimal_iter += 1 goal_node = best_node if self.optimal_score_achieved(best_traj_rwd): print "Optimal score found" break if self.environment.is_solving_namo and len(self.s0_node.objs_in_collision) == 0: self.switch_init_node(self.original_s0_node) else: self.switch_init_node(self.s0_node) self.found_solution = False else: plan = None goal_node = None """ goal_node = None if time_to_search > max_time: break self.environment.reset_to_init_state(self.s0_node) return search_time_to_reward, plan, goal_node, reward_lists """ def optimal_score_achieved(self, best_traj_rwd): # return best_traj_rwd == self.environment.optimal_score # in the case of namo, this is the length of the object # in the case of original problem, this is the number of objects to be packed return self.high_level_planner.is_optimal_score_achieved(best_traj_rwd) """ def choose_action(self, curr_node): n_actions = len(curr_node.A) is_time_to_sample = n_actions <= self.progressive_widening_parameter * curr_node.Nvisited if len(curr_node.Q.values()) > 0: best_Q = np.max(curr_node.Q.values()) is_next_node_goal = np.all( 
[child.is_goal_node for child in curr_node.children.values()]) is_time_to_sample = is_time_to_sample or ( best_Q == self.environment.infeasible_reward) or is_next_node_goal if is_time_to_sample: new_continuous_parameters = self.sample_action(curr_node) curr_node.add_actions(new_continuous_parameters) action = curr_node.perform_ucb_over_actions() return action @staticmethod def update_node_statistics(curr_node, action, sum_rewards, reward): curr_node.Nvisited += 1 if is_action_hashable(action): hashable_action = action else: hashable_action = make_action_hashable(action) is_action_new = not (hashable_action in curr_node.A) if is_action_new: curr_node.A.append(hashable_action) curr_node.N[hashable_action] = 1 curr_node.Q[hashable_action] = sum_rewards curr_node.reward_history[hashable_action] = [reward] else: curr_node.N[hashable_action] += 1 curr_node.reward_history[hashable_action].append(reward) curr_node.Q[hashable_action] += (sum_rewards - curr_node.Q[hashable_action]) / \ float(curr_node.N[hashable_action]) def simulate(self, curr_node, depth): if self.environment.is_goal_reached(): # arrived at the goal state if not curr_node.is_goal_and_already_visited: self.found_solution = True curr_node.is_goal_node = True print "Solution found, returning the goal reward", self.goal_reward self.update_node_statistics(curr_node, curr_node.parent_action, self.goal_reward, self.goal_reward) return self.goal_reward if depth == self.depth_limit: return 0 if DEBUG: print "At depth ", depth print "Is it time to pick?", self.environment.is_pick_time() action = self.choose_action(curr_node) parent_motion = None if curr_node.is_action_tried(action): if DEBUG: print "Executing tree policy, taking action ", action next_node = curr_node.get_child_node(action) if next_node.parent_motion is None: check_feasibility = True else: parent_motion = next_node.parent_motion check_feasibility = False else: check_feasibility = True # todo: store path? if DEBUG: print 'Is pick time? 
', self.environment.is_pick_time() print "Executing action ", action next_state, reward, parent_motion, objs_in_collision = self.environment.apply_action( curr_node, action, check_feasibility, parent_motion) import pdb pdb.set_trace() if DEBUG: print 'Reward ', reward #self.high_level_planner.update_task_plan_indices(reward, action['operator_name']) # create the next node based on the updated task plan progress if not curr_node.is_action_tried(action): next_node = self.create_node(action, depth + 1, reward, objs_in_collision, is_init_node=False) self.tree.add_node(next_node, action, curr_node) next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward if next_node.parent_motion is None and reward != self.environment.infeasible_reward: next_node.parent_motion = parent_motion is_infeasible_action = reward == self.environment.infeasible_reward if is_infeasible_action: # this (s,a) is a dead-end sum_rewards = reward else: sum_rewards = reward + self.discount_rate * self.simulate( next_node, depth + 1) self.update_node_statistics(curr_node, action, sum_rewards, reward) if curr_node.is_init_node and curr_node.parent is not None: self.update_ancestor_node_statistics(curr_node.parent, curr_node.parent_action, sum_rewards) return sum_rewards def update_ancestor_node_statistics(self, node, action, child_sum_rewards): if node is None: return parent_reward_to_node = node.reward_history[make_action_hashable( action)][0] parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards self.update_node_statistics(node, action, parent_sum_rewards, parent_reward_to_node) self.update_ancestor_node_statistics(node.parent, node.parent_action, parent_sum_rewards) """ def apply_action(self, node, action, check_feasibility, parent_motion): if action is None: return None, self.environment.infeasible_reward, None which_operator = self.environment.which_operator(node.obj) if which_operator == 'two_arm_pick': if self.environment.name == 'convbelt': self.environment.disable_objects() node.obj.Enable(True) next_state, reward, path, objs_in_collision = self.environment.apply_two_arm_pick_action(action, node, check_feasibility, parent_motion) if self.environment.name == 'convbelt': self.environment.enable_objects() node.obj.Enable(True) elif which_operator == 'two_arm_place': next_state, reward, path, objs_in_collision = self.environment.apply_two_arm_place_action(action, node, check_feasibility, parent_motion) elif which_operator == 'one_arm_pick': next_state, reward, path, objs_in_collision = self.environment.apply_one_arm_pick_action(action, node.obj, node.region, check_feasibility, parent_motion) elif which_operator == 'one_arm_place': next_state, reward, path, objs_in_collision = self.environment.apply_one_arm_place_action(action, node.obj, node.region, check_feasibility, parent_motion) elif which_operator == 'next_base_pose': next_state, reward, path, objs_in_collision = self.environment.apply_next_base_pose(action, node, check_feasibility, parent_motion) return next_state, reward, path, objs_in_collision """ def sample_action(self, node): action = node.sampling_agent.sample_next_point( node, self.n_feasibility_checks) return action
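# choose_action in the class above samples a fresh continuous action only while the number of
# child actions stays below progressive_widening_parameter * Nvisited, and additionally when the
# best known Q equals the infeasible reward or every child is already a goal node. A standalone,
# illustrative version of just that progressive-widening test (the infeasible_reward default of
# -2 mirrors the value used elsewhere in this file):
def is_time_to_sample_new_action(n_actions, n_visits, widening_parameter,
                                 best_q=None, infeasible_reward=-2):
    # Grow the action set only while it is small relative to the visit count,
    # or while the best action found so far still looks infeasible.
    time_to_sample = n_actions <= widening_parameter * n_visits
    if best_q is not None:
        time_to_sample = time_to_sample or best_q == infeasible_reward
    return time_to_sample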
class MCTS: def __init__(self, parameters, problem_env, goal_entities, v_fcn, learned_q): # MCTS parameters self.widening_parameter = parameters.widening_parameter self.ucb_parameter = parameters.ucb_parameter self.time_limit = parameters.timelimit self.n_motion_plan_trials = parameters.n_motion_plan_trials self.use_ucb = parameters.use_ucb self.use_progressive_widening = parameters.pw self.n_feasibility_checks = parameters.n_feasibility_checks self.use_v_fcn = parameters.use_learned_q self.use_shaped_reward = parameters.use_shaped_reward self.planning_horizon = parameters.planning_horizon self.sampling_strategy = parameters.sampling_strategy self.explr_p = parameters.explr_p self.switch_frequency = parameters.switch_frequency self.parameters = parameters self.v_fcn = v_fcn self.learned_q = learned_q # Hard-coded params self.check_reachability = True self.discount_rate = 1.0 # Environment setup self.problem_env = problem_env self.env = self.problem_env.env self.robot = self.problem_env.robot # MCTS initialization self.s0_node = None self.tree = MCTSTree(self.ucb_parameter) self.best_leaf_node = None self.goal_entities = goal_entities # Logging purpose self.search_time_to_reward = [] self.reward_lists = [] self.progress_list = [] self.found_solution = False self.swept_volume_constraint = None def load_pickled_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree = pickle.load(open(fname, 'r')) def visit_all_nodes(self, curr_node): children = curr_node.children.values() print curr_node in self.tree.nodes for c in children: self.visit_all_nodes(c) def save_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree.make_tree_picklable() pickle.dump(self.tree, open(fname, 'wb')) def load_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree = pickle.load(open(fname, 'r')) def get_node_at_idx(self, idx): for n in self.tree.nodes: if n.idx == idx: return n return None def create_sampling_agent(self, node): operator_skeleton = node.operator_skeleton if 'one_arm' in self.problem_env.name: dont_check_motion_existence = True else: dont_check_motion_existence = False abstract_state = node.state abstract_action = node.operator_skeleton place_region = self.problem_env.regions[ abstract_action.discrete_parameters['place_region']] if self.sampling_strategy == 'uniform': sampler = UniformSampler(place_region) generator = TwoArmPaPGenerator( abstract_state, abstract_action, sampler, n_parameters_to_try_motion_planning=self.n_motion_plan_trials, n_iter_limit=self.n_feasibility_checks, problem_env=self.problem_env, pick_action_mode='ir_parameters', place_action_mode='object_pose') elif self.sampling_strategy == 'voo': target_obj = abstract_action.discrete_parameters['object'] sampler = VOOSampler( target_obj, place_region, self.explr_p, self.problem_env.reward_function.worst_potential_value) generator = TwoArmVOOGenerator( abstract_state, abstract_action, sampler, n_parameters_to_try_motion_planning=self.n_motion_plan_trials, n_iter_limit=self.n_feasibility_checks, problem_env=self.problem_env, pick_action_mode='ir_parameters', place_action_mode='object_pose') else: raise NotImplementedError return generator def compute_state(self, parent_node, parent_action): print "Computing state..." 
if self.problem_env.is_goal_reached(): state = parent_node.state else: if parent_node is None: parent_state = None else: parent_state = parent_node.state if self.problem_env.name.find('one_arm') != -1: state = OneArmPaPState(self.problem_env, parent_state=parent_state, parent_action=parent_action, goal_entities=self.goal_entities) else: cache_file_name_for_debugging = './planners/mcts_cache_for_debug/pidx_{}_seed_{}.pkl'.format( self.parameters.pidx, self.parameters.planner_seed) is_root_node = parent_state is None cache_file_exists = os.path.isfile( cache_file_name_for_debugging) is_beomjoon_local_machine = False #socket.gethostname() == 'lab' if cache_file_exists and is_root_node and is_beomjoon_local_machine: state = pickle.load( open(cache_file_name_for_debugging, 'r')) state.make_plannable(self.problem_env) else: state = ShortestPathPaPState( self.problem_env, parent_state=parent_state, parent_action=parent_action, goal_entities=self.goal_entities, planner='mcts') if is_root_node and is_beomjoon_local_machine: state.make_pklable() pickle.dump(state, open(cache_file_name_for_debugging, 'wb')) state.make_plannable(self.problem_env) return state def get_current_state(self, parent_node, parent_action, is_parent_action_infeasible): # this needs to be factored # why do I need a parent node? Can I just get away with parent state? is_operator_skeleton_node = (parent_node is None) or ( not parent_node.is_operator_skeleton_node) if is_parent_action_infeasible: state = None elif is_operator_skeleton_node: state = self.compute_state(parent_node, parent_action) else: state = parent_node.state return state def create_node(self, parent_action, depth, parent_node, is_parent_action_infeasible, is_init_node=False): state_saver = utils.CustomStateSaver(self.problem_env.env) is_operator_skeleton_node = (parent_node is None) or ( not parent_node.is_operator_skeleton_node) state = self.get_current_state(parent_node, parent_action, is_parent_action_infeasible) if is_operator_skeleton_node: applicable_op_skeletons = self.problem_env.get_applicable_ops( parent_action) node = DiscreteTreeNodeWithPriorQ(state, self.ucb_parameter, depth, state_saver, is_operator_skeleton_node, is_init_node, applicable_op_skeletons, self.learned_q) else: node = ContinuousTreeNode(state, parent_action, self.ucb_parameter, depth, state_saver, is_operator_skeleton_node, is_init_node) node.sampling_agent = self.create_sampling_agent(node) node.parent = parent_node node.parent_action = parent_action return node @staticmethod def get_best_child_node(node): if len(node.children) == 0: return None else: best_child_action_idx = np.argmax(node.Q.values()) best_child_action = node.Q.keys()[best_child_action_idx] return node.children[best_child_action] def retrace_best_plan(self): plan = [] _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) curr_node = best_leaf_node while not curr_node.parent is None: plan.append(curr_node.parent_action) curr_node = curr_node.parent plan = plan[::-1] return plan, best_leaf_node def get_best_goal_node(self): leaves = self.tree.get_leaf_nodes() goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node] if len(goal_nodes) > 1: best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) else: curr_node = goal_nodes[0] return curr_node def switch_init_node(self, node): self.s0_node.is_init_node = False self.s0_node = node self.s0_node.is_init_node = True self.problem_env.reset_to_init_state(node) self.found_solution = False 
@staticmethod def choose_child_node_to_descend_to(node): # todo: implement the one with highest visitation if node.is_operator_skeleton_node and len(node.A) == 1: # descend to grand-child only_child_node = node.children.values()[0] best_action = only_child_node.Q.keys()[np.argmax( only_child_node.Q.values())] best_node = only_child_node.children[best_action] else: best_action = node.Q.keys()[np.argmax(node.Q.values())] best_node = node.children[best_action] return best_node def log_current_tree_to_dot_file(self, iteration, node_to_search_from): if socket.gethostname() == 'dell-XPS-15-9560': write_dot_file(self.tree, iteration, '', node_to_search_from) def log_performance(self, elapsed_time, history_n_objs_in_goal, n_feasibility_checks, iteration): self.search_time_to_reward.append([ elapsed_time, iteration, n_feasibility_checks['ik'], n_feasibility_checks['mp'], max(history_n_objs_in_goal) ]) def get_total_n_feasibility_checks(self): total_ik_checks = 0 total_mp_checks = 0 for n in self.tree.nodes: if n.sampling_agent is not None: total_ik_checks += n.sampling_agent.n_ik_checks total_mp_checks += n.sampling_agent.n_mp_checks return {'mp': total_mp_checks, 'ik': total_ik_checks} def is_time_to_switch(self, root_node): reached_frequency_limit = max( root_node.N.values()) >= self.switch_frequency has_enough_actions = True #root_node.is_operator_skeleton_node #or len(root_node.A) > 3 return reached_frequency_limit and has_enough_actions def get_node_to_switch_to(self, node_to_search_from): is_time_to_switch_node = self.is_time_to_switch(node_to_search_from) if not is_time_to_switch_node: return node_to_search_from if node_to_search_from.is_operator_skeleton_node: node_to_search_from = node_to_search_from.get_child_with_max_value( ) else: max_child = node_to_search_from.get_child_with_max_value() if max_child.parent_action.continuous_parameters['is_feasible']: node_to_search_from = node_to_search_from.get_child_with_max_value( ) return self.get_node_to_switch_to(node_to_search_from) def search(self, n_iter=np.inf, iteration_for_tree_logging=0, node_to_search_from=None, max_time=np.inf): depth = 0 elapsed_time = 0 if node_to_search_from is None: self.s0_node = self.create_node(None, depth=0, parent_node=None, is_parent_action_infeasible=False, is_init_node=True) self.tree.set_root_node(self.s0_node) node_to_search_from = self.s0_node if n_iter == np.inf: n_iter = 999999 new_trajs = [] plan = [] history_of_n_objs_in_goal = [] for iteration in range(1, n_iter): print '*****SIMULATION ITERATION %d' % iteration self.problem_env.reset_to_init_state(node_to_search_from) new_traj = [] stime = time.time() self.simulate(node_to_search_from, node_to_search_from, depth, new_traj) elapsed_time += time.time() - stime n_feasibility_checks = self.get_total_n_feasibility_checks() n_objs_in_goal = len( self.problem_env.get_objs_in_region('home_region')) history_of_n_objs_in_goal.append(n_objs_in_goal) self.log_performance(elapsed_time, history_of_n_objs_in_goal, n_feasibility_checks, iteration) print "Time {} n_feasible_checks {} max progress {}".format( elapsed_time, n_feasibility_checks['ik'], max(history_of_n_objs_in_goal)) #is_time_to_switch_node = self.is_time_to_switch(node_to_search_from) #if is_time_to_switch_node: # node_to_search_from = self.get_node_to_switch_to(node_to_search_from) if self.found_solution: print "Optimal score found" plan, _ = self.retrace_best_plan() break if elapsed_time > max_time: print "Time is up" break self.problem_env.reset_to_init_state(node_to_search_from) return 
self.search_time_to_reward, n_feasibility_checks, plan def get_best_trajectory(self, node_to_search_from, trajectories): traj_rewards = [] curr_node = node_to_search_from for trj in trajectories: traj_sum_reward = 0 for aidx, a in enumerate(trj): traj_sum_reward += np.power( self.discount_rate, aidx) * curr_node.reward_history[a][0] curr_node = curr_node.children[a] traj_rewards.append(traj_sum_reward) return trajectories[np.argmax(traj_rewards)], curr_node def choose_action(self, curr_node, depth): if curr_node.is_operator_skeleton_node: print "Skeleton node" # here, perform psa with the learned q action = curr_node.perform_ucb_over_actions() else: print 'Instance node' if curr_node.sampling_agent is None: # this happens if the tree has been pickled curr_node.sampling_agent = self.create_sampling_agent( curr_node) if not self.use_progressive_widening: w_param = self.widening_parameter else: w_param = self.widening_parameter * np.power(0.9, depth / 2) if not curr_node.is_reevaluation_step( w_param, self.problem_env.reward_function.infeasible_reward, self.use_progressive_widening, self.use_ucb): print "Sampling new action" new_continuous_parameters = self.sample_continuous_parameters( curr_node) curr_node.add_actions(new_continuous_parameters) action = curr_node.A[-1] else: print "Re-evaluation of actions" # todo I want UCB here if self.use_ucb: action = curr_node.perform_ucb_over_actions() else: action = curr_node.choose_new_arm() return action @staticmethod def update_goal_node_statistics(curr_node, reward): # todo rewrite this function curr_node.Nvisited += 1 curr_node.reward = reward def simulate(self, curr_node, node_to_search_from, depth, new_traj): if self.problem_env.reward_function.is_goal_reached(): if not curr_node.is_goal_and_already_visited: self.found_solution = True curr_node.is_goal_node = True print "Solution found, returning the goal reward", self.problem_env.reward_function.goal_reward self.update_goal_node_statistics( curr_node, self.problem_env.reward_function.goal_reward) return self.problem_env.reward_function.goal_reward if depth == self.planning_horizon: print "Depth limit reached" return 0 if DEBUG: print "At depth ", depth print "Is it time to pick?", self.problem_env.is_pick_time() action = self.choose_action(curr_node, depth) is_action_feasible = self.apply_action(curr_node, action) if not curr_node.is_action_tried(action): next_node = self.create_node(action, depth + 1, curr_node, not is_action_feasible) self.tree.add_node(next_node, action, curr_node) reward = self.problem_env.reward_function(curr_node.state, next_node.state, action, depth) next_node.parent_action_reward = reward next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward else: next_node = curr_node.children[action] reward = next_node.parent_action_reward print "Reward", reward if not is_action_feasible: # this (s,a) is a dead-end print "Infeasible action" if self.use_v_fcn: sum_rewards = reward + curr_node.parent.v_fcn[ curr_node.parent_action] print sum_rewards else: sum_rewards = reward else: sum_rewards = reward + self.discount_rate * self.simulate( next_node, node_to_search_from, depth + 1, new_traj) curr_node.update_node_statistics(action, sum_rewards, reward) if curr_node == node_to_search_from and curr_node.parent is not None: self.update_ancestor_node_statistics(curr_node.parent, curr_node.parent_action, sum_rewards) return sum_rewards def update_ancestor_node_statistics(self, node, action, child_sum_rewards): if node is None: return 
parent_reward_to_node = node.reward_history[action][0] parent_sum_rewards = parent_reward_to_node + child_sum_rewards # rwd up to parent + rwd from child to leaf node.update_node_statistics(action, parent_sum_rewards, parent_reward_to_node) self.update_ancestor_node_statistics(node.parent, node.parent_action, parent_sum_rewards) def apply_action(self, node, action): if node.is_operator_skeleton_node: print "Applying skeleton", action.type, action.discrete_parameters['object'], \ action.discrete_parameters['place_region'] is_feasible = self.problem_env.apply_operator_skeleton( node.state, action) else: is_feasible = self.problem_env.apply_operator_instance( node.state, action, self.check_reachability) if is_feasible: print "Applying instance", action.discrete_parameters[ 'object'], action.discrete_parameters[ 'place_region'], action.continuous_parameters['place'][ 'q_goal'] else: print "Applying infeasible instance", action.discrete_parameters[ 'object'], action.discrete_parameters['place_region'] return is_feasible def sample_continuous_parameters(self, node): if self.problem_env.name.find('one_arm') != -1: raise NotImplementedError else: if self.sampling_strategy == 'voo': action_parameters = [ np.hstack([ a.continuous_parameters['pick']['action_parameters'], a.continuous_parameters['place']['action_parameters'] ]) for a in node.Q.keys() ] q_values = node.Q.values() # todo # save all the mp values that did not work out smpled_param = node.sampling_agent.sample_next_point( action_parameters, q_values) if not smpled_param[ 'is_feasible'] and self.sampling_strategy == 'voo': node.sampling_agent.update_mp_infeasible_samples( smpled_param['samples']) else: smpled_param = node.sampling_agent.sample_next_point() node.needs_to_sample = False return smpled_param
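# simulate() above backs up sum_rewards = reward + discount_rate * (return of the subtree) via
# curr_node.update_node_statistics, whose body lives on the tree-node classes rather than in
# this file; the other MCTS variants in this file implement it as a running average over the
# backed-up returns. A minimal, illustrative standalone version of that update, assuming a node
# exposing Nvisited, N, Q and reward_history dictionaries keyed by hashable actions:
def running_average_backup(node, action, sum_rewards, reward):
    # Record one backed-up return for `action` and refresh its mean Q value.
    node.Nvisited += 1
    if action not in node.N:
        node.N[action] = 1
        node.Q[action] = sum_rewards
        node.reward_history[action] = [reward]
    else:
        node.N[action] += 1
        node.reward_history[action].append(reward)
        # incremental mean: Q <- Q + (G - Q) / N
        node.Q[action] += (sum_rewards - node.Q[action]) / float(node.N[action])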
class MCTS: def __init__(self, widening_parameter, exploration_parameters, sampling_strategy, sampling_strategy_exploration_parameter, c1, n_feasibility_checks, environment, use_progressive_widening, use_ucb, use_max_backup, pick_switch, voo_sampling_mode, voo_counter_ratio, n_switch): self.c1 = c1 self.widening_parameter = widening_parameter self.exploration_parameters = exploration_parameters self.time_limit = np.inf self.environment = environment if self.environment.name == 'convbelt': self.discount_rate = 0.99 # do we need this? else: self.discount_rate = 0.9 if self.environment.name.find('synthetic') != -1: self.depth_limit = 20 else: self.depth_limit = np.inf self.sampling_strategy = sampling_strategy self.sampling_strategy_exploration_parameter = sampling_strategy_exploration_parameter self.use_progressive_widening = use_progressive_widening self.voo_sampling_mode = voo_sampling_mode self.voo_counter_ratio = voo_counter_ratio self.use_ucb = use_ucb self.n_switch = n_switch self.n_switch_original = self.n_switch self.use_max_backup = use_max_backup self.pick_switch = pick_switch self.env = self.environment.env self.robot = self.environment.robot self.s0_node = self.create_node(None, depth=0, reward=0, is_init_node=True) self.original_s0_node = self.s0_node self.tree = MCTSTree(self.s0_node, self.exploration_parameters) self.found_solution = False self.goal_reward = 2 self.infeasible_reward = -2 self.n_feasibility_checks = n_feasibility_checks def create_sampling_agent(self, node, operator_skeleton): operator_name = operator_skeleton.type if operator_name == 'two_arm_pick' and self.environment.name == 'convbelt': return PreSampledPickGenerator() if self.sampling_strategy == 'unif': return UniformGenerator(operator_name, self.environment) elif self.sampling_strategy == 'voo': return VOOGenerator(operator_name, self.environment, self.sampling_strategy_exploration_parameter, self.c1, self.voo_sampling_mode, self.voo_counter_ratio) elif self.sampling_strategy == 'gpucb': return GPUCBGenerator(operator_name, self.environment, self.sampling_strategy_exploration_parameter) elif self.sampling_strategy == 'doo': return DOOGenerator(operator_skeleton, self.environment, self.sampling_strategy_exploration_parameter) elif self.sampling_strategy == 'randomized_doo': return RandomizedDOOGenerator( operator_skeleton, self.environment, self.sampling_strategy_exploration_parameter) else: print "Wrong sampling strategy" return -1 def create_node(self, parent_action, depth, reward, is_init_node): if self.environment.is_goal_reached(): operator_skeleton = None else: operator_skeleton = self.environment.get_applicable_op_skeleton( parent_action) state_saver = CustomStateSaver(self.environment.env) node = TreeNode(operator_skeleton, self.exploration_parameters, depth, state_saver, self.sampling_strategy, is_init_node, self.depth_limit) if not self.environment.is_goal_reached(): node.sampling_agent = self.create_sampling_agent( node, operator_skeleton) node.objects_not_in_goal = self.environment.objects_currently_not_in_goal node.parent_action_reward = reward node.parent_action = parent_action return node def retrace_best_plan(self): plan = [] _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) curr_node = best_leaf_node while not curr_node.parent is None: plan.append(curr_node.parent_action) curr_node = curr_node.parent plan = plan[::-1] return plan def get_best_goal_node(self): leaves = self.tree.get_leaf_nodes() goal_nodes = [leaf for leaf in leaves if 
leaf.is_goal_node] if len(goal_nodes) > 1: best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) else: curr_node = goal_nodes[0] return curr_node def switch_init_node(self, node): self.s0_node.is_init_node = False self.s0_node = node self.s0_node.is_init_node = True self.environment.reset_to_init_state(node) self.found_solution = False def log_current_tree_to_dot_file(self, iteration): if socket.gethostname() == 'dell-XPS-15-9560': write_dot_file(self.tree, iteration, '') def is_time_to_switch_initial_node(self): if self.environment.name.find('synth') != -1: n_feasible_actions = np.sum([ self.environment.is_action_feasible(a) for a in self.s0_node.A ]) if n_feasible_actions > self.n_switch: return True else: return False if self.s0_node.is_goal_node: return True is_pick_node = self.s0_node.operator_skeleton.type == 'two_arm_pick' if len(self.s0_node.Q) == 0: n_feasible_actions = 0 else: root_node_reward_history = self.s0_node.reward_history.values() root_node_reward_history = np.array( [np.max(R) for R in root_node_reward_history]) n_feasible_actions = np.sum(root_node_reward_history >= 0) if is_pick_node: if self.pick_switch: we_evaluated_the_node_enough = n_feasible_actions >= self.n_switch else: we_evaluated_the_node_enough = n_feasible_actions > 0 else: we_evaluated_the_node_enough = n_feasible_actions >= self.n_switch return we_evaluated_the_node_enough def choose_child_node_to_descend_to(self): is_child_goal_node = np.any( [c.is_goal_node for c in self.s0_node.children.values()]) if is_child_goal_node: best_node = self.tree.root self.n_switch += self.n_switch else: feasible_actions = [ a for a in self.s0_node.A if np.max(self.s0_node.reward_history[a]) > -2 ] feasible_q_values = [self.s0_node.Q[a] for a in feasible_actions] best_action = feasible_actions[np.argmax(feasible_q_values)] best_node = self.s0_node.children[best_action] return best_node def search(self, n_iter=100, max_time=np.inf): depth = 0 time_to_search = 0 search_time_to_reward = [] plan = None self.n_iter = n_iter for iteration in range(n_iter): print '*****SIMULATION ITERATION %d' % iteration print '*****Root node idx %d' % self.s0_node.idx self.environment.reset_to_init_state(self.s0_node) if self.is_time_to_switch_initial_node(): print "Switching root node!" 
if self.s0_node.A[0].type == 'two_arm_place': # self.s0_node.store_node_information(self.environment.name) # visualize_base_poses_and_q_values(self.s0_node.Q, self.environment) pass best_child_node = self.choose_child_node_to_descend_to() self.switch_init_node(best_child_node) stime = time.time() self.simulate(self.s0_node, depth) time_to_search += time.time() - stime """ tmp = [] if np.any([a.continuous_parameters['is_feasible'] for a in self.s0_node.A]): feasible_action = [a for a in self.s0_node.A if a.continuous_parameters['is_feasible']][0] #self.log_current_tree_to_dot_file(iteration) tmp.append(self.s0_node.Q[feasible_action]) import pdb;pdb.set_trace() """ # self.log_current_tree_to_dot_file(iteration) best_traj_rwd, progress, best_node = self.tree.get_best_trajectory_sum_rewards_and_node( self.discount_rate) search_time_to_reward.append( [time_to_search, iteration, best_traj_rwd, len(progress)]) plan = self.retrace_best_plan() # rewards = np.array([np.max(rlist) for rlist in self.s0_node.reward_history.values()]) n_feasible = np.sum( [self.s0_node.is_action_feasible(a) for a in self.s0_node.A]) print 'n feasible actions , n_switch ', n_feasible, self.n_switch print search_time_to_reward[-1], np.argmax( np.array(search_time_to_reward)[:, 2]) if time_to_search > max_time: break self.environment.reset_to_init_state(self.s0_node) return search_time_to_reward, self.s0_node.best_v, plan def choose_action(self, curr_node, depth): if not self.use_progressive_widening: is_synthetic = self.environment.name.find('synthetic') != -1 is_convbelt = self.environment.name.find('convbelt') != -1 is_mdr = self.environment.name.find( 'minimum_displacement_removal') != -1 if is_synthetic: w_param = self.widening_parameter * np.power(0.8, depth) elif is_mdr: w_param = self.widening_parameter * np.power(0.99, depth) elif is_convbelt: w_param = self.widening_parameter * np.power(0.99, depth) else: w_param = self.widening_parameter print "Widening parameter ", w_param if not curr_node.is_reevaluation_step( w_param, self.environment.infeasible_reward, self.use_progressive_widening, self.use_ucb): print "Is time to sample new action? True" new_continuous_parameters = self.sample_continuous_parameters( curr_node) curr_node.add_actions(new_continuous_parameters) action = curr_node.A[-1] else: print "Re-evaluation? 
True" if self.use_ucb: action = curr_node.perform_ucb_over_actions() else: action = curr_node.choose_new_arm() return action def update_node_statistics(self, curr_node, action, sum_rewards, reward): # todo rewrite this function curr_node.Nvisited += 1 is_action_never_tried = curr_node.N[action] == 0 if is_action_never_tried: curr_node.reward_history[action] = [reward] curr_node.N[action] += 1 curr_node.Q[action] = sum_rewards else: curr_node.reward_history[action].append(reward) curr_node.N[action] += 1 if self.environment.name.find('synthetic') == -1: if self.use_max_backup: if sum_rewards > curr_node.Q[action]: curr_node.Q[action] = sum_rewards else: curr_node.Q[action] += (sum_rewards - curr_node.Q[action] ) / float(curr_node.N[action]) else: if self.use_max_backup: if sum_rewards > curr_node.Q[action]: curr_node.Q[action] = sum_rewards else: curr_node.Q[action] += (sum_rewards - curr_node.Q[action] ) / float(curr_node.N[action]) @staticmethod def update_goal_node_statistics(curr_node, reward): # todo rewrite this function curr_node.Nvisited += 1 curr_node.reward = reward def visualize_samples_from_sampling_agent(self, node): action, status, doo_node, action_parameters = node.sampling_agent.sample_feasible_action( node, self.n_feasibility_checks) def simulate(self, curr_node, depth): print "Curr node idx", curr_node.idx if self.environment.is_goal_reached(): # arrived at the goal state if not curr_node.is_goal_and_already_visited: # todo mark the goal trajectory, and don't revisit the actions on the goal trajectory self.found_solution = True curr_node.is_goal_node = True print "Solution found, returning the goal reward", self.goal_reward self.update_goal_node_statistics(curr_node, self.goal_reward) return self.goal_reward if depth == self.depth_limit: if len(curr_node.parent.reward_history) > 0: print np.max(curr_node.parent.reward_history.values()) return 0 if DEBUG: print "At depth ", depth print "Is it time to pick?", self.environment.is_pick_time() action = self.choose_action(curr_node, depth) reward = self.environment.apply_operator_instance(action, curr_node) print "Executed ", action.type, action.continuous_parameters[ 'is_feasible'], action.discrete_parameters print "reward ", reward if not curr_node.is_action_tried(action): next_node = self.create_node(action, depth + 1, reward, is_init_node=False) self.tree.add_node(next_node, action, curr_node) next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward else: next_node = curr_node.children[action] is_infeasible_action = self.is_simulated_action_infeasible( reward, action) if is_infeasible_action: sum_rewards = reward else: sum_rewards = reward + self.discount_rate * self.simulate( next_node, depth + 1) self.update_node_statistics(curr_node, action, sum_rewards, reward) if curr_node.is_init_node and curr_node.parent is not None: self.update_ancestor_node_statistics(curr_node.parent, curr_node.parent_action, sum_rewards) return sum_rewards def is_simulated_action_infeasible(self, reward, action): if self.environment.name.find('synthetic') != -1: return not self.environment.is_action_feasible(action) else: return reward == self.environment.infeasible_reward def update_ancestor_node_statistics(self, node, action, child_sum_rewards): if node is None: return parent_reward_to_node = node.reward_history[action][0] parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards self.update_node_statistics(node, action, parent_sum_rewards, parent_reward_to_node) 
        self.update_ancestor_node_statistics(node.parent, node.parent_action, parent_sum_rewards)

    def sample_continuous_parameters(self, node):
        return node.sampling_agent.sample_next_point(node, self.n_feasibility_checks)
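# update_node_statistics in the class above supports two backup rules: with use_max_backup the
# stored Q only ever increases to the best return seen for that action, otherwise it is the same
# running average used by the other variants. A side-by-side, illustrative sketch of the two
# rules for a single action:
def backup_q(q_old, n_visits, sum_rewards, use_max_backup):
    # q_old: current Q estimate; n_visits: visit count including this backup;
    # sum_rewards: the newly backed-up discounted return for this action.
    if use_max_backup:
        return max(q_old, sum_rewards)
    return q_old + (sum_rewards - q_old) / float(n_visits)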
class MCTS: def __init__(self, parameters, problem_env, goal_entities, v_fcn, learned_q): # MCTS parameters self.widening_parameter = parameters.widening_parameter self.ucb_parameter = parameters.ucb_parameter self.time_limit = parameters.timelimit self.n_motion_plan_trials = parameters.n_motion_plan_trials self.use_ucb = parameters.use_ucb self.use_progressive_widening = parameters.pw self.n_feasibility_checks = parameters.n_feasibility_checks self.use_v_fcn = parameters.use_learned_q self.v_fcn = v_fcn self.learned_q = learned_q self.use_shaped_reward = parameters.use_shaped_reward self.planning_horizon = parameters.planning_horizon self.sampling_strategy = parameters.sampling_strategy self.explr_p = parameters.explr_p self.switch_frequency = parameters.switch_frequency # Hard-coded params self.check_reachability = True self.discount_rate = 1.0 # Environment setup self.problem_env = problem_env self.env = self.problem_env.env self.robot = self.problem_env.robot # MCTS initialization self.s0_node = None self.tree = MCTSTree(self.ucb_parameter) self.best_leaf_node = None self.goal_entities = goal_entities # Logging purpose self.search_time_to_reward = [] self.reward_lists = [] self.progress_list = [] self.found_solution = False self.swept_volume_constraint = None def load_pickled_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree = pickle.load(open(fname, 'r')) def visit_all_nodes(self, curr_node): children = curr_node.children.values() print curr_node in self.tree.nodes for c in children: self.visit_all_nodes(c) def save_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree.make_tree_picklable() pickle.dump(self.tree, open(fname, 'wb')) def load_tree(self, fname=None): if fname is None: fname = 'tmp_tree.pkl' self.tree = pickle.load(open(fname, 'r')) def get_node_at_idx(self, idx): for n in self.tree.nodes: if n.idx == idx: return n return None def create_sampling_agent(self, node): operator_skeleton = node.operator_skeleton if 'one_arm' in self.problem_env.name: dont_check_motion_existence = True else: dont_check_motion_existence = False if self.sampling_strategy == 'uniform': generator = UniformPaPGenerator(node, operator_skeleton, self.problem_env, None, n_candidate_params_to_smpl=self.n_motion_plan_trials, total_number_of_feasibility_checks=self.n_feasibility_checks, dont_check_motion_existence=dont_check_motion_existence) elif self.sampling_strategy == 'voo': return PaPVOOGenerator(node, operator_skeleton, self.problem_env, None, self.n_feasibility_checks, self.n_motion_plan_trials, dont_check_motion_existence, explr_p=self.explr_p, c1=1, sampling_mode='gaussian', counter_ratio=1) else: raise NotImplementedError return generator def compute_state(self, parent_node, parent_action): if self.problem_env.is_goal_reached(): state = parent_node.state else: if parent_node is None: parent_state = None else: parent_state = parent_node.state # where is the parent state? if self.problem_env.name.find('one_arm') != -1: state = OneArmPaPState(self.problem_env, parent_state=parent_state, parent_action=parent_action, goal_entities=self.goal_entities) else: if socket.gethostname() == 'dell-XPS-15-9560': if parent_node is None: idx = -1 else: idx = parent_node.idx fname = './tmp_%d.pkl' % idx if os.path.isfile(fname): state = pickle.load(open(fname, 'r')) state.make_plannable(self.problem_env) else: state = ShortestPathPaPState(self.problem_env, # what's this? 
parent_state=parent_state, parent_action=parent_action, goal_entities=self.goal_entities, planner='mcts') state.make_pklable() pickle.dump(state, open(fname, 'wb')) state.make_plannable(self.problem_env) else: state = ShortestPathPaPState(self.problem_env, # what's this? parent_state=parent_state, parent_action=parent_action, goal_entities=self.goal_entities, planner='mcts') return state def get_current_state(self, parent_node, parent_action, is_parent_action_infeasible): # this needs to be factored # why do I need a parent node? Can I just get away with parent state? is_operator_skeleton_node = (parent_node is None) or (not parent_node.is_operator_skeleton_node) if is_parent_action_infeasible: state = None elif is_operator_skeleton_node: state = self.compute_state(parent_node, parent_action) else: state = parent_node.state return state def create_node(self, parent_action, depth, parent_node, is_parent_action_infeasible, is_init_node=False): state_saver = utils.CustomStateSaver(self.problem_env.env) is_operator_skeleton_node = (parent_node is None) or (not parent_node.is_operator_skeleton_node) state = self.get_current_state(parent_node, parent_action, is_parent_action_infeasible) if is_operator_skeleton_node: applicable_op_skeletons = self.problem_env.get_applicable_ops(parent_action) node = DiscreteTreeNodeWithPriorQ(state, self.ucb_parameter, depth, state_saver, is_operator_skeleton_node, is_init_node, applicable_op_skeletons, self.learned_q) else: node = ContinuousTreeNode(state, parent_action, self.ucb_parameter, depth, state_saver, is_operator_skeleton_node, is_init_node) node.sampling_agent = self.create_sampling_agent(node) node.parent = parent_node node.parent_action = parent_action return node @staticmethod def get_best_child_node(node): if len(node.children) == 0: return None else: best_child_action_idx = np.argmax(node.Q.values()) best_child_action = node.Q.keys()[best_child_action_idx] return node.children[best_child_action] def retrace_best_plan(self): plan = [] _, _, best_leaf_node = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate) curr_node = best_leaf_node while not curr_node.parent is None: plan.append(curr_node.parent_action) curr_node = curr_node.parent plan = plan[::-1] return plan, best_leaf_node def get_best_goal_node(self): leaves = self.tree.get_leaf_nodes() goal_nodes = [leaf for leaf in leaves if leaf.is_goal_node] if len(goal_nodes) > 1: best_traj_reward, curr_node, _ = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate) else: curr_node = goal_nodes[0] return curr_node def switch_init_node(self, node): self.s0_node.is_init_node = False self.s0_node = node self.s0_node.is_init_node = True self.problem_env.reset_to_init_state(node) self.found_solution = False @staticmethod def choose_child_node_to_descend_to(node): # todo: implement the one with highest visitation if node.is_operator_skeleton_node and len(node.A) == 1: # descend to grand-child only_child_node = node.children.values()[0] best_action = only_child_node.Q.keys()[np.argmax(only_child_node.Q.values())] best_node = only_child_node.children[best_action] else: best_action = node.Q.keys()[np.argmax(node.Q.values())] best_node = node.children[best_action] return best_node def log_current_tree_to_dot_file(self, iteration, node_to_search_from): if socket.gethostname() == 'dell-XPS-15-9560': write_dot_file(self.tree, iteration, '', node_to_search_from) def log_performance(self, time_to_search, iteration): best_traj_rwd, progress, best_node = 
self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate) self.search_time_to_reward.append([time_to_search, iteration, best_traj_rwd, progress, self.found_solution]) self.progress_list.append(progress) self.best_leaf_node = best_node def is_optimal_solution_found(self): best_traj_rwd, best_node, reward_list = self.tree.get_best_trajectory_sum_rewards_and_node(self.discount_rate) if self.found_solution: return True # if self.problem_env.reward_function.is_optimal_plan_found(best_traj_rwd): # print "Optimal score found" # return True # else: # return False else: return False def search(self, n_iter=np.inf, iteration_for_tree_logging=0, node_to_search_from=None, max_time=np.inf): depth = 0 time_to_search = 0 if node_to_search_from is None: self.s0_node = self.create_node(None, depth=0, parent_node=None, is_parent_action_infeasible=False, is_init_node=True) self.tree.set_root_node(self.s0_node) node_to_search_from = self.s0_node new_trajs = [] plan = [] if n_iter == np.inf: n_iter = 999999 for iteration in range(1, n_iter): print '*****SIMULATION ITERATION %d' % iteration self.problem_env.reset_to_init_state(node_to_search_from) new_traj = [] stime = time.time() self.simulate(node_to_search_from, node_to_search_from, depth, new_traj) time_to_search += time.time() - stime new_trajs.append(new_traj) # note that I need to evaluate all actions in a node to switch is_time_to_switch_node = iteration % self.switch_frequency == 0 # and the node should be feasible # I have to have a feasible action to switch if this is an instance node if is_time_to_switch_node: if node_to_search_from.is_operator_skeleton_node: node_to_search_from = node_to_search_from.get_child_with_max_value() else: max_child = node_to_search_from.get_child_with_max_value() if max_child.parent_action.continuous_parameters['is_feasible']: node_to_search_from = node_to_search_from.get_child_with_max_value() if iteration % 10 == 0: self.log_current_tree_to_dot_file(iteration_for_tree_logging + iteration, node_to_search_from) self.log_performance(time_to_search, iteration) print self.search_time_to_reward[iteration_for_tree_logging:] # break if the solution is found if self.is_optimal_solution_found(): print "Optimal score found" plan, _ = self.retrace_best_plan() break if time_to_search > max_time: print "Time is up" break self.problem_env.reset_to_init_state(node_to_search_from) return self.search_time_to_reward, plan def get_best_trajectory(self, node_to_search_from, trajectories): traj_rewards = [] curr_node = node_to_search_from for trj in trajectories: traj_sum_reward = 0 for aidx, a in enumerate(trj): traj_sum_reward += np.power(self.discount_rate, aidx) * curr_node.reward_history[a][0] curr_node = curr_node.children[a] traj_rewards.append(traj_sum_reward) return trajectories[np.argmax(traj_rewards)], curr_node def choose_action(self, curr_node): if curr_node.is_operator_skeleton_node: print "Skeleton node" # here, perform psa with the learned q action = curr_node.perform_ucb_over_actions() else: print 'Instance node' if curr_node.sampling_agent is None: # this happens if the tree has been pickled curr_node.sampling_agent = self.create_sampling_agent(curr_node) if not curr_node.is_reevaluation_step(self.widening_parameter, self.problem_env.reward_function.infeasible_reward, self.use_progressive_widening, self.use_ucb): print "Sampling new action" new_continuous_parameters = self.sample_continuous_parameters(curr_node) curr_node.add_actions(new_continuous_parameters) action = curr_node.A[-1] else: print "Re-evaluation 
of actions" if self.use_ucb: action = curr_node.perform_ucb_over_actions() else: action = curr_node.choose_new_arm() return action @staticmethod def update_goal_node_statistics(curr_node, reward): # todo rewrite this function curr_node.Nvisited += 1 curr_node.reward = reward def simulate(self, curr_node, node_to_search_from, depth, new_traj): if self.problem_env.reward_function.is_goal_reached(): if not curr_node.is_goal_and_already_visited: self.found_solution = True curr_node.is_goal_node = True print "Solution found, returning the goal reward", self.problem_env.reward_function.goal_reward self.update_goal_node_statistics(curr_node, self.problem_env.reward_function.goal_reward) return self.problem_env.reward_function.goal_reward if depth == self.planning_horizon: # would it ever get here? why does it not satisfy the goal? print "Depth limit reached" return 0 if DEBUG: print "At depth ", depth print "Is it time to pick?", self.problem_env.is_pick_time() action = self.choose_action(curr_node) is_action_feasible = self.apply_action(curr_node, action) if not curr_node.is_action_tried(action): next_node = self.create_node(action, depth + 1, curr_node, not is_action_feasible) self.tree.add_node(next_node, action, curr_node) reward = self.problem_env.reward_function(curr_node.state, next_node.state, action, depth) next_node.parent_action_reward = reward next_node.sum_ancestor_action_rewards = next_node.parent.sum_ancestor_action_rewards + reward else: next_node = curr_node.children[action] reward = next_node.parent_action_reward print "Reward", reward if not is_action_feasible: # this (s,a) is a dead-end print "Infeasible action" if self.use_v_fcn: sum_rewards = reward + curr_node.parent.v_fcn[curr_node.parent_action] print sum_rewards else: sum_rewards = reward else: sum_rewards = reward + self.discount_rate * self.simulate(next_node, node_to_search_from, depth + 1, new_traj) curr_node.update_node_statistics(action, sum_rewards, reward) if curr_node == node_to_search_from and curr_node.parent is not None: self.update_ancestor_node_statistics(curr_node.parent, curr_node.parent_action, sum_rewards) # todo return a plan return sum_rewards def update_ancestor_node_statistics(self, node, action, child_sum_rewards): if node is None: return parent_reward_to_node = node.reward_history[action][0] parent_sum_rewards = parent_reward_to_node + self.discount_rate * child_sum_rewards node.update_node_statistics(action, parent_sum_rewards, parent_reward_to_node) self.update_ancestor_node_statistics(node.parent, node.parent_action, parent_sum_rewards) def apply_action(self, node, action): if node.is_operator_skeleton_node: print "Applying skeleton", action.type, action.discrete_parameters['object'], \ action.discrete_parameters['region'] is_feasible = self.problem_env.apply_operator_skeleton(node.state, action) else: print "Applying instance", action.type, action.discrete_parameters['object'], action.discrete_parameters[ 'region'] is_feasible = self.problem_env.apply_operator_instance(node.state, action, self.check_reachability) return is_feasible def sample_continuous_parameters(self, node): if self.problem_env.name.find('one_arm') != -1: raise NotImplementedError else: if isinstance(node.state, StateWithoutCspacePredicates): current_collides = None else: current_collides = node.state.collisions_at_all_obj_pose_pairs smpled_param = node.sampling_agent.sample_next_point(cached_collisions=current_collides, cached_holding_collisions=None) return smpled_param