def get_node_hcount(self, node):
    # Heuristic value of a node: the number of objects that still need to be moved.
    # A node whose state is None corresponds to an infeasible parent action, so fall
    # back to the parent's state when computing the count.
    is_infeasible_parent_action = node.state is None
    if is_infeasible_parent_action:
        return len(get_objects_to_move(node.parent.state, node.parent.state.problem_env))
    else:
        return len(get_objects_to_move(node.state, node.state.problem_env))
def __init__(self, state, ucb_parameter, depth, state_saver, is_operator_skeleton_node, is_init_node,
             actions, learned_q):
    # The action prior (psa) is based on the number of objects to move.
    DiscreteTreeNode.__init__(self, state, ucb_parameter, depth, state_saver, is_operator_skeleton_node,
                              is_init_node, actions, learned_q)
    is_infeasible_state = self.state is None
    if is_infeasible_state:
        for a in self.A:
            self.Q[a] = 0
    else:
        # Initialize every action's Q value to the negative heuristic: the number of
        # objects that still need to be moved in this state.
        objs_to_move = get_objects_to_move(self.state, self.state.problem_env)
        for a in self.A:
            self.Q[a] = -len(objs_to_move)
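# Both get_node_hcount above and the Q initialization in __init__ use the same heuristic,
# h(s) = number of objects that still need to be moved. A minimal, self-contained toy
# illustration of that initialization (the names below are hypothetical and not part of
# the planner): every action starts at Q = -h(s), so states that require moving fewer
# objects look more promising before any simulation has been run.
def _toy_q_initialization(actions, n_objs_to_move):
    # Mirrors the loop in __init__ above: a pessimistic prior of -h(s) for every action.
    return {a: -n_objs_to_move for a in actions}

# _toy_q_initialization(['pick_and_place_obj1', 'pick_and_place_obj2'], 2)
# -> {'pick_and_place_obj1': -2, 'pick_and_place_obj2': -2}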
def get_action_with_highest_ucb_value(self, actions, q_values):
    best_value = -np.inf

    if self.learned_q is not None:
        # todo make this more efficient by calling predict_with_raw_*
        init_q_values = [self.learned_q.predict(self.state, a) for a in actions]
        exp_sum = np.sum([np.exp(q) for q in init_q_values])
    else:
        # Hand-designed prior: an action gets value 1 if its object is reachable, the
        # (object, region) pair is manipulation-free, and the object still needs to be moved.
        objects_to_move = get_objects_to_move(self.state, self.state.problem_env)
        init_q_values = []
        for a in actions:
            obj_name = a.discrete_parameters['object']
            region_name = a.discrete_parameters['region']
            o_reachable = self.state.is_entity_reachable(obj_name)
            o_r_manip_free = self.state.binary_edges[(obj_name, region_name)][-1]
            o_needs_to_be_moved = obj_name in objects_to_move
            if o_reachable and o_r_manip_free and o_needs_to_be_moved:
                val = 1
            else:
                val = 0
            init_q_values.append(val)
        exp_sum = np.sum([np.exp(q) for q in init_q_values])

    action_ucb_values = []
    for action, value, learned_value in zip(actions, q_values, init_q_values):
        # Softmax-normalized prior value, added as a bonus on top of the Q value and the UCB term.
        q_bonus = np.exp(learned_value) / float(exp_sum)
        ucb_value = value + q_bonus + self.compute_ucb_value(action)
        obj_name = action.discrete_parameters['object']
        region_name = action.discrete_parameters['region']
        print "%30s %30s Reachable? %d ManipFree? %d IsGoal? %d Q? %.5f QBonus? %.5f Q+UCB? %.5f" \
              % (obj_name, region_name, self.state.is_entity_reachable(obj_name),
                 self.state.binary_edges[(obj_name, region_name)][-1],
                 obj_name in self.state.goal_entities, self.Q[action], q_bonus, ucb_value)
        action_ucb_values.append(ucb_value)
        if ucb_value > best_value:
            best_value = ucb_value

    # Break ties among the actions with the highest UCB value uniformly at random.
    boolean_idxs_with_highest_ucb = (np.max(action_ucb_values) == action_ucb_values).squeeze()
    best_action_idx = np.random.randint(np.sum(boolean_idxs_with_highest_ucb))
    best_action = np.array(actions)[boolean_idxs_with_highest_ucb][best_action_idx]
    return best_action
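# A minimal sketch of the scoring rule used in get_action_with_highest_ucb_value, pulled
# out of the node class so it can be read in isolation. The function name and the toy
# inputs are assumptions for illustration only: each action's score is its current Q
# estimate plus a softmax-normalized prior bonus plus whatever exploration term the
# caller supplies.
import numpy as np

def score_actions(q_values, prior_q_values, exploration_bonuses):
    # Softmax over the prior q-values, matching the np.exp / exp_sum computation above.
    exp_priors = np.exp(prior_q_values)
    q_bonuses = exp_priors / exp_priors.sum()
    return np.asarray(q_values) + q_bonuses + np.asarray(exploration_bonuses)

# Example: with priors [1, 0, 0] (only the first action is reachable, manipulation-free,
# and still needs to be moved), the first action receives the largest bonus
# (about 0.58 versus 0.21 for the others).
# score_actions([-2.0, -2.0, -2.0], [1.0, 0.0, 0.0], [0.5, 0.5, 0.5])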
def main():
    parameters = parse_mover_problem_parameters()
    filename = 'pidx_%d_planner_seed_%d.pkl' % (parameters.pidx, parameters.planner_seed)
    save_dir = make_and_get_save_dir(parameters, filename)

    set_seed(parameters.pidx)
    problem_env = PaPMoverEnv(parameters.pidx)

    goal_object_names = [obj.GetName() for obj in problem_env.objects[:parameters.n_objs_pack]]
    goal_region_name = [problem_env.regions['home_region'].name]
    goal = goal_region_name + goal_object_names
    problem_env.set_goal(goal)
    goal_entities = goal_object_names + goal_region_name

    if parameters.use_shaped_reward:
        reward_function = ShapedRewardFunction(problem_env, goal_object_names, goal_region_name[0],
                                               parameters.planning_horizon)
    else:
        reward_function = GenericRewardFunction(problem_env, goal_object_names, goal_region_name[0],
                                                parameters.planning_horizon)

    motion_planner = OperatorBaseMotionPlanner(problem_env, 'prm')
    problem_env.set_reward_function(reward_function)
    problem_env.set_motion_planner(motion_planner)

    learned_q = None
    prior_q = None
    if parameters.use_learned_q:
        learned_q = load_learned_q(parameters, problem_env)

    # Heuristic value function: the negative number of objects that still need to be moved.
    v_fcn = lambda state: -len(get_objects_to_move(state, problem_env))

    if parameters.planner == 'mcts':
        planner = MCTS(parameters, problem_env, goal_entities, prior_q, learned_q)
    elif parameters.planner == 'mcts_with_leaf_strategy':
        planner = MCTSWithLeafStrategy(parameters, problem_env, goal_entities, v_fcn, learned_q)
    else:
        raise NotImplementedError

    set_seed(parameters.planner_seed)
    search_time_to_reward, plan = planner.search(max_time=parameters.timelimit)

    pickle.dump({"search_time_to_reward": search_time_to_reward,
                 'plan': plan,
                 'n_nodes': len(planner.tree.get_discrete_nodes())},
                open(save_dir + filename, 'wb'))
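# The pickle written at the end of main() can be inspected offline. A minimal sketch of
# reading it back; the function name and path argument are placeholders, and the keys are
# exactly the ones dumped above.
import pickle

def load_planning_result(result_path):
    with open(result_path, 'rb') as f:
        result = pickle.load(f)
    # 'search_time_to_reward' traces search progress over time, 'plan' is the solution
    # (if one was found), and 'n_nodes' is the number of discrete nodes in the tree.
    return result['search_time_to_reward'], result['plan'], result['n_nodes']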
def main():
    commit_hash = get_commit_hash()
    parameters = parse_mover_problem_parameters()
    filename = 'pidx_%d_planner_seed_%d.pkl' % (parameters.pidx, parameters.planner_seed)
    save_dir = make_and_get_save_dir(parameters, filename, commit_hash)
    solution_file_name = save_dir + filename

    # Skip problems that have already been solved, unless forced to re-solve.
    is_problem_solved_before = os.path.isfile(solution_file_name)
    print solution_file_name
    if is_problem_solved_before and not parameters.f:
        print "***************Already solved********************"
        with open(solution_file_name, 'rb') as f:
            trajectory = pickle.load(f)
        tottime = trajectory['search_time_to_reward'][-1][2]
        print 'Time: %.2f ' % tottime
        sys.exit(-1)

    set_seed(parameters.pidx)
    problem_env = PaPMoverEnv(parameters.pidx)

    goal_objs = [
        'square_packing_box1',
        'square_packing_box2',
        'rectangular_packing_box3',
        'rectangular_packing_box4'
    ]
    goal_region = 'home_region'
    problem_env.set_goal(goal_objs, goal_region)
    goal_entities = goal_objs + [goal_region]

    if parameters.use_shaped_reward:
        # Uses potential-based reward shaping in the style of Ng et al.
        reward_function = ShapedRewardFunction(problem_env, goal_objs, goal_region,
                                               parameters.planning_horizon)
    else:
        reward_function = GenericRewardFunction(problem_env, goal_objs, goal_region,
                                                parameters.planning_horizon)

    motion_planner = OperatorBaseMotionPlanner(problem_env, 'prm')
    problem_env.set_reward_function(reward_function)
    problem_env.set_motion_planner(motion_planner)

    learned_q = None
    prior_q = None
    if parameters.use_learned_q:
        learned_q = load_learned_q(parameters, problem_env)

    # Heuristic value function: the negative number of objects that still need to be moved.
    v_fcn = lambda state: -len(get_objects_to_move(state, problem_env))

    if parameters.planner == 'mcts':
        planner = MCTS(parameters, problem_env, goal_entities, prior_q, learned_q)
    elif parameters.planner == 'mcts_with_leaf_strategy':
        planner = MCTSWithLeafStrategy(parameters, problem_env, goal_entities, v_fcn, learned_q)
    else:
        raise NotImplementedError

    set_seed(parameters.planner_seed)
    stime = time.time()
    search_time_to_reward, n_feasibility_checks, plan = planner.search(max_time=parameters.timelimit)
    tottime = time.time() - stime
    print 'Time: %.2f ' % tottime

    # todo: save the entire tree
    pickle.dump(
        {
            "search_time_to_reward": search_time_to_reward,
            'plan': plan,
            'commit_hash': commit_hash,
            'n_feasibility_checks': n_feasibility_checks,
            'n_nodes': len(planner.tree.get_discrete_nodes())
        },
        open(save_dir + filename, 'wb'))
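# The "already solved" branch above assumes every entry of search_time_to_reward stores
# the elapsed time at index 2. A small helper that reuses that convention to summarize a
# saved run; the helper name is hypothetical, and the dictionary keys come from the dump
# at the end of main() above.
def summarize_trajectory(trajectory):
    total_time = trajectory['search_time_to_reward'][-1][2]
    print 'Time: %.2f, feasibility checks: %d, nodes: %d' % (
        total_time, trajectory['n_feasibility_checks'], trajectory['n_nodes'])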