def test_planner(tiger_problem, planner, nsteps=3):
    """
    Runs the action-feedback loop of Tiger problem POMDP

    Args:
        tiger_problem (TigerProblem): an instance of the tiger problem.
        planner (Planner): a planner
        nsteps (int): Maximum number of steps to run this loop.
    """
    for i in range(nsteps):
        action = planner.plan(tiger_problem.agent)
        print("==== Step %d ====" % (i+1))
        print("True state: %s" % tiger_problem.env.state)
        print("Belief: %s" % str(tiger_problem.agent.cur_belief))
        print("Action: %s" % str(action))
        print("Reward: %s" % str(tiger_problem.env.reward_model.sample(
            tiger_problem.env.state, action, None)))

        # Let's create some simulated real observation; update the belief.
        # Here the observation is created from the true state for sanity
        # checking solver behavior. In general, this observation should be
        # sampled from the agent's observation model.
        real_observation = Observation(tiger_problem.env.state.name)
        print(">> Observation: %s" % real_observation)
        tiger_problem.agent.update_history(action, real_observation)

        planner.update(tiger_problem.agent, action, real_observation)
        if isinstance(planner, pomdp_py.POUCT):
            print("Num sims: %d" % planner.last_num_sims)
            print("Plan time: %.5f" % planner.last_planning_time)

        if isinstance(tiger_problem.agent.cur_belief, pomdp_py.Histogram):
            new_belief = pomdp_py.update_histogram_belief(
                tiger_problem.agent.cur_belief,
                action, real_observation,
                tiger_problem.agent.observation_model,
                tiger_problem.agent.transition_model)
            tiger_problem.agent.set_belief(new_belief)
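
# Usage sketch (illustrative, not from the original file). The TigerProblem /
# TigerState constructors are assumed to follow the pomdp_py tiger example,
# and pomdp_py.ValueIteration's arguments should be checked against your
# installed pomdp_py version.
#
#   init_true_state = random.choice([TigerState("tiger-left"),
#                                    TigerState("tiger-right")])
#   init_belief = pomdp_py.Histogram({TigerState("tiger-left"): 0.5,
#                                     TigerState("tiger-right"): 0.5})
#   tiger_problem = TigerProblem(0.15, init_true_state, init_belief)
#   vi = pomdp_py.ValueIteration(horizon=3, discount_factor=0.95)
#   test_planner(tiger_problem, vi, nsteps=3)
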
def belief_update(agent, real_action, real_observation, next_robot_state, planner):
    """Updates the agent's belief; the belief update may happen through the
    planner update (e.g. when the planner is POMCP)."""
    # Updates the planner; in the case of POMCP, the agent's belief is also updated.
    planner.update(agent, real_action, real_observation)

    # Update the agent's belief when the planner is not POMCP
    if not isinstance(planner, pomdp_py.POMCP):
        # Update belief for every object
        for objid in agent.cur_belief.object_beliefs:
            belief_obj = agent.cur_belief.object_belief(objid)
            if isinstance(belief_obj, pomdp_py.Histogram):
                if objid == agent.robot_id:
                    # Assuming the agent can observe its own state:
                    new_belief = pomdp_py.Histogram({next_robot_state: 1.0})
                else:
                    # This is doing
                    #    B(si') = normalizer * O(oi|si',sr',a) * sum_s T(si'|s,a)*B(si)
                    #
                    # Notes: First, objects are static. Second,
                    # O(oi|s',a) ~= O(oi|si',sr',a) according to the definition
                    # of the observation model in models/observation.py. Note
                    # that the belief update rule implemented for this OOPOMDP
                    # has to use a model like O(oi|si',sr',a) because it's
                    # intractable to consider the full s' (that would mean
                    # iterating over all combinations of all object states).
                    # Of course, there could be a workaround (out of scope):
                    # consider a volumetric observation instead of the
                    # object-pose observation. That means oi is a set of pixels
                    # (2D) or voxels (3D). Note that the real observation, oi,
                    # is most likely sampled from O(oi|s',a) because the real
                    # world accounts for occlusion between objects (due to the
                    # full state s'). The problem is how to compute the
                    # probability of this oi given s' and a, where it's
                    # intractable to obtain s'. To this end, we can make the
                    # simplifying assumption that an object is contained within
                    # one pixel (or voxel); the pixel (or voxel) is labeled to
                    # indicate free space or object. The label of each pixel or
                    # voxel is certainly a result of considering the full state
                    # s. Occlusion can be handled nicely with the volumetric
                    # observation definition. That assumption reduces the
                    # observation model from O(oi|s',a) to O(label_i|s',a), and
                    # it becomes easy to define O(label_i=i|s',a) and
                    # O(label_i=FREE|s',a). These ideas are used in my recent
                    # 3D object search work.
                    new_belief = pomdp_py.update_histogram_belief(
                        belief_obj,
                        real_action,
                        real_observation.for_obj(objid),
                        agent.observation_model[objid],
                        agent.transition_model[objid],
                        # The agent knows the objects are static.
                        static_transition=objid != agent.robot_id,
                        oargs={"next_robot_state": next_robot_state})
            else:
                raise ValueError("Unexpected program state. "
                                 "Are you using the appropriate belief representation?")

            agent.cur_belief.set_object_belief(objid, new_belief)
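
# A minimal, self-contained sketch of the factored update described in the
# comment above, for a single *static* object with a histogram belief. The
# names here (obs_prob, oi, sr_next) are illustrative and not part of the
# example; pomdp_py.update_histogram_belief performs this computation
# (including the transition sum) for you.
def _factored_static_object_update(belief_hist, obs_prob, oi, sr_next, a):
    """belief_hist: dict mapping object state si -> B(si).
    obs_prob: callable (oi, si_next, sr_next, a) -> O(oi | si', sr', a)."""
    new_hist = {}
    for si in belief_hist:
        # Static object: T(si'|s,a) is the identity, so
        # sum_s T(si'|s,a) * B(si) collapses to B(si).
        new_hist[si] = obs_prob(oi, si, sr_next, a) * belief_hist[si]
    total = sum(new_hist.values())
    if total > 0:
        # Apply the normalizer.
        new_hist = {si: p / total for si, p in new_hist.items()}
    return new_hist
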
def test_vi_pruning(pomdp_solve_path, return_policy_graph=True):
    print("[testing] test_vi_pruning")
    tiger = make_tiger()

    # Building a policy graph
    print("[testing] solving the tiger problem...")
    policy = vi_pruning(tiger.agent, pomdp_solve_path,
                        discount_factor=0.95,
                        options=["-horizon", "100"],
                        remove_generated_files=True,
                        return_policy_graph=return_policy_graph)
    assert str(policy.plan(tiger.agent)) == "listen",\
        "Bad solution. Test failed."

    # Plan with the policy for several steps, so we should get high rewards
    # eventually in the tiger domain.
    got_high_reward = False
    for step in range(10):
        true_state = tiger.env.state
        action = policy.plan(tiger.agent)
        observation = tiger.agent.observation_model.sample(true_state, action)
        reward = tiger.env.reward_model.sample(true_state, action, None)
        print("[testing] simulating computed policy graph "
              "(step=%d, action=%s, observation=%s, reward=%d)"
              % (step, action, observation, reward))

        if return_policy_graph:
            # We use a policy graph: no belief update is needed, just
            # update the policy graph.
            policy.update(tiger.agent, action, observation)
        else:
            # Alpha-vector policy: a belief update is needed.
            new_belief = pomdp_py.update_histogram_belief(
                tiger.agent.cur_belief, action, observation,
                tiger.agent.observation_model,
                tiger.agent.transition_model)
            tiger.agent.set_belief(new_belief)

        assert reward == -1 or reward == 10, "Unexpected reward. Failed."
        if reward == 10:
            got_high_reward = True

    assert got_high_reward, "Should have gotten high reward. Failed."
    print("Pass.")
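
# Usage sketch (illustrative): vi_pruning requires a locally built
# `pomdp-solve` binary; the path below is a placeholder.
#
#   test_vi_pruning("/path/to/pomdp-solve", return_policy_graph=True)   # policy graph
#   test_vi_pruning("/path/to/pomdp-solve", return_policy_graph=False)  # alpha-vector policy
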
def belief_update(agent, real_action, real_observation):
    # Update agent belief
    current_mpe_state = agent.cur_belief.mpe()
    next_robot_position = agent.transition_model.sample(
        current_mpe_state, real_action).robot_position

    next_state_space = set()
    for state in agent.cur_belief:
        next_state = copy.deepcopy(state)
        next_state.robot_position = next_robot_position
        next_state_space.add(next_state)

    new_belief = pomdp_py.update_histogram_belief(
        agent.cur_belief, real_action, real_observation,
        agent.observation_model, agent.transition_model,
        next_state_space=next_state_space)

    agent.set_belief(new_belief)
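
# Where this belief update fits in an action-feedback loop (illustrative
# sketch; assumes an env/agent/planner set up as in the loops above and
# states carrying a `robot_position` attribute):
#
#   action = planner.plan(agent)
#   reward = env.state_transition(action, execute=True)
#   observation = env.provide_observation(agent.observation_model, action)
#   agent.update_history(action, observation)
#   belief_update(agent, action, observation)
#   planner.update(agent, action, observation)
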
def test_sarsop(pomdpsol_path):
    print("[testing] test_sarsop")
    tiger = make_tiger()

    # Compute a policy with SARSOP
    print("[testing] solving the tiger problem...")
    policy = sarsop(tiger.agent, pomdpsol_path,
                    discount_factor=0.95,
                    timeout=10, memory=20, precision=0.000001,
                    remove_generated_files=True,
                    logfile="test_sarsop.log")
    assert str(policy.plan(tiger.agent)) == "listen",\
        "Bad solution. Test failed."
    assert os.path.exists("test_sarsop.log")
    os.remove("test_sarsop.log")

    # Plan with the policy for several steps, so we should get high rewards
    # eventually in the tiger domain.
    got_high_reward = False
    for step in range(10):
        true_state = tiger.env.state
        action = policy.plan(tiger.agent)
        observation = tiger.agent.observation_model.sample(true_state, action)
        reward = tiger.env.reward_model.sample(true_state, action, None)
        print("[testing] running computed policy "
              "(step=%d, action=%s, observation=%s, reward=%d)"
              % (step, action, observation, reward))

        # belief update
        new_belief = pomdp_py.update_histogram_belief(
            tiger.agent.cur_belief, action, observation,
            tiger.agent.observation_model,
            tiger.agent.transition_model)
        tiger.agent.set_belief(new_belief)

        assert reward == -1 or reward == 10, "Unexpected reward. Failed."
        if reward == 10:
            got_high_reward = True

    assert got_high_reward, "Should have gotten high reward. Failed."
    print("Pass.")
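
# Usage sketch (illustrative): sarsop requires a locally built `pomdpsol`
# binary from the SARSOP/APPL toolkit; the path below is a placeholder.
#
#   test_sarsop("/path/to/appl/src/pomdpsol")
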
def test_planner(tiger_problem, planner, nsteps=3, debug_tree=False):
    """
    Runs the action-feedback loop of Tiger problem POMDP

    Args:
        tiger_problem (TigerProblem): a problem instance
        planner (Planner): a planner
        nsteps (int): Maximum number of steps to run this loop.
        debug_tree (bool): True to drop into pdb with a TreeDebugger
            available as the 'dd' variable.
    """
    for i in range(nsteps):
        action = planner.plan(tiger_problem.agent)
        if debug_tree:
            from pomdp_py.utils import TreeDebugger
            dd = TreeDebugger(tiger_problem.agent.tree)
            import pdb
            pdb.set_trace()

        print("==== Step %d ====" % (i + 1))
        print("True state:", tiger_problem.env.state)
        print("Belief:", tiger_problem.agent.cur_belief)
        print("Action:", action)
        # There is no state transition for the tiger domain.
        # In general, the environment state can be transitioned using
        #
        #     reward = tiger_problem.env.state_transition(action, execute=True)
        #
        # Or, it is possible that you don't have control over the
        # environment change (e.g. a robot acting in the real world);
        # in that case, you could skip the state transition and
        # re-estimate the state (e.g. through the perception stack on
        # the robot).
        reward = tiger_problem.env.reward_model.sample(
            tiger_problem.env.state, action, None)
        print("Reward:", reward)

        # Let's create some simulated real observation.
        # Here, we use an observation based on the true state for sanity
        # checking solver behavior. In general, this observation
        # should be sampled from the agent's observation model, as
        #
        #     real_observation = tiger_problem.agent.observation_model.sample(
        #         tiger_problem.env.state, action)
        #
        # or come from an external source (e.g. a robot sensor
        # reading). Note that tiger_problem.env.state stores the
        # environment state after action execution.
        real_observation = TigerObservation(tiger_problem.env.state.name)
        print(">> Observation:", real_observation)
        tiger_problem.agent.update_history(action, real_observation)

        # Update the belief. If the planner is POMCP, planner.update
        # also automatically updates agent belief.
        planner.update(tiger_problem.agent, action, real_observation)
        if isinstance(planner, pomdp_py.POUCT):
            print("Num sims:", planner.last_num_sims)
            print("Plan time: %.5f" % planner.last_planning_time)

        if isinstance(tiger_problem.agent.cur_belief, pomdp_py.Histogram):
            new_belief = pomdp_py.update_histogram_belief(
                tiger_problem.agent.cur_belief,
                action, real_observation,
                tiger_problem.agent.observation_model,
                tiger_problem.agent.transition_model)
            tiger_problem.agent.set_belief(new_belief)

        if action.name.startswith("open"):
            # Print a blank line so it's easier to see which actions
            # were taken before each time the door is opened.
            print("\n")
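
# Driver sketch for the loop above (illustrative, not from the original file),
# with `tiger_problem` and `init_belief` constructed as in the earlier sketch.
# The POUCT/POMCP constructor arguments and Particles.from_histogram follow the
# pomdp_py tiger example and should be verified against your installed
# pomdp_py version.
#
#   pouct = pomdp_py.POUCT(max_depth=3, discount_factor=0.95, num_sims=4096,
#                          exploration_const=50,
#                          rollout_policy=tiger_problem.agent.policy_model)
#   test_planner(tiger_problem, pouct, nsteps=10)
#
#   # POMCP keeps a particle belief, which planner.update maintains for you.
#   tiger_problem.agent.set_belief(
#       pomdp_py.Particles.from_histogram(init_belief, num_particles=100),
#       prior=True)
#   pomcp = pomdp_py.POMCP(max_depth=3, discount_factor=0.95, num_sims=1000,
#                          exploration_const=50,
#                          rollout_policy=tiger_problem.agent.policy_model)
#   test_planner(tiger_problem, pomcp, nsteps=10)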