def test_corridor_switch_no_clash_possible(solver_describer: SolverDescriber):
    grid = MapfGrid(['...',
                     '@.@'])
    agents_starts = ((0, 0), (0, 2))
    agents_goals = ((0, 2), (0, 0))

    # These parameters ensure the solver avoids collisions regardless of reward efficiency
    env = MapfEnv(grid, 2, agents_starts, agents_goals, 0.1, 0.1, -0.001, 0, -1)

    info = {}
    policy = solver_describer.func(env, info)

    # Assert no conflict is possible
    interesting_state = env.locations_to_state(((1, 1), (0, 1)))
    expected_possible_actions = [vector_action_to_integer((STAY, UP)),
                                 vector_action_to_integer((DOWN, UP))]

    assert policy.act(interesting_state) in expected_possible_actions

    # Check the policy performance
    reward, clashed, _ = evaluate_policy(policy, 100, 100)

    # Make sure no clash happened
    assert not clashed

    # Assert the reward is reasonable
    assert reward >= 100.0 * env.reward_of_living
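
# A minimal sketch of what `evaluate_policy` above is assumed to do: roll the
# policy out for a number of episodes with a step cap and report the mean
# reward plus whether any episode ended in a clash. The signature, the third
# return value, and the clash-detection rule are assumptions inferred from the
# call site, not the project's actual implementation.
def evaluate_policy_sketch(policy, n_episodes, max_steps):
    episode_rewards = []
    clashed = False
    for _ in range(n_episodes):
        s = policy.env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            s, r, done, _ = policy.env.step(policy.act(s))
            episode_reward += r
            if done:
                # Assumption: a clash shows up as receiving the clash reward
                clashed = clashed or r <= policy.env.reward_of_clash
                break
        episode_rewards.append(episode_reward)
    mean_reward = sum(episode_rewards) / n_episodes
    return mean_reward, clashed, episode_rewards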
def test_copy_mapf_env(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....',
                     '....'])
    env = MapfEnv(grid, 1, ((0, 0), ), ((4, 0), ), 0, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    # The test passes as long as copying a stepped env and stepping the copy
    # do not raise
    env.step(vector_action_to_integer((RIGHT, )))
    env_copy = copy(env)
    env_copy.step(vector_action_to_integer((RIGHT, )))
def test_reward_single_agent_makespan(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0), )
    goal_locations = ((4, 0), )
    deterministic_env = MapfEnv(grid, 1, start_locations, goal_locations, 0,
                                REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                                OptimizationCriteria.Makespan)

    total_reward = 0
    down_action = vector_action_to_integer((DOWN, ))

    # Three steps down towards the goal at (4, 0)
    _, r, _, _ = deterministic_env.step(down_action)
    total_reward += r
    _, r, _, _ = deterministic_env.step(down_action)
    total_reward += r
    _, r, _, _ = deterministic_env.step(down_action)
    total_reward += r

    # The fourth step reaches the goal
    s, r, done, _ = deterministic_env.step(down_action)
    total_reward += r
    self.assertEqual(s, deterministic_env.locations_to_state(goal_locations))
    self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_GOAL)
    self.assertEqual(total_reward, REWARD_OF_GOAL + 4 * REWARD_OF_LIVING)
def test_similar_transitions_probability_summed(self):
    grid = MapfGrid(['..',
                     '..'])
    env = MapfEnv(grid, 1, ((0, 0), ), ((1, 1), ), 0.1, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    # All noisy outcomes of STAY lead to the same next state, so their
    # probabilities are summed into a single transition with probability 1
    a = vector_action_to_integer((STAY, ))
    self.assertEqual(env.P[env.s][a],
                     [((1, False), env.s, REWARD_OF_LIVING, False)])
def test_reward_multiagent_soc_stay_actions(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0), (3, 3), (1, 1))
    goal_locations = ((0, 1), (1, 3), (1, 2))
    deterministic_env = MapfEnv(grid, 3, start_locations, goal_locations, 0,
                                REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                                OptimizationCriteria.SoC)

    # Under SoC every agent pays the living reward each step, even when it
    # stays in place, so a single step with 3 agents costs 3 * REWARD_OF_LIVING
    right_stay_stay = vector_action_to_integer((RIGHT, STAY, STAY))
    s, r, done, _ = deterministic_env.step(right_stay_stay)
    self.assertEqual(r, -3)
def test_reward_multiagent_makespan(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0), (3, 3), (1, 1))
    goal_locations = ((0, 1), (1, 3), (1, 2))
    deterministic_env = MapfEnv(grid, 3, start_locations, goal_locations, 0,
                                REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                                OptimizationCriteria.Makespan)

    total_reward = 0
    right_up_right = vector_action_to_integer((RIGHT, UP, RIGHT))
    s, r, done, _ = deterministic_env.step(right_up_right)
    total_reward += r
    self.assertFalse(done)

    stay_up_stay = vector_action_to_integer((STAY, UP, STAY))
    s, r, done, _ = deterministic_env.step(stay_up_stay)
    total_reward += r
    self.assertEqual(s, deterministic_env.locations_to_state(goal_locations))
    self.assertTrue(done)
    self.assertEqual(total_reward, 2 * REWARD_OF_LIVING + REWARD_OF_GOAL)
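
# The contrast between the two reward tests above: under
# OptimizationCriteria.SoC a single step with 3 agents accumulates
# 3 * REWARD_OF_LIVING, while under Makespan each step costs REWARD_OF_LIVING
# once regardless of the number of agents, so the two steps to the goal here
# total 2 * REWARD_OF_LIVING + REWARD_OF_GOAL.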
def test_action_from_terminal_state_has_no_effect(self):
    grid = MapfGrid(['..',
                     '..'])
    env = MapfEnv(grid, 1, ((0, 0), ), ((1, 1), ), 0, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    state, reward, done, _ = env.step(vector_action_to_integer((RIGHT, )))
    self.assertEqual(reward, REWARD_OF_LIVING)
    self.assertEqual(done, False)

    state, reward, done, _ = env.step(vector_action_to_integer((DOWN, )))
    self.assertEqual(reward, REWARD_OF_LIVING + REWARD_OF_GOAL)
    self.assertEqual(done, True)

    # Now, after the game is finished - do another step and make sure it has no effect
    state_after_done, reward_after_done, done_after_done, _ = env.step(
        vector_action_to_integer((UP, )))
    self.assertEqual(state_after_done, state)
    self.assertEqual(done_after_done, True)
    self.assertEqual(reward_after_done, 0)

    # Try again with an action that would normally move towards the goal
    state_after_done, reward_after_done, done_after_done, _ = env.step(
        vector_action_to_integer((DOWN, )))
    self.assertEqual(state_after_done, state)
    self.assertEqual(done_after_done, True)
    self.assertEqual(reward_after_done, 0)
def act(self, joint_state):
    if joint_state in self.policy_cache:
        return self.policy_cache[joint_state]

    joint_action = ()
    forbidden_states = set()
    for agent in range(self.env.n_agents):
        # TODO: the problem is that the best response is according to the joint state
        #  even though we are in state s.
        # TODO: we shouldn't actually step in this part...
        local_action = best_response(self, joint_state, agent, forbidden_states, False)
        joint_action = joint_action + (ACTIONS[local_action], )

    best_action = vector_action_to_integer(joint_action)
    self.policy_cache[joint_state] = best_action
    return best_action
def test_switch_spots_is_a_collision(self):
    grid = MapfGrid(['..'])
    agents_starts = ((0, 0), (0, 1))
    agents_goals = ((0, 1), (0, 0))

    deterministic_env = MapfEnv(grid, 2, agents_starts, agents_goals, 0,
                                REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                                OptimizationCriteria.Makespan)

    s, r, done, _ = deterministic_env.step(vector_action_to_integer((RIGHT, LEFT)))

    # Assert the game terminated in a collision
    self.assertEqual(done, True)
    self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_CLASH)
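
# A minimal sketch of the swap check this test exercises, assuming collisions
# are detected by comparing previous and next joint locations; the helper name
# `is_swap_collision` is hypothetical and not part of MapfEnv's API.
def is_swap_collision(prev_locations, next_locations, i, j):
    # Agents i and j collide by swapping if each ends up where the other started
    return (prev_locations[i] == next_locations[j]
            and prev_locations[j] == next_locations[i])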
def test_roni_scenario_with_id(self):
    # TODO: this test only passes when the first action in the ACTIONS array is STAY,
    #  fix it to work without the cheating
    grid = MapfGrid(['.@.',
                     '.@.',
                     '...'])
    agents_starts = ((0, 0), (0, 2))
    agents_goals = ((2, 0), (2, 2))

    env = MapfEnv(grid, 2, agents_starts, agents_goals, 0.1, 0.01, -1, 1, -0.1)
    independent_joint_policy = solve_independently_and_cross(
        env, [[0], [1]], partial(value_iteration, 1.0), {})

    interesting_state = env.locations_to_state(((0, 0), (0, 2)))

    # Assert independent_joint_policy just chooses the most efficient action
    self.assertEqual(independent_joint_policy.act(interesting_state),
                     vector_action_to_integer((DOWN, DOWN)))

    # Assert no conflict
    self.assertEqual(detect_conflict(env, independent_joint_policy), None)
def test_colliding_agents_state_is_terminal_and_negative_reward(self):
    map_file_path = os.path.abspath(
        os.path.join(__file__, MAPS_DIR, 'empty-8-8/empty-8-8.map'))
    grid = MapfGrid(parse_map_file(map_file_path))

    # Agents start two cells apart on the top row, so (RIGHT, LEFT) can send
    # both of them into (0, 1)
    agent_starts = ((0, 0), (0, 2))
    agents_goals = ((7, 7), (5, 5))

    env = MapfEnv(grid, 2, agent_starts, agents_goals, FAIL_PROB,
                  REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                  OptimizationCriteria.Makespan)

    transitions = [((round(prob, 2), collision), next_state, reward, done)
                   for ((prob, collision), next_state, reward, done)
                   in env.P[env.s][vector_action_to_integer((RIGHT, LEFT))]]

    self.assertIn(
        ((0.64, True),
         env.locations_to_state(((0, 1), (0, 1))),
         REWARD_OF_LIVING + REWARD_OF_CLASH,
         True),
        set(transitions))
def get_q(self, agent, joint_state, local_action):
    if joint_state in self.q_partial_table[agent]:
        if local_action in self.q_partial_table[agent][joint_state]:
            return self.q_partial_table[agent][joint_state][local_action]

    # Build the joint action where all agents STAY except `agent`, which
    # performs the given local action
    all_stay = (STAY, ) * self.env.n_agents
    joint_action_vector = all_stay[:agent] + (ACTIONS[local_action], ) + all_stay[agent + 1:]
    joint_action = vector_action_to_integer(joint_action_vector)

    # Compute Q[s][a]. In case of a possible clash set the reward to -infinity
    q_value = 0
    for prob, next_state, reward, done in self.env.P[joint_state][joint_action]:
        if reward == self.env.reward_of_clash and done:
            q_value = -math.inf

        q_value += prob * (reward + (self.gamma * self.v[next_state]))

    self.q_partial_table[agent][joint_state][local_action] = q_value
    return self.q_partial_table[agent][joint_state][local_action]
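
# A minimal sketch of how `best_response` (used throughout this file) might be
# built on top of get_q: evaluate every local action for the given agent and
# pick the argmax. The real `best_response` also receives a set of forbidden
# states, which this sketch ignores; its exact semantics are an assumption.
def best_response_sketch(policy, joint_state, agent):
    best_action, best_value = None, -math.inf
    for local_action in range(len(ACTIONS)):
        q_value = policy.get_q(agent, joint_state, local_action)
        if q_value > best_value:
            best_action, best_value = local_action, q_value
    return best_action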
def test_vector_action_to_integer(self):
    self.assertEqual((DOWN, UP),
                     integer_action_to_vector(vector_action_to_integer((DOWN, UP)), 2))
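
# A minimal sketch of the round trip the test above relies on, assuming the
# joint action is encoded positionally in base len(ACTIONS); the actual
# encoding used by vector_action_to_integer may differ.
def vector_action_to_integer_sketch(action_vector):
    integer = 0
    for local_action in action_vector:
        integer = integer * len(ACTIONS) + ACTIONS.index(local_action)
    return integer

def integer_action_to_vector_sketch(integer, n_agents):
    vector = []
    for _ in range(n_agents):
        integer, index = divmod(integer, len(ACTIONS))
        vector.append(ACTIONS[index])
    return tuple(reversed(vector))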
def multi_agent_turn_based_rtdp_single_iteration(policy: MultiagentRtdpPolicy, info: Dict):
    s = policy.env.reset()
    done = False
    start = time.time()
    path = []
    total_reward = 0

    steps = 0
    while not done and steps < 1000:
        steps += 1
        trajectory_actions = []
        forbidden_states = set()
        joint_action_vector = (STAY, ) * policy.env.n_agents

        # Calculate a local action for each agent in turn
        for agent in range(policy.env.n_agents):
            local_action = best_response(policy, s, agent, forbidden_states, False)
            trajectory_actions.append(local_action)
            joint_action_vector = joint_action_vector[:agent] + (
                ACTIONS[local_action], ) + joint_action_vector[agent + 1:]

        # Compose the joint action
        joint_action = vector_action_to_integer(joint_action_vector)
        path.append((s, joint_action))

        # Update the value and Q-tables for the current state
        for agent in reversed(range(policy.env.n_agents)):
            # Update q(s, agent, action) based on the last state
            policy.v_update(s)
            policy.q_update(agent, s, trajectory_actions[agent], joint_action)

        policy.visited_states[s] = policy.visited_states[s] + 1

        # Step
        s, r, done, _ = policy.env.step(joint_action)
        total_reward += r

    # # Backward update
    # while path:
    #     s, joint_action = path.pop()
    #     policy.v_update(s)
    #     joint_action_vector = integer_action_to_vector(joint_action, policy.env.n_agents)
    #     for agent in reversed(range(policy.env.n_agents)):
    #         local_action = vector_action_to_integer((joint_action_vector[agent],))
    #         policy.q_update(agent, s, local_action, joint_action)

    return total_reward
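
# A minimal sketch of an outer training loop around the single-iteration
# function above: repeat trajectories until the return stabilizes. The
# iteration budget and the convergence test are assumptions, not the project's
# actual stopping criterion.
def multi_agent_turn_based_rtdp_sketch(policy, info, max_iterations=500):
    previous_reward = -math.inf
    for _ in range(max_iterations):
        total_reward = multi_agent_turn_based_rtdp_single_iteration(policy, info)
        # Assumed convergence test: stop once consecutive iterations yield
        # nearly identical returns
        if abs(total_reward - previous_reward) < 1e-3:
            break
        previous_reward = total_reward
    return policy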
def test_couple_detect_conflict_3_agents_multiple_agents_in_group(self):
    """Test detecting a conflict for a specific couple of agents.

    The test makes sure that agent 0 has no conflicts with agents 1 and 2,
    while agents 1 and 2 do have a conflict.

    Agent 1 is part of a group which contains both agents 0 and 1 ([0, 1]).
    This way agent 1's index inside its group is 1 rather than 0. This case
    catches a bug I had previously.
    """
    grid = MapfGrid(['...',
                     '...',
                     '...'])
    agents_starts = ((0, 0), (2, 0), (2, 2))
    agents_goals = ((0, 2), (2, 2), (2, 0))

    env = MapfEnv(grid, 3, agents_starts, agents_goals, 0, 0, -1, 1, -0.01)
    single_agent_env = MapfEnv(grid, 1, (agents_starts[0], ), (agents_goals[0], ),
                               0, 0, -1, 1, -0.01)
    env01 = get_local_view(env, [0, 1])

    # >>S
    # SSS
    # SSS
    policy0 = {
        0: ACTIONS.index(RIGHT),
        1: ACTIONS.index(STAY),
        2: ACTIONS.index(STAY),
        3: ACTIONS.index(RIGHT),
        4: ACTIONS.index(STAY),
        5: ACTIONS.index(STAY),
        6: ACTIONS.index(STAY),
        7: ACTIONS.index(STAY),
        8: ACTIONS.index(STAY),
    }

    # SSS
    # SSS
    # >>S
    policy1 = {
        0: ACTIONS.index(STAY),
        1: ACTIONS.index(STAY),
        2: ACTIONS.index(RIGHT),
        3: ACTIONS.index(STAY),
        4: ACTIONS.index(STAY),
        5: ACTIONS.index(RIGHT),
        6: ACTIONS.index(STAY),
        7: ACTIONS.index(STAY),
        8: ACTIONS.index(STAY),
    }

    # policy01 is a cross between agent 0 and agent 1
    policy01 = {}
    for s0 in range(9):
        for s1 in range(9):
            joint_state = env01.locations_to_state(
                (single_agent_env.state_to_locations(s0)[0],
                 single_agent_env.state_to_locations(s1)[0]))
            policy01[joint_state] = vector_action_to_integer(
                (integer_action_to_vector(policy0[s0], 1)[0],
                 integer_action_to_vector(policy1[s1], 1)[0]))

    # SSS
    # SSS
    # S<<
    policy2 = {
        0: ACTIONS.index(STAY),
        1: ACTIONS.index(STAY),
        2: ACTIONS.index(STAY),
        3: ACTIONS.index(STAY),
        4: ACTIONS.index(STAY),
        5: ACTIONS.index(LEFT),
        6: ACTIONS.index(STAY),
        7: ACTIONS.index(STAY),
        8: ACTIONS.index(LEFT),
    }

    joint_policy = CrossedPolicy(env,
                                 [DictPolicy(env01, 1.0, policy01),
                                  DictPolicy(get_local_view(env, [2]), 1.0, policy2)],
                                 [[0, 1], [2]])

    aux_local_env = get_local_view(env, [0])

    # Assert a conflict is found for agents 1 and 2
    self.assertEqual(
        couple_detect_conflict(env, joint_policy, 2, 1),
        ((2,
          aux_local_env.locations_to_state(((2, 2), )),
          aux_local_env.locations_to_state(((2, 1), ))),
         (1,
          aux_local_env.locations_to_state(((2, 0), )),
          aux_local_env.locations_to_state(((2, 1), )))))

    # Assert no conflict is found for agents 0 and 1
    self.assertIsNone(couple_detect_conflict(env, joint_policy, 0, 1))

    # Assert no conflict is found for agents 0 and 2
    self.assertIsNone(couple_detect_conflict(env, joint_policy, 0, 2))
def test_transition_function_empty_grid(self):
    """Assert the basic steps are done right.

    * Define an empty 8x8 environment with two agents starting at (0,0),(7,7)
      and desiring to reach (0,2),(5,7).
    * Perform one (RIGHT, UP) step and assert that the transitions are correct.
    * Perform another (RIGHT, UP) step from the most probable next state from
      before ((0,1), (6,7)) and assert that the transitions are correct again,
      including the terminal one.
    """
    map_file_path = os.path.abspath(
        os.path.join(__file__, MAPS_DIR, 'empty-8-8/empty-8-8.map'))
    grid = MapfGrid(parse_map_file(map_file_path))

    # Agents start at opposite corners of the grid
    agent_starts = ((0, 0), (7, 7))
    agents_goals = ((0, 2), (5, 7))

    env = MapfEnv(grid, 2, agent_starts, agents_goals, FAIL_PROB,
                  REWARD_OF_CLASH, REWARD_OF_GOAL, REWARD_OF_LIVING,
                  OptimizationCriteria.Makespan)

    first_step_transitions = [((round(prob, 2), collision), next_state, reward, done)
                              for ((prob, collision), next_state, reward, done)
                              in env.P[env.s][vector_action_to_integer((RIGHT, UP))]]

    self.assertEqual(set(first_step_transitions), {
        ((0.64, False), env.locations_to_state(((0, 1), (6, 7))), REWARD_OF_LIVING, False),  # (RIGHT, UP)
        ((0.08, False), env.locations_to_state(((1, 0), (6, 7))), REWARD_OF_LIVING, False),  # (DOWN, UP)
        ((0.08, False), env.locations_to_state(((0, 0), (6, 7))), REWARD_OF_LIVING, False),  # (UP, UP)
        ((0.08, False), env.locations_to_state(((0, 1), (7, 7))), REWARD_OF_LIVING, False),  # (RIGHT, RIGHT)
        ((0.08, False), env.locations_to_state(((0, 1), (7, 6))), REWARD_OF_LIVING, False),  # (RIGHT, LEFT)
        ((0.01, False), env.locations_to_state(((1, 0), (7, 7))), REWARD_OF_LIVING, False),  # (DOWN, RIGHT)
        ((0.01, False), env.locations_to_state(((1, 0), (7, 6))), REWARD_OF_LIVING, False),  # (DOWN, LEFT)
        ((0.01, False), env.locations_to_state(((0, 0), (7, 7))), REWARD_OF_LIVING, False),  # (UP, RIGHT)
        ((0.01, False), env.locations_to_state(((0, 0), (7, 6))), REWARD_OF_LIVING, False),  # (UP, LEFT)
    })

    wish_state = env.locations_to_state(((0, 1), (6, 7)))
    second_step_transitions = [((round(prob, 2), collision), next_state, reward, done)
                               for ((prob, collision), next_state, reward, done)
                               in env.P[wish_state][vector_action_to_integer((RIGHT, UP))]]

    self.assertEqual(set(second_step_transitions), {
        ((0.64, False), env.locations_to_state(((0, 2), (5, 7))), REWARD_OF_LIVING + REWARD_OF_GOAL, True),  # (RIGHT, UP)
        ((0.08, False), env.locations_to_state(((1, 1), (5, 7))), REWARD_OF_LIVING, False),  # (DOWN, UP)
        ((0.08, False), env.locations_to_state(((0, 1), (5, 7))), REWARD_OF_LIVING, False),  # (UP, UP)
        ((0.08, False), env.locations_to_state(((0, 2), (6, 7))), REWARD_OF_LIVING, False),  # (RIGHT, RIGHT)
        ((0.08, False), env.locations_to_state(((0, 2), (6, 6))), REWARD_OF_LIVING, False),  # (RIGHT, LEFT)
        ((0.01, False), env.locations_to_state(((1, 1), (6, 7))), REWARD_OF_LIVING, False),  # (DOWN, RIGHT)
        ((0.01, False), env.locations_to_state(((1, 1), (6, 6))), REWARD_OF_LIVING, False),  # (DOWN, LEFT)
        ((0.01, False), env.locations_to_state(((0, 1), (6, 7))), REWARD_OF_LIVING, False),  # (UP, RIGHT)
        ((0.01, False), env.locations_to_state(((0, 1), (6, 6))), REWARD_OF_LIVING, False),  # (UP, LEFT)
    })
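
# Where the probabilities in the transition tests above come from: with the
# FAIL_PROB used here each agent is assumed to perform its intended move with
# probability 0.8 and to slip to each perpendicular direction with
# probability 0.1. For two independently-noisy agents:
#   both succeed:            0.8 * 0.8 = 0.64
#   one succeeds, one slips: 0.8 * 0.1 = 0.08
#   both slip:               0.1 * 0.1 = 0.01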