def test_reward_single_agent_makespan(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0),)
    goal_locations = ((4, 0),)
    deterministic_env = MapfEnv(grid, 1, start_locations, goal_locations,
                                0, REWARD_OF_CLASH, REWARD_OF_GOAL,
                                REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    total_reward = 0
    down_action = vector_action_to_integer((DOWN,))

    # Three steps towards the goal, each accumulating the living reward.
    for _ in range(3):
        _, r, _, _ = deterministic_env.step(down_action)
        total_reward += r

    # The fourth step reaches the goal.
    s, r, done, _ = deterministic_env.step(down_action)
    total_reward += r

    self.assertEqual(s, deterministic_env.locations_to_state(goal_locations))
    self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_GOAL)
    self.assertEqual(total_reward, REWARD_OF_GOAL + 4 * REWARD_OF_LIVING)
def test_copy_mapf_env(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....',
                     '....'])
    env = MapfEnv(grid, 1, ((0, 0),), ((4, 0),), 0, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING,
                  OptimizationCriteria.Makespan)

    env.step(vector_action_to_integer((RIGHT,)))

    env_copy = copy(env)
    env_copy.step(vector_action_to_integer((RIGHT,)))
def test_reward_multiagent_soc_stay_actions(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0), (3, 3), (1, 1))
    goal_locations = ((0, 1), (1, 3), (1, 2))
    deterministic_env = MapfEnv(grid, 3, start_locations, goal_locations,
                                0, REWARD_OF_CLASH, REWARD_OF_GOAL,
                                REWARD_OF_LIVING, OptimizationCriteria.SoC)

    right_stay_stay = vector_action_to_integer((RIGHT, STAY, STAY))
    s, r, done, _ = deterministic_env.step(right_stay_stay)

    # Under SoC, every agent pays the living reward, including agents that stay.
    self.assertEqual(r, 3 * REWARD_OF_LIVING)
def test_reward_multiagent_makespan(self):
    grid = MapfGrid(['....',
                     '....',
                     '....',
                     '....'])
    start_locations = ((0, 0), (3, 3), (1, 1))
    goal_locations = ((0, 1), (1, 3), (1, 2))
    deterministic_env = MapfEnv(grid, 3, start_locations, goal_locations,
                                0, REWARD_OF_CLASH, REWARD_OF_GOAL,
                                REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    total_reward = 0
    right_up_right = vector_action_to_integer((RIGHT, UP, RIGHT))
    s, r, done, _ = deterministic_env.step(right_up_right)
    total_reward += r
    self.assertFalse(done)

    stay_up_stay = vector_action_to_integer((STAY, UP, STAY))
    s, r, done, _ = deterministic_env.step(stay_up_stay)
    total_reward += r

    self.assertEqual(s, deterministic_env.locations_to_state(goal_locations))
    self.assertTrue(done)
    # Under makespan, each time step costs a single living reward regardless
    # of the number of agents.
    self.assertEqual(total_reward, 2 * REWARD_OF_LIVING + REWARD_OF_GOAL)
def test_action_from_terminal_state_has_no_effect(self):
    grid = MapfGrid(['..',
                     '..'])
    env = MapfEnv(grid, 1, ((0, 0),), ((1, 1),), 0, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING,
                  OptimizationCriteria.Makespan)

    state, reward, done, _ = env.step(vector_action_to_integer((RIGHT,)))
    self.assertEqual(reward, REWARD_OF_LIVING)
    self.assertEqual(done, False)

    state, reward, done, _ = env.step(vector_action_to_integer((DOWN,)))
    self.assertEqual(reward, REWARD_OF_LIVING + REWARD_OF_GOAL)
    self.assertEqual(done, True)

    # Now that the game is finished, do another step and make sure it has
    # no effect.
    state_after_done, reward_after_done, done_after_done, _ = env.step(
        vector_action_to_integer((UP,)))
    self.assertEqual(state_after_done, state)
    self.assertEqual(done_after_done, True)
    self.assertEqual(reward_after_done, 0)

    # Once more, this time as if trying to reach the goal again.
    state_after_done, reward_after_done, done_after_done, _ = env.step(
        vector_action_to_integer((DOWN,)))
    self.assertEqual(state_after_done, state)
    self.assertEqual(done_after_done, True)
    self.assertEqual(reward_after_done, 0)
def test_switch_spots_is_a_collision(self):
    grid = MapfGrid(['..'])
    agents_starts = ((0, 0), (0, 1))
    agents_goals = ((0, 1), (0, 0))
    deterministic_env = MapfEnv(grid, 2, agents_starts, agents_goals,
                                0, REWARD_OF_CLASH, REWARD_OF_GOAL,
                                REWARD_OF_LIVING, OptimizationCriteria.Makespan)

    s, r, done, _ = deterministic_env.step(
        vector_action_to_integer((RIGHT, LEFT)))

    # Assert the game terminated in a collision.
    self.assertEqual(done, True)
    self.assertEqual(r, REWARD_OF_LIVING + REWARD_OF_CLASH)
def lrtdp(
        heuristic_function: Callable[[MapfEnv], Callable[[int], float]],
        max_iterations: int,
        gamma: float,
        epsilon: float,
        env: MapfEnv,
        info: Dict,
) -> Policy:
    info['iterations'] = []

    # Initialize V to an upper bound given by the heuristic.
    env.reset()
    initial_state = env.s
    policy = LrtdpPolicy(env, gamma, heuristic_function(env))

    # Follow the greedy policy; for each transition do a Bellman update on V.
    n_iterations = 0
    while initial_state not in policy.solved and n_iterations < max_iterations:
        n_iterations += 1
        s = env.s
        start = time.time()
        path = []

        # LRTDP trial: walk greedily until reaching a solved state.
        while s not in policy.solved:
            a = greedy_action(env, s, policy.v, gamma)
            path.append((s, a))

            # Bellman backup for the current state.
            new_v_s = sum([prob * (reward + gamma * policy.v[next_state])
                           for prob, next_state, reward, done in env.P[s][a]])
            policy.v_partial_table[s] = new_v_s

            # Simulate the step and sample a new state.
            s, r, done, _ = env.step(a)

            if done:
                # Add the terminal state to the path; the action does not matter.
                path.append((s, 0))
                break

        # Trial finished: check states along the path in reverse order and
        # mark them solved, stopping at the first state that has not converged.
        while path:
            state, action = path.pop()
            if not check_solved(policy, state, epsilon):
                break

        info['iterations'].append({
            'n_moves': len(path),
            'time': round(time.time() - start, 2),
            'n_states_solved': len(policy.solved),
            'final_reward': r,
        })
        env.reset()

    env.reset()
    return policy
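# A minimal usage sketch (an illustration, not part of the original code):
# running lrtdp with a trivial all-zero heuristic on a small single-agent
# MapfEnv. The zero_heuristic and example_lrtdp_usage names are hypothetical
# and only demonstrate the expected heuristic_function shape: given a MapfEnv,
# return a callable mapping a state to an (optimistic) value estimate.
def zero_heuristic(env: MapfEnv) -> Callable[[int], float]:
    # Every state gets the same optimistic value estimate of zero.
    return lambda s: 0.0


def example_lrtdp_usage() -> Policy:
    grid = MapfGrid(['..',
                     '..'])
    env = MapfEnv(grid, 1, ((0, 0),), ((1, 1),), 0, REWARD_OF_CLASH,
                  REWARD_OF_GOAL, REWARD_OF_LIVING,
                  OptimizationCriteria.Makespan)
    info = {}
    # max_iterations=100, gamma=1.0, epsilon=0.01 are illustrative values,
    # not tuned; info is filled with per-trial statistics by lrtdp.
    return lrtdp(zero_heuristic, 100, 1.0, 0.01, env, info)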