def test_env_3(env3_robots):
    """Step the 3-robot env: one idle step, three more idle steps, then a transfer."""
    meta = env3_robots.get_env_metadata()
    env3_robots.reset()
    start = env3_robots.get_current_state()

    # First step: all-zero (idle) action.
    encoded = n_from_prod(meta['sets'], [0, 0, 0])
    obs, reward, done, _ = env3_robots.step(encoded)
    after = get_state_from_observation(obs)

    assert not done
    assert reward == -0.666666
    assert start.robots_data == after.robots_data  # idle step moves no data
    assert start.time + 1 == after.time            # but the clock advances
    assert after.positions == [2, 2, 3]
    assert env3_robots.state.all() == np.array(obs).all()

    # Three further idle steps.
    for _ in range(3):
        encoded = n_from_prod(meta['sets'], [0, 0, 0])
        obs, reward, done, _ = env3_robots.step(encoded)

    # Fifth step: non-zero component for the last pair triggers a transfer.
    encoded = n_from_prod(meta['sets'], [0, 0, 70])
    obs, reward, done, _ = env3_robots.step(encoded)

    assert not done
    assert reward == 0.001
    assert get_state_from_observation(obs).time == 6
    assert get_state_from_observation(obs).robots_data == [15, 30, 0]
def test_one_minus_one_reward_good_env_4(env4_robots):
    """Reward values for legal transfer combinations in the 4-robot env."""
    meta = env4_robots.get_env_metadata()
    giver = OneMinusOneRewardGiverAllowIllegal()

    def reward_for(robots_data, interpreted):
        # All calls share time=10 and the same env metadata.
        return giver.give_reward(State(robots_data, 10, None), interpreted,
                                 meta['meetings'], meta['cycle_lengths'],
                                 meta['max_memory'])

    # Encodes {(r1 -> r0: 2), (r2 -> r3: 13)}.
    encoded = n_from_prod(meta['sets'], [17, 0, 0, 0, 0, 13])
    interpreted = env4_robots.get_action_from_space(encoded)
    assert reward_for([10, 10, 15, 0], interpreted) == 0
    # Same action against a fully-loaded r3.
    assert reward_for([10, 10, 10, 10], interpreted) == settings.REWARD_FOR_INVALID_TRANSFER + 1

    # Encodes {(r1 -> r0: 10), (r2 -> r3: 13)}.
    encoded = n_from_prod(meta['sets'], [25, 0, 0, 0, 0, 13])
    interpreted = env4_robots.get_action_from_space(encoded)
    assert reward_for([10, 10, 15, 0], interpreted) == 0
def test_one_minus_one_reward_bad_env_4(env4_robots):
    """Penalty values for illegal meetings/transfers in the 4-robot env."""
    meta = env4_robots.get_env_metadata()
    giver = OneMinusOneRewardGiverAllowIllegal()

    def reward_for(robots_data, interpreted):
        # All calls share time=8 and the same env metadata.
        return giver.give_reward(State(robots_data, 8, None), interpreted,
                                 meta['meetings'], meta['cycle_lengths'],
                                 meta['max_memory'])

    # Encodes {(r0 -> r1: 1), (r0 -> r2: 2), (r3 -> r0: 5),
    #          (r1 -> r2: 15), (r1 -> r3: 13), (r3 -> r2: 13)}.
    encoded = n_from_prod(meta['sets'], [1, 2, 20, 15, 13, 28])
    interpreted = env4_robots.get_action_from_space(encoded)
    expected = 5 * settings.REWARD_FOR_INVALID_MEETING + settings.REWARD_FOR_INVALID_TRANSFER
    assert reward_for([10, 10, 10, 10], interpreted) == expected
    # The same penalty applies regardless of the robots' stored data.
    assert reward_for([0, 0, 0, 0], interpreted) == expected

    # The all-zero action (one slot per robot pair) incurs no penalty.
    num_pairs = meta['num_robots'] * (meta['num_robots'] - 1) // 2
    encoded = n_from_prod(meta['sets'], [0] * num_pairs)
    interpreted = env4_robots.get_action_from_space(encoded)
    assert reward_for([0, 0, 0, 0], interpreted) == 0
def test_apply_action_test_4_env(env4_robots):
    """apply_action_allow_illegal produces the expected successor states at t=8."""
    meta = env4_robots.get_env_metadata()
    env4_robots.reset()
    base = env4_robots.get_current_state()
    base.time = 8

    def successor(raw_action):
        # Encode, interpret, and apply a raw per-pair action to the base state.
        interpreted = env4_robots.get_action_from_space(
            n_from_prod(meta['sets'], raw_action))
        return apply_action_allow_illegal(base, interpreted,
                                          meta['max_memory'], meta['cycles'])

    assert successor([19, 16, 0, 0, 0, 18]) == State(
        robots_data=[15, 6, 12, 7], time=9, positions=[2, 2, 5, 5])
    assert successor([26, 16, 0, 0, 0, 22]) == State(
        robots_data=[11, 10, 9, 10], time=9, positions=[2, 2, 5, 5])
def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
    """Advance the environment by one time step.

    Parameters
    ----------
    action : int or list
        Either an already-encoded action (any int-like value), or a raw
        per-pair action list, which is validated against the action
        space and packed into a single int with ``n_from_prod``.

    Returns
    -------
    tuple
        ``(observation, reward, done, info)`` in the Gym convention;
        ``info`` is always an empty dict.
    """
    # Accept both encodings: a plain scalar passes through, a list is
    # validated against the action space first.
    if not isinstance(action, list):
        action = int(action)
    else:
        assert self.action_space.contains(
            action), f'{action}, {type(action)} invalid'
        action = n_from_prod(self.__sets, action)

    # Convert the raw observation / encoded action into interpreted forms.
    interpreted_state = get_state_from_observation(self.state)
    interpreted_action = self.get_action_from_space(action)
    self.__state_action = interpreted_action

    reward = self.__reward_class.give_reward(interpreted_state,
                                             interpreted_action,
                                             self.__meetings,
                                             self.__cycles_lengths,
                                             self.__max_memory)
    if reward == settings.REWARD_FOR_INVALID_ACTION:
        # Invalid action: time still advances and robots still move along
        # their cycles, but the requested transfers are not applied.
        new_state = apply_action_only_increase_time_move_robots(
            interpreted_state, interpreted_action, self.__max_memory,
            self.__cycles)
    else:
        new_state = self.__action_apply(interpreted_state,
                                        interpreted_action,
                                        self.__max_memory, self.__cycles)

    self.state = np.array(get_observation_from_state(new_state))
    # Reward is scaled down by 1e6 — presumably to keep magnitudes small
    # for the learner; confirm against the training configuration.
    return self.state, reward / 1000000, check_if_done(
        new_state, settings.MAXIMUM_NUM_ITER), {}
def test_one_minus_one_reward_bad_env_3(env3_robots):
    """Two invalid meetings plus one failed transfer yield the combined penalty."""
    meta = env3_robots.get_env_metadata()
    interpreted = env3_robots.get_action_from_space(
        n_from_prod(meta['sets'], [15, 20, 56]))
    giver = OneMinusOneRewardGiverAllowIllegal()
    expected = 2 * settings.REWARD_FOR_INVALID_MEETING - 1
    assert giver.give_reward(State([15, 20, 1], 5, None), interpreted,
                             meta['meetings'], meta['cycle_lengths'],
                             meta['max_memory']) == expected
def test_apply_action_env_3_robots(env3_robots):
    """apply_action_allow_illegal gives the expected state and is repeatable."""
    meta = env3_robots.get_env_metadata()
    env3_robots.reset()
    current = env3_robots.get_current_state()

    encoded = n_from_prod(meta['sets'], [61, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    successor = apply_action_allow_illegal(current, interpreted,
                                           meta['max_memory'], meta['cycles'])
    assert successor == State(robots_data=[21, 9, 15], time=2,
                              positions=[2, 2, 3])

    # Re-encoding and re-applying the same action must reproduce the state.
    encoded = n_from_prod(meta['sets'], [61, 0, 0])
    interpreted = env3_robots.get_action_from_space(encoded)
    assert apply_action_allow_illegal(current, interpreted,
                                      meta['max_memory'],
                                      meta['cycles']) == successor
def test_env_termination(env4_robots):
    """The 4-robot env reports done after the scripted eight-step schedule."""
    meta = env4_robots.get_env_metadata()
    env4_robots.reset()
    # Non-trivial actions are scheduled at steps 2, 4, 6 and 8; any other
    # step falls back to the all-zero action stored under key '-1'.
    action_map_terminate = {
        '-1': [0] * 6,
        '2': [25] + [0] * 5,
        '4': [0, 25] + [0] * 4,
        '6': [0] * 5 + [25],
        '8': [0, 25] + [0] * 4,
    }
    for tick in range(1, 9):
        raw = action_map_terminate.get(str(tick)) or action_map_terminate['-1']
        encoded = n_from_prod(meta['sets'], raw)
        _, _, done, _ = env4_robots.step(encoded)
    assert done == True
def test_one_minus_one_reward_good_env_3(env3_robots):
    """A valid transfer earns +1; an impossible one earns the invalid-transfer penalty."""
    meta = env3_robots.get_env_metadata()
    giver = OneMinusOneRewardGiverAllowIllegal()

    def reward_for(robots_data, raw_action):
        # All calls share time=10 and the same env metadata.
        interpreted = env3_robots.get_action_from_space(
            n_from_prod(meta['sets'], raw_action))
        return giver.give_reward(State(robots_data, 10, None), interpreted,
                                 meta['meetings'], meta['cycle_lengths'],
                                 meta['max_memory'])

    assert reward_for([2, 2, 0], [57, 0, 0]) == 1
    assert reward_for([0, 0, 0], [58, 0, 0]) == settings.REWARD_FOR_INVALID_TRANSFER