def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (-1, -1), seed: int = 0, n_transition: float = 0.95, diagonals: int = 9, action_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (objective 1, objective 2)
    :param seed: Seed used for np.random.RandomState method.
    :param n_transition: if is 1, always do the action indicated. (Original is about 0.6)
    :param diagonals: Number of diagonals to be used to build this environment (allows experimenting with an
        identical environment, but considering only the first k diagonals) (By default 9 - all).
    """

    # Clamp the square mesh side to [1, 10] (the original full-size environment).
    side = min(max(diagonals + 1, 1), 10)
    mesh_shape = (side, side)

    # States along the main anti-diagonal of the mesh.
    diagonals_states = set(zip(range(0, diagonals + 1, 1), range(diagonals, -1, -1)))

    # Final states keyed by position, rewarded proportionally to their coordinates.
    finals = {state: (Vector(state) + 1) * 10 for state in diagonals_states}

    # Publish the Pareto-optimal set on the class.
    PyramidMDP.pareto_optimal = {Vector(state) + 1 for state in diagonals_states}

    # Obstacles: for each final (x, y), every non-final cell (x, row) with row in [y, diagonals].
    obstacles = frozenset(
        (x, row)
        for x, y in finals.keys()
        for row in range(y, diagonals + 1)
        if (x, row) not in finals
    )

    # Default reward (objective_1, objective_2)
    default_reward = Vector(default_reward)

    # Transition probability must be a valid probability.
    assert 0 <= n_transition <= 1.
    self.n_transition = n_transition

    super().__init__(mesh_shape=mesh_shape, initial_state=initial_state, default_reward=default_reward,
                     finals=finals, obstacles=obstacles, seed=seed, action_space=action_space)
def __init__(self, initial_state: tuple = ((0, 0), False), default_reward: tuple = (0, 0), seed: int = 0, action_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (objective 1, objective 2)
    :param seed: Seed used for np.random.RandomState method.
    :param action_space:
    """

    # Treasures: five on the bottom row (x = 8), four on the right column (y = 8).
    finals = {}
    for i in range(5):
        finals[(8, 2 * i)] = Vector([2 * i + 1, 9])
    for i in range(4):
        finals[(2 * i, 8)] = Vector([9, 2 * i + 1])

    # Define mesh shape
    mesh_shape = (9, 9)

    # Three blocked cells near the start corner.
    obstacles = frozenset({(2, 2), (2, 3), (3, 2)})

    # Default reward plus time (objective 1, objective 2, time)
    default_reward += (-1,)
    default_reward = Vector(default_reward)

    # Observation = (position (x, y), bonus flag).
    observation_space = gym.spaces.Tuple(
        (
            gym.spaces.Tuple(
                (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))
            ),
            spaces.Boolean()
        )
    )

    super().__init__(mesh_shape=mesh_shape, default_reward=default_reward, initial_state=initial_state,
                     finals=finals, obstacles=obstacles, observation_space=observation_space, seed=seed,
                     action_space=action_space)

    # Pits marks which returns the agent to the start location.
    self.pits = {
        (7, 1), (7, 3), (7, 5), (1, 7), (3, 7), (5, 7)
    }

    # X2 bonus
    self.bonus = [
        (3, 3)
    ]
def evaluate_bounds_are_of_type(self, bounds, type_id):
    """
    Count how many tiles on the perimeter of `bounds` match `type_id`.

    The two horizontal edges cover the full width (corners included); the two
    vertical edges skip both corner rows so every corner tile is counted
    exactly once.

    :param bounds: rectangle with x_min/x_max/y_min/y_max (max-exclusive)
    :param type_id: tile type identifier to test against
    :return: number of perimeter tiles whose type matches `type_id`
    """
    total = 0
    position = Vector(0, 0)

    # Bottom horizontal bound (corners included).
    position.y = bounds.y_min
    for x in range(bounds.x_min, bounds.x_max):
        position.x = x
        total += self.evaluate_tile_type(position, type_id)

    # Top horizontal bound (corners included).
    position.y = bounds.y_max - 1
    for x in range(bounds.x_min, bounds.x_max):
        position.x = x
        total += self.evaluate_tile_type(position, type_id)

    # Left vertical bound. BUGFIX: start at y_min + 1 — the bottom-left corner
    # is already counted by the bottom edge; the original started at y_min and
    # counted that tile twice (the right edge below already skips it).
    position.x = bounds.x_min
    for y in range(bounds.y_min + 1, bounds.y_max - 1):
        position.y = y
        total += self.evaluate_tile_type(position, type_id)

    # Right vertical bound (corners excluded).
    position.x = bounds.x_max - 1
    for y in range(bounds.y_min + 1, bounds.y_max - 1):
        position.y = y
        total += self.evaluate_tile_type(position, type_id)

    return total
def __init__(self, initial_state: tuple = ((0, 0), False), default_reward: tuple = (0, 0), seed: int = 0):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (objective 1, objective 2)
    :param seed: Seed used for np.random.RandomState method.
    """

    # Actions are drawn from a seeded bag space.
    bag_space = Bag([])
    bag_space.seed(seed)

    super().__init__(seed=seed, initial_state=initial_state, default_reward=default_reward,
                     action_space=bag_space)

    # A single blocked cell in this variant.
    self.obstacles = frozenset({(2, 2)})

    # In this variant pits are terminal states with a fixed penalty; once
    # converted to finals the pit list is emptied.
    self.finals.update({pit: Vector([-50, -50]) for pit in self.pits})
    self.pits = list()
def __init__(self, initial_state: tuple = (5, 2), default_reward: tuple = (0, -1), seed: int = 0, action_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (mission_success, radiation)
    :param seed: Seed used for np.random.RandomState method.
    """

    # Final states: three vertical bands with their mission rewards.
    finals = {(0, i): 20 for i in range(5)}
    finals.update({(9, i): 10 for i in range(3)})
    finals.update({(12, i): 30 for i in range(5)})

    obstacles = frozenset()
    mesh_shape = (13, 5)
    default_reward = Vector(default_reward)

    super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state,
                     default_reward=default_reward, finals=finals, obstacles=obstacles,
                     action_space=action_space)

    # Asteroid ring around the start position.
    self.asteroids = {
        (5, 0), (4, 1), (6, 1), (3, 2), (7, 2), (4, 3), (6, 3), (5, 4)
    }

    # Radiation columns x = 1, 10, 11 (entering them is penalized with -100 per the class comment).
    self.radiations = {(x, i) for x in (1, 10, 11) for i in range(5)}
def test_transition_reward(self):
    # The reward depends only on the state reached, so any origin and action work.
    state = self.environment.observation_space.sample()
    action = self.environment.action_space.sample()

    def reward_for(next_state):
        return self.environment.transition_reward(state=state, action=action, next_state=next_state)

    # Crashing into an asteroid destroys the ship.
    for asteroid_state in self.environment.asteroids:
        self.assertEqual(Vector((-100, -1)), reward_for(asteroid_state))

    # Radiation cells apply the radiation penalty.
    for radiation_state in self.environment.radiations:
        self.assertEqual(Vector((0, -11)), reward_for(radiation_state))

    # Final states pay their mission reward.
    for final_state, final_reward in self.environment.finals.items():
        self.assertEqual(Vector((final_reward, -1)), reward_for(final_state))

    # Every remaining state yields the default reward.
    special_states = set(self.environment.finals.keys()).union(
        self.environment.radiations).union(self.environment.asteroids)
    for simple_state in self.environment.states() - special_states:
        self.assertEqual(Vector((0, -1)), reward_for(simple_state))
def setUp(self):
    # Discrete observation space with seven positions.
    observation_space = gym.spaces.Discrete(7)

    # Three-objective default reward.
    default_reward = Vector([1, 2, 1])

    # Seed fixed to 0 so the tests are reproducible.
    self.environment = Environment(observation_space=observation_space,
                                   default_reward=default_reward, seed=0)
def evaluate_tiles_in_bounds_are_of_type(self, bounds, type_id):
    """
    Count the tiles of type `type_id` inside the whole rectangle `bounds`.

    :param bounds: rectangle with x_min/x_max/y_min/y_max (max-exclusive)
    :param type_id: tile type identifier to test against
    :return: number of matching tiles in the area
    """
    total = 0
    position = Vector(0, 0)

    # Sweep the full area column by column.
    for column in range(bounds.x_min, bounds.x_max):
        position.x = column
        for row in range(bounds.y_min, bounds.y_max):
            position.y = row
            total += self.evaluate_tile_type(position, type_id)

    return total
def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0,), seed: int = 0, columns: int = 0, action_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (treasure_value, )
    :param seed: Seed used for np.random.RandomState method.
    """

    original_mesh_shape = (10, 11)

    # Out-of-range requests fall back to the full-size environment.
    if columns < 1 or columns > original_mesh_shape[0]:
        columns = original_mesh_shape[0]

    # Treasures with their values, keyed by position.
    treasures = {
        (0, 1): 5, (1, 2): 80, (2, 3): 120, (3, 4): 140, (4, 4): 145,
        (5, 4): 150, (6, 7): 163, (7, 7): 166, (8, 9): 173, (9, 10): 175,
    }

    # Keep only the treasures inside the selected columns.
    finals = {position: value for position, value in treasures.items() if position[0] < columns}

    # Blocked cells: per column x, every row from first_blocked_row[x] up to 10.
    first_blocked_row = {0: 2, 1: 3, 2: 4, 3: 5, 4: 5, 5: 5, 6: 8, 7: 8, 8: 10}
    obstacles = frozenset(
        (x, y)
        for x, first_row in first_blocked_row.items()
        for y in range(first_row, 11)
        if x < columns
    )

    # Resize mesh_shape
    mesh_shape = (columns, 11)

    # Default reward plus time (time_inverted, treasure_value, water_pressure)
    default_reward = (-1,) + default_reward + (0,)
    default_reward = Vector(default_reward)

    super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward,
                     initial_state=initial_state, finals=finals, obstacles=obstacles,
                     action_space=action_space)
def test_transition_reward(self):
    # The reward depends only on the state reached, so any origin and action work.
    state = self.environment.observation_space.sample()
    action = self.environment.action_space.sample()

    # A non-treasure (intermediate) state.
    reward = self.environment.transition_reward(state=state, action=action, next_state=(1, 1))
    self.assertEqual(reward, Vector((-1, 0, -2)))

    # A treasure (final) state.
    reward = self.environment.transition_reward(state=state, action=action, next_state=(1, 2))
    self.assertEqual(reward, Vector([-1, 80, -3]))
def is_room(self, start_position, area_bounds):
    """
    Decide whether a rectangular room of WALL tiles has its top-left corner at
    `start_position`, scanning only inside `area_bounds`.

    :param start_position: candidate top-left corner (Vector-like, .x/.y)
    :param area_bounds: limits of the search area (.x_max/.y_max used here)
    :return: (found, next_position, room_bounds) — `next_position` is a scalar
        y-coordinate just past the scanned vertical run (presumably where the
        caller may resume scanning — TODO confirm); `room_bounds` is a Bounds
        only when `found` is True, else None.
    """
    found_floor = False
    # Start one tile below the candidate corner.
    current_position = Vector(start_position.x, start_position.y + 1)

    # Check vertical for the room bounds: walk down while tiles are WALL;
    # the first non-WALL tile marks the bottom of the vertical run.
    while (current_position.y <= area_bounds.y_max and not found_floor):
        if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
            found_floor = True
            current_position.y -= 1
        else:
            current_position.y += 1
    room_bounds_y = current_position.y
    next_position = room_bounds_y + 1
    # No bottom found inside the area, or the run has zero height: not a room.
    if not found_floor or room_bounds_y == start_position.y:
        return False, next_position, None

    # Check horizontal for the room bounds: from the bottom-left of the run,
    # walk right while tiles are WALL; first non-WALL marks the right edge.
    found_floor = False
    current_position.x += 1
    while (current_position.x <= area_bounds.x_max and not found_floor):
        if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
            found_floor = True
            current_position.x -= 1
        else:
            current_position.x += 1
    room_bounds_x = current_position.x
    # No right edge found inside the area, or zero width: not a room.
    if not found_floor or room_bounds_x == start_position.x:
        return False, next_position, None

    # Verify the right edge (x == room_bounds_x at this point) is solid WALL.
    for y in range(start_position.y, room_bounds_y + 1):
        current_position.y = y
        if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
            return False, next_position, None

    # Verify the starting row (y == start_position.y) is solid WALL.
    current_position.y = start_position.y
    for x in range(start_position.x, room_bounds_x + 1):
        current_position.x = x
        if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
            return False, next_position, None

    # Reject degenerate rooms only one tile wide or tall.
    if abs(start_position.x - room_bounds_x) == 1 or \
            abs(start_position.y - room_bounds_y) == 1:
        return False, next_position, None

    room_bounds = Bounds(start_position.x, start_position.y, room_bounds_x, room_bounds_y)
    return True, next_position, room_bounds
def setUp(self):
    # A 7x7 mesh with two blocked cells.
    mesh_shape = (7, 7)
    obstacles = frozenset({
        (0, 0), (1, 1)
    })

    # Three-objective default reward.
    default_reward = Vector([1, 2, 1])

    # Seed fixed to 0 so the tests are reproducible.
    self.environment = EnvMesh(mesh_shape=mesh_shape, default_reward=default_reward,
                               seed=0, obstacles=obstacles)
def __init__(self, initial_state: tuple = ((2, 4), (0, 0)), default_reward: tuple = (0, 0, 0), seed: int = 0, p_attack: float = 0.1, mesh_shape: tuple = (5, 5), gold_positions: frozenset = frozenset({(2, 0)}), gem_positions: frozenset = frozenset({(4, 1)}), observation_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (enemy_attack, gold, gems)
    :param seed: Seed used for np.random.RandomState method.
    :param p_attack: Probability that a enemy attacks when agent stay in an enemy position.
    """

    default_reward = Vector(default_reward)

    if observation_space is None:
        # Observation = (position (x, y), carried objects (gold, gems)).
        position_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1])))
        objects_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(2), gym.spaces.Discrete(2)))
        observation_space = gym.spaces.Tuple((position_space, objects_space))

    # This environment defines no terminal positions of its own.
    finals = frozenset()

    super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state,
                     default_reward=default_reward, observation_space=observation_space,
                     finals=finals)

    # Collectible positions.
    self.gold_positions = gold_positions
    self.gem_positions = gem_positions

    # Enemy positions and the chance an enemy attacks.
    self.enemies_positions = {(3, 0), (2, 1)}
    self.p_attack = p_attack

    # Home cell; checkpoints derive from it.
    self.home_position = (2, 4)
    self.checkpoints_states = self._checkpoints_states()
def find_rooms(self, bounds):
    """
    Scan `bounds` for rooms, remember every room found in `self.rooms`
    and return the list of their bounds.
    """
    found_rooms = list()
    probe = Vector(0, 0)

    next_position = 0
    for column in range(bounds.x_min, bounds.x_max):
        probe.x = column
        for row in range(bounds.y_min, bounds.y_max):
            probe.y = row
            # Only a wall tile can be the corner of a room.
            if self.is_tile_of_type(probe, TILE_TYPES["WALL"]):
                result, next_position, room_bounds = self.is_room(probe, bounds)
                if result:
                    found_rooms.append(room_bounds)

    self.rooms = found_rooms
    return found_rooms
config = configparser.ConfigParser()
config.read(config_file)

# TODO: add errors handling
# TODO: move all to the new class

# Scene origin, read from the [START POINT] section.
start_point = Point(
    float(config['START POINT']['x']),
    float(config['START POINT']['y']),
    float(config['START POINT']['z']),
)

# Scene size, read from the [SCENE DIMENSIONS] section.
dimensions = Vector(
    float(config['SCENE DIMENSIONS']['dx']),
    float(config['SCENE DIMENSIONS']['dy']),
    float(config['SCENE DIMENSIONS']['dz']),
)

# Remaining settings from the [OTHERS] section.
stl_file = config['OTHERS']['STL file path']
condition = float(config['OTHERS']['minimum volume'])
result_file_path = config['OTHERS']['result file path']

# Load the STL mesh and build the octree down to the minimum volume.
stl = STL(stl_file)

print('> Generate octree...')
root = Node(start_point, dimensions)
# NOTE(review): keyword `object` shadows the builtin name in get_grid's API.
get_grid(root, condition=condition, object=stl)

### NP.ARRAY ###
# arr = array([], dtype=float)
def __init__(self, seed: int = 0, initial_state: int = 0, default_reward: tuple = (0, 0)):
    """
    :param seed: Initial initial_seed. The same is used for _action_space, observation_space, and random number
        generator
    :param initial_state: start position for all episodes.
    :param default_reward: Default reward returned by the environment when a reward is not defined (objective 1,
        objective 2).
    """

    # Create the observation space (seven discrete positions: 0..6).
    observation_space = gym.spaces.Discrete(7)

    # Default reward
    default_reward = Vector(default_reward)

    # Super call constructor
    super().__init__(observation_space=observation_space, seed=seed, initial_state=initial_state,
                     default_reward=default_reward)

    # Rewards dictionary: position -> {action: reward vector}.
    self.rewards_dictionary = {
        0: {
            self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
            self.actions['CLOCKWISE']: Vector([-1, 3])
        },
        1: {
            self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
            self.actions['CLOCKWISE']: Vector([-1, 0])
        },
        2: {
            self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
            self.actions['CLOCKWISE']: Vector([-1, 0])
        },
        3: {
            self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
            self.actions['CLOCKWISE']: Vector([-1, 0])
        },
        4: {
            self.actions['CLOCKWISE']: Vector([-1, 3]),
            self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])
        },
        5: {
            self.actions['CLOCKWISE']: Vector([-1, 3]),
            self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])
        },
        6: {
            self.actions['CLOCKWISE']: Vector([-1, 3]),
            self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])
        }
    }

    # Possible p_stochastic from a position to another: position -> {action: next position}.
    self.possible_transitions = {
        0: {
            self.actions['COUNTER_CLOCKWISE']: 1,
            self.actions['CLOCKWISE']: 4
        },
        1: {
            self.actions['COUNTER_CLOCKWISE']: 2,
            self.actions['CLOCKWISE']: 0
        },
        2: {
            self.actions['COUNTER_CLOCKWISE']: 3,
            self.actions['CLOCKWISE']: 1
        },
        3: {
            self.actions['COUNTER_CLOCKWISE']: 0,
            self.actions['CLOCKWISE']: 2
        },
        4: {
            self.actions['CLOCKWISE']: 5,
            self.actions['COUNTER_CLOCKWISE']: 0
        },
        5: {
            self.actions['CLOCKWISE']: 6,
            self.actions['COUNTER_CLOCKWISE']: 4
        },
        6: {
            self.actions['CLOCKWISE']: 0,
            self.actions['COUNTER_CLOCKWISE']: 5
        }
    }
def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0, ), columns: int = 10, seed: int = 0, action_space: gym.spaces = None):
    """
    :param initial_state: Initial state where start the agent.
    :param default_reward: (time_inverted, treasure_value)
    :param columns: Number of columns to be used to build this environment (allows experimenting with an
        identical environment, but considering only the first k columns) (By default 10 - all).
    :param seed: Seed used for np.random.RandomState method.
    :param action_space: Specific action space
    """

    # the original full-size environment.
    original_mesh_shape = (10, 11)

    # Out-of-range requests fall back to the full-size environment.
    if columns < 1 or columns > original_mesh_shape[0]:
        columns = original_mesh_shape[0]

    # Treasure amounts keyed by their position.
    treasures = {
        (0, 1): 1, (1, 2): 2, (2, 3): 3, (3, 4): 5, (4, 4): 8,
        (5, 4): 16, (6, 7): 24, (7, 7): 50, (8, 9): 74, (9, 10): 124,
    }

    # Keep only the treasures within the selected columns.
    finals = {position: value for position, value in treasures.items() if position[0] < columns}

    # Blocked cells: per column x, every row from first_blocked_row[x] up to 10.
    first_blocked_row = {0: 2, 1: 3, 2: 4, 3: 5, 4: 5, 5: 5, 6: 8, 7: 8, 8: 10}
    obstacles = frozenset(
        (x, y)
        for x, first_row in first_blocked_row.items()
        for y in range(first_row, 11)
        if x < columns
    )

    # Subspace of the environment to be considered
    mesh_shape = (columns, 11)

    # Default reward plus time (time_inverted, treasure_value)
    default_reward = (-1, ) + default_reward
    default_reward = Vector(default_reward)

    super().__init__(mesh_shape=mesh_shape, initial_state=initial_state, default_reward=default_reward,
                     finals=finals, obstacles=obstacles, seed=seed, action_space=action_space)
class DeepSeaTreasure(EnvMesh):
    """
    Deep Sea Treasure environment over a (columns x 11) mesh: every step costs
    -1 time and reaching a final state pays its treasure value.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal policy vector-values
    pareto_optimal = [
        Vector([-1, 1]), Vector([-3, 2]), Vector([-5, 3]), Vector([-7, 5]), Vector([-8, 8]),
        Vector([-9, 16]), Vector([-13, 24]), Vector([-14, 50]), Vector([-17, 74]), Vector([-19, 124])
    ]

    # Experiments common hypervolume reference
    hv_reference = Vector((-25, 0))

    def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0, ), columns: int = 10,
                 seed: int = 0, action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (time_inverted, treasure_value)
        :param columns: Number of columns to be used to build this environment (allows experimenting with an
            identical environment, but considering only the first k columns) (By default 10 - all).
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: Specific action space
        """

        # the original full-size environment.
        original_mesh_shape = (10, 11)

        # Out-of-range requests fall back to the full width.
        if columns < 1 or columns > original_mesh_shape[0]:
            columns = original_mesh_shape[0]

        # Dictionary with final states as keys, and treasure amounts as values.
        finals = {
            (0, 1): 1,
            (1, 2): 2,
            (2, 3): 3,
            (3, 4): 5,
            (4, 4): 8,
            (5, 4): 16,
            (6, 7): 24,
            (7, 7): 50,
            (8, 9): 74,
            (9, 10): 124,
        }

        # Filter finals states
        finals = dict(filter(lambda x: x[0][0] < columns, finals.items()))

        # Filter obstacles states
        obstacles = frozenset()
        obstacles = obstacles.union([(0, y) for y in range(2, 11)])
        obstacles = obstacles.union([(1, y) for y in range(3, 11)])
        obstacles = obstacles.union([(2, y) for y in range(4, 11)])
        obstacles = obstacles.union([(3, y) for y in range(5, 11)])
        obstacles = obstacles.union([(4, y) for y in range(5, 11)])
        obstacles = obstacles.union([(5, y) for y in range(5, 11)])
        obstacles = obstacles.union([(6, y) for y in range(8, 11)])
        obstacles = obstacles.union([(7, y) for y in range(8, 11)])
        obstacles = obstacles.union([(8, y) for y in range(10, 11)])
        obstacles = frozenset(filter(lambda x: x[0] < columns, obstacles))

        # Subspace of the environment to be considered
        mesh_shape = (columns, 11)

        # Default reward plus time (time_inverted, treasure_value)
        default_reward = (-1, ) + default_reward
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, initial_state=initial_state, default_reward=default_reward,
                         finals=finals, obstacles=obstacles, seed=seed, action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (time_inverted, treasure_value), final, extra)
        """

        # Initialize rewards as vector
        reward = self.default_reward.copy()

        # Update current position
        self.current_state = self.next_state(action=action)

        # Get treasure value (default component when not a treasure cell).
        reward[1] = self.finals.get(self.current_state, self.default_reward[1])

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Given a state, an action and a next state, return the corresponding reward.
        Only `next_state` affects the result here.
        :param state:
        :param action:
        :param next_state:
        :return:
        """

        # Default reward
        reward = self.default_reward.copy()

        # Get treasure reward
        reward[1] = self.finals.get(next_state, self.default_reward[1])

        return reward
clock = pygame.time.Clock()
font = pygame.font.SysFont(None, 25)

# Basic RGB palette.
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)

# Simulation parameters.
FRAME_RATE = 60
WIDTH = 1000
HEIGHT = 800
FRICTION = 0
ELASTICITY = 1.0
GRAVITY = Vector(0, 0)
BALL_SIZE = 50
INITIAL_VELOCITY_SCALAR = 5

# Main window, cleared to black.
GAME_DISPLAY = pygame.display.set_mode((WIDTH, HEIGHT))
GAME_DISPLAY.fill(BLACK)

# Interaction state (set from the event loop elsewhere).
mouse_pos = None
modify_type = None
modify_up = False
modify_down = False

# All balls currently in the simulation.
balls = []


def update_balls(balls):
    updated_balls = []
class MoPuddleWorld(EnvMesh):
    """
    Puddle-world environment on a 20x20 mesh with a single goal position:
    non-goal steps are penalized and puddle cells add a distance-based penalty.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Experiments common hypervolume reference
    hv_reference = Vector([-50, -150])

    def __init__(self, default_reward: tuple = (10, 0), penalize_non_goal: float = -1, seed: int = 0,
                 final_state: tuple = (19, 0), action_space: gym.spaces = None):
        """
        :param default_reward: (non_goal_reached, puddle_penalize)
        :param penalize_non_goal: While agent does not reach a final position get a penalize.
        :param seed: Initial initial_seed. The same is used for _action_space, observation_space, and random
            number generator
        :param final_state: This environment only has a final position.
        """

        self.final_state = final_state
        mesh_shape = (20, 20)
        default_reward = VectorDecimal(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward,
                         action_space=action_space)

        # Puddle cells: the union of two rectangles.
        self.puddles = frozenset()
        self.puddles = self.puddles.union([(x, y) for x in range(0, 11) for y in range(3, 7)])
        self.puddles = self.puddles.union([(x, y) for x in range(6, 10) for y in range(2, 14)])
        self.penalize_non_goal = penalize_non_goal

        self.current_state = self.reset()

        # Get free spaces (all states minus puddles).
        self.free_spaces = set(self.states() - self.puddles)

    def step(self, action: int) -> (tuple, VectorDecimal, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (non_goal_reached, puddle_penalize), final, extra)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Update previous position
        self.current_state = self.next_state(action=action)

        # If agent is in treasure
        final = self.is_final(self.current_state)

        # Set final reward
        if not final:
            reward[0] = self.penalize_non_goal

        # if the current position is in an puddle
        if self.current_state in self.puddles:
            # Set penalization per distance
            reward[1] = self.calc_puddle_penalization(state=self.current_state)

        # Set extra
        info = {}

        return self.current_state, reward, final, info

    def calc_puddle_penalization(self, state: tuple) -> float:
        """
        Return a float that represents a penalization, the penalization is the lowest distance between current
        state and the nearest border in manhattan distance.

        NOTE(review): the generator below shadows the `state` parameter with its
        loop variable — the distance is measured from `self.current_state`, and
        the argument is effectively unused. Confirm whether this is intended.
        :param state:
        :return:
        """

        # Min distance found!
        min_distance = min(cityblock(self.current_state, state) for state in self.free_spaces)

        # Set penalization per distance
        return -min_distance

    def reset(self) -> tuple:
        """
        Get random non-goal position to current_value
        :return:
        """

        # NOTE(review): re-seeding with the initial seed on every reset makes
        # the sampled start position identical across episodes — confirm.
        self.seed(seed=self.initial_seed)

        random_space = None

        # Keep sampling until a non-goal position is drawn.
        while random_space is None or random_space == self.final_state:
            random_space = self.observation_space.sample()

        self.current_state = random_space
        return self.current_state

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent is on final position
        :param state:
        :return:
        """
        return state == self.final_state

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `position` using `action`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # If agent is in treasure
        final = self.is_final(next_state)

        # Set final reward
        if not final:
            reward[0] = self.penalize_non_goal

        # if the current position is in an puddle
        if next_state in self.puddles:
            # Min distance found! (the generator's `state` shadows the parameter)
            min_distance = min(cityblock(next_state, state) for state in self.free_spaces)

            # Set penalization per distance
            reward[1] = -min_distance

        return reward

    def states(self) -> set:
        """
        Return all possible states of this environment.
        :return:
        """

        # Unpack spaces
        x_position, y_position = self.observation_space.spaces

        # Every mesh cell except the goal position.
        return set((x, y) for x in range(x_position.n)
                   for y in range(y_position.n)).difference({self.final_state})
}, 'W_{0.01}': { 'color': 'c', 'marker': 'state' }, 'W_{0.005}': { 'color': 'b', 'marker': 'd' }, 'W_{0.001}': { 'color': 'k', 'marker': 'o' } } vector_reference = Vector((-25, 0)) def pareto_graph(data: dict): # Columns columns = list(data.keys())[0] # Prepare hypervolume to dumps data pareto_file = Path(__file__).parent.joinpath( 'article/output/pareto_{}.m'.format(columns)) # If any parents doesn't exist, make it. pareto_file.parent.mkdir(parents=True, exist_ok=True) data = data[columns]
def test_vectors(self):
    # (1, 1) and (0, 12) are 45 degrees (pi / 4 radians) apart.
    first = Vector([1, 1])
    second = Vector([0, 12])
    self.assertAlmostEqual(first.angle(second), math.pi / 4)
class PressurizedBountifulSeaTreasure(EnvMesh):
    """
    Deep-sea treasure variant with a third objective: each reward carries a
    water-pressure component equal to -(depth + 1) of the state reached.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal
    pareto_optimal = [
        (-1, 5, -2), (-3, 80, -3), (-5, 120, -4), (-7, 140, -5), (-8, 145, -6),
        (-9, 150, -6), (-13, 163, -8), (-14, 166, -8), (-17, 173, -10), (-19, 175, -11)
    ]

    # Experiments common hypervolume reference
    hv_reference = Vector([-25, 0, -120])

    def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0,), seed: int = 0,
                 columns: int = 0, action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (treasure_value, )
        :param seed: Seed used for np.random.RandomState method.
        """

        original_mesh_shape = (10, 11)

        # Reduce the number of diagonals
        if columns < 1 or columns > original_mesh_shape[0]:
            columns = original_mesh_shape[0]

        # List of all treasures and its reward.
        finals = {
            (0, 1): 5,
            (1, 2): 80,
            (2, 3): 120,
            (3, 4): 140,
            (4, 4): 145,
            (5, 4): 150,
            (6, 7): 163,
            (7, 7): 166,
            (8, 9): 173,
            (9, 10): 175,
        }

        # Filter finals states
        finals = dict(filter(lambda x: x[0][0] < columns, finals.items()))

        # Sea-floor obstacles, column by column.
        obstacles = frozenset()
        obstacles = obstacles.union([(0, y) for y in range(2, 11)])
        obstacles = obstacles.union([(1, y) for y in range(3, 11)])
        obstacles = obstacles.union([(2, y) for y in range(4, 11)])
        obstacles = obstacles.union([(3, y) for y in range(5, 11)])
        obstacles = obstacles.union([(4, y) for y in range(5, 11)])
        obstacles = obstacles.union([(5, y) for y in range(5, 11)])
        obstacles = obstacles.union([(6, y) for y in range(8, 11)])
        obstacles = obstacles.union([(7, y) for y in range(8, 11)])
        obstacles = obstacles.union([(8, y) for y in range(10, 11)])

        # Filter obstacles states
        obstacles = frozenset(filter(lambda x: x[0] < columns, obstacles))

        # Resize mesh_shape
        mesh_shape = (columns, 11)

        # Default reward plus time (time_inverted, treasure_value, water_pressure)
        default_reward = (-1,) + default_reward + (0,)
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward,
                         initial_state=initial_state, finals=finals, obstacles=obstacles,
                         action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (time_inverted, treasure_value), final, extra)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Update previous position
        self.current_state = self.next_state(action=action)

        # Get treasure value
        reward[1] = self.finals.get(self.current_state, self.default_reward[1])

        # Water pressure (y-coordinate)
        reward[2] = -(self.current_state[1] + 1)

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `position` using `action`.
        Only `next_state` affects the result here.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """

        # Default reward
        reward = self.default_reward.copy()

        # Get treasure reward
        reward[1] = self.finals.get(next_state, self.default_reward[1])

        # Water pressure (y-coordinate)
        reward[2] = -(next_state[1] + 1)

        return reward
class SpaceExploration(EnvMesh):
    """
    8-neighbour cyclic mesh environment: the agent seeks goal cells while
    avoiding asteroids (crash, -100) and radiation cells (-11 penalty).
    """

    # Possible actions (8-neighbour movement).
    _actions = {'UP': 0, 'UP RIGHT': 1, 'RIGHT': 2, 'DOWN RIGHT': 3, 'DOWN': 4, 'DOWN LEFT': 5, 'LEFT': 6,
                'UP LEFT': 7}

    # Experiments common hypervolume reference
    hv_reference = Vector([-100, -150])

    def __init__(self, initial_state: tuple = (5, 2), default_reward: tuple = (0, -1), seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (mission_success, radiation)
        :param seed: Seed used for np.random.RandomState method.
        """

        # List of all treasures and its reward.
        finals = {}
        finals.update({(0, i): 20 for i in range(5)})
        finals.update({(9, i): 10 for i in range(3)})
        finals.update({(12, i): 30 for i in range(5)})

        obstacles = frozenset()
        mesh_shape = (13, 5)
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state,
                         default_reward=default_reward, finals=finals, obstacles=obstacles,
                         action_space=action_space)

        # Asteroid cells surrounding the start position.
        self.asteroids = {
            (5, 0), (4, 1), (6, 1), (3, 2), (7, 2), (4, 3), (6, 3), (5, 4)
        }

        # Define radiations states (If the agent is on any of these, then receive -100 penalization)
        self.radiations = set()
        self.radiations = self.radiations.union({(1, i) for i in range(5)})
        self.radiations = self.radiations.union({(10, i) for i in range(5)})
        self.radiations = self.radiations.union({(11, i) for i in range(5)})

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (mission_success, radiation), final, extra)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Update previous state
        self.current_state = self.next_state(action=action)

        # If the ship crash with asteroid, the ship is destroyed. else mission success.
        reward[0] = -100 if self.current_state in self.asteroids else self.finals.get(
            self.current_state, self.default_reward[0]
        )

        # If agent is in a radiation position, the penalty is -11, else is default radiation
        reward[1] = -11 if self.current_state in self.radiations else self.default_reward[1]

        # Check if is_final
        final = self.is_final(self.current_state)

        # Set extra
        info = {}

        return self.current_state, reward, final, info

    def next_position(self, action: int, position: tuple) -> (tuple, bool):
        """
        Given an action and a position, return the next position reached.
        Movement wraps around the mesh (cyclic) via the ue.move_* helpers.

        :param action:
        :param position:
        :return: (next position, validity flag — always True here)
        """

        # Get my position
        x, y = position

        # Get observations spaces
        observation_space_x, observation_space_y = self.observation_space.spaces

        # Do movement in cyclic mesh
        if action == self.actions['UP']:
            y = ue.move_up(y=y, limit=observation_space_y.n)
        elif action == self.actions['RIGHT']:
            x = ue.move_right(x=x, limit=observation_space_x.n)
        elif action == self.actions['DOWN']:
            y = ue.move_down(y=y, limit=observation_space_y.n)
        elif action == self.actions['LEFT']:
            x = ue.move_left(x=x, limit=observation_space_x.n)
        elif action == self.actions['UP RIGHT']:
            y = ue.move_up(y=y, limit=observation_space_y.n)
            x = ue.move_right(x=x, limit=observation_space_x.n)
        elif action == self.actions['DOWN RIGHT']:
            y = ue.move_down(y=y, limit=observation_space_y.n)
            x = ue.move_right(x=x, limit=observation_space_x.n)
        elif action == self.actions['DOWN LEFT']:
            y = ue.move_down(y=y, limit=observation_space_y.n)
            x = ue.move_left(x=x, limit=observation_space_x.n)
        elif action == self.actions['UP LEFT']:
            y = ue.move_up(y=y, limit=observation_space_y.n)
            x = ue.move_left(x=x, limit=observation_space_x.n)

        # Set next position
        next_position = x, y

        return next_position, True

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Calc next position with current position and action given, in this environment is 8-neighbors.

        :param state: If a position is given, do action from that position.
        :param action: from action_space
        :return:
        """

        # Get my position
        position = state if state else self.current_state

        next_position, is_valid = self.next_position(action=action, position=position)

        # Fall back to the old position if the new one is invalid.
        if not self.observation_space.contains(next_position) or not is_valid:
            next_position = position

        # Return (x, y) position
        return next_position

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent crash with asteroid or is on final position.

        :param state:
        :return:
        """

        # Check if agent crash with asteroid
        crash = state in self.asteroids

        # Check if agent is in final position
        final = state in self.finals.keys()

        return crash or final

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.
        Only `next_state` affects the result here.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # If the ship crash with asteroid, the ship is destroyed. else mission success.
        reward[0] = -100 if next_state in self.asteroids else self.finals.get(
            next_state, reward[0]
        )

        # If agent is in a radiation position, the penalty is -11, else is default radiation
        reward[1] = -11 if next_state in self.radiations else reward[1]

        return reward
class ResourceGathering(EnvMesh):
    """
    Resource Gathering environment: the agent leaves home, may pick up gold
    and/or a gem, can be attacked by enemies, and is rewarded when it brings
    resources back home. A state is ``(position, (gold, gem))``.
    """

    # Possible actions (4-neighbour movement)
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Hypervolume reference point used by experiments
    hv_reference = Vector((-10, -10, -10))

    def __init__(self, initial_state: tuple = ((2, 4), (0, 0)), default_reward: tuple = (0, 0, 0), seed: int = 0,
                 p_attack: float = 0.1, mesh_shape: tuple = (5, 5), gold_positions: frozenset = frozenset({(2, 0)}),
                 gem_positions: frozenset = frozenset({(4, 1)}), observation_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (enemy_attack, gold, gems)
        :param seed: Seed used for np.random.RandomState method.
        :param p_attack: Probability that a enemy attacks when agent stay in
            an enemy position.
        """

        # Wrap the default reward tuple in the project's Vector type.
        default_reward = Vector(default_reward)

        if observation_space is None:
            # Build the observation space (position(x, y), quantity(gold, gems))
            observation_space = gym.spaces.Tuple(
                (gym.spaces.Tuple((gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))),
                 gym.spaces.Tuple(
                     (gym.spaces.Discrete(2), gym.spaces.Discrete(2)))))

        # Define final states (none: this task is continuing, see is_final)
        finals = frozenset()

        # Super constructor call.
        super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state,
                         default_reward=default_reward, observation_space=observation_space, finals=finals)

        # Positions where there are gold.
        self.gold_positions = gold_positions

        # Positions where there is a gem.
        self.gem_positions = gem_positions

        # States where there are enemies_positions
        self.enemies_positions = {(3, 0), (2, 1)}

        # Probability that an enemy attack succeeds on a warning transition.
        self.p_attack = p_attack

        # Home position. NOTE(review): hard-coded to (2, 4) even if a
        # different initial_state is passed — confirm that is intended.
        self.home_position = (2, 4)

        self.checkpoints_states = self._checkpoints_states()

    def _checkpoints_states(self) -> set:
        """
        Return states where the agent will get favorable reward: at home
        carrying at least one resource.

        :return: {(home, (1, 0)), (home, (0, 1)), (home, (1, 1))}
        """
        return set(
            itertools.product({self.home_position}, {(1, 0), (0, 1), (1, 1)}))

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step.

        Reward layout: index 0 is the enemy-attack penalty, indices 1-2 are
        the delivered (gold, gem) amounts.

        :param action:
        :return: (state, reward, final, info)
        """
        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Extract previous state (needed to detect a risky move)
        previous_state = self.current_state

        # Update previous position (may be stochastic: enemy attack)
        self.current_state = self.next_state(action=action)

        # An attack happened: a risky action ended with the agent sent home.
        if self.warning_action(
                state=previous_state,
                action=action) and self.current_state[0] == self.home_position:
            reward[0] = -1
        # If we reach any checkpoint, pay out the carried (gold, gem) amounts.
        elif self.current_state in self.checkpoints_states:
            reward[1:3] = self.current_state[1]

        # Set extra
        info = {}

        # In this environment always return False
        final = self.is_final()

        return self.current_state, reward, final, info

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Calc next position with current position and action given.
        Default is 4-neighbors (UP, LEFT, DOWN, RIGHT).

        Side effects of the reached position: picking up gold/gems, or a
        probabilistic enemy attack that sends the agent home with nothing
        (consumes one draw from ``self.np_random``).

        :param state: If a position is given, do action from that position.
        :param action: from action_space
        :return: a new state (or old if is invalid action)
        """
        # Unpack complex state (position, objects(gold, gem))
        position, objects = state if state else self.current_state

        # Calc next position
        next_position, is_valid = self.next_position(action=action,
                                                     position=position)

        # If the next_position isn't valid, reset to the previous position
        if not self.observation_space[0].contains(
                next_position) or not is_valid:
            next_position = position

        if next_position in self.gold_positions:
            # Pick up gold
            objects = 1, objects[1]
        elif next_position in self.gem_positions:
            # Pick up gem
            objects = objects[0], 1
        elif next_position in self.enemies_positions and self.p_attack >= self.np_random.uniform(
        ):
            # Attacked: drop everything and go back home. The first line
            # already resets objects from initial_state; the second forces
            # the home position (redundant when initial_state is the default).
            next_position, objects = self.initial_state
            next_position = self.home_position

        return next_position, objects

    def reset(self) -> tuple:
        """
        Reset environment to zero.

        NOTE(review): this also re-seeds the RNG with the initial seed, so
        every episode replays the same random draws — confirm intended.

        :return: the initial state
        """
        # Reset to initial seed
        self.seed(seed=self.initial_seed)

        self.current_state = self.initial_state
        return self.current_state

    def states(self) -> set:
        """
        Return all states from this environment, excluding impossible
        combinations (on a gold cell without gold, on a gem cell without gem,
        or at home carrying resources — those are checkpoint states).

        :return:
        """
        # Unpack spaces
        x_position, y_position = self.observation_space[0]

        # Calc basic states
        basic_states = set(
            (x, y) for x in range(x_position.n)
            for y in range(y_position.n)).difference(self.obstacles)

        # Calc product of basic states with objects
        states = set(
            itertools.product(basic_states, {
                (0, 0), (0, 1), (1, 0), (1, 1)
            })).difference(self.finals).difference(
                set(
                    # Cannot be in gold positions without gold.
                    itertools.product(self.gold_positions, {
                        (0, 0), (0, 1)
                    })).union(
                        # Cannot be in gem positions without gem.
                        itertools.product(self.gem_positions, {
                            (0, 0), (1, 0)
                        })).union(
                            # Cannot be in home position with gem and/or gold.
                            self.checkpoints_states))

        # Return all spaces
        return states

    def warning_action(self, state: tuple, action: int) -> bool:
        """
        Check if that in that state the agent can be attacked, i.e. the pair
        (state, action) moves the agent onto or next to an enemy position.

        :param state:
        :param action:
        :return:
        """
        return ((state[0] == (3, 1) or state[0] == (3, 0)) and action == self.actions['UP']) or \
               (state[0] == (3, 1) and action == self.actions['LEFT']) or \
               (state[0] == (4, 0) and action == self.actions['LEFT']) or \
               (state[0] == (2, 2) and action == self.actions['UP']) or \
               (state[0] == (1, 1) and action == self.actions['RIGHT']) or \
               (state[0] == (2, 0) and action == self.actions['DOWN']) or \
               (state[0] == (2, 0) and action == self.actions['RIGHT'])

    def transition_reward(self, state: tuple, action: int,
                          next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.
        Deterministic counterpart of the reward logic in `step`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """
        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Attacked and sent home: fixed penalty on objective 0.
        if self.warning_action(
                state=state,
                action=action) and next_state[0] == self.home_position:
            reward[:] = -1, 0, 0
        # Checkpoint reached: pay out the carried resources.
        elif next_state in self.checkpoints_states:
            reward[1:3] = next_state[1]

        return reward

    def transition_probability(self, state: tuple, action: int,
                               next_state: tuple) -> float:
        """
        Return probability to reach `next_state` from `state` using `action`.

        Warning (attackable) transitions go home with probability p_attack
        and reach the intended cell otherwise; all other transitions are
        deterministic.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """
        transition_probability = 1.

        if self.warning_action(state=state, action=action):
            transition_probability = self.p_attack if (
                next_state[0] == self.home_position) else 1. - self.p_attack

        return transition_probability

    def reachable_states(self, state: tuple, action: int) -> set:
        """
        Return all reachable states for pair (state, action) given.

        Each warning (state, action) pair has two outcomes: the intended
        neighbour (keeping the carried objects) or home with no resources.

        :param state:
        :param action:
        :return:
        """
        reachable_states = set()

        # If current state is on checkpoints (in home position with any
        # resource) then reset resources
        if state in self.checkpoints_states:
            state = (state[0], (0, 0))

        if (state[0] == (3, 1)
                or state[0] == (3, 0)) and action == self.actions['UP']:
            reachable_states.add(((3, 0), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (3, 1) and action == self.actions['LEFT']:
            reachable_states.add(((2, 1), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (4, 0) and action == self.actions['LEFT']:
            reachable_states.add(((3, 0), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (2, 2) and action == self.actions['UP']:
            reachable_states.add(((2, 1), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (1, 1) and action == self.actions['RIGHT']:
            reachable_states.add(((2, 1), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (2, 0) and action == self.actions['DOWN']:
            reachable_states.add(((2, 1), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        elif state[0] == (2, 0) and action == self.actions['RIGHT']:
            reachable_states.add(((3, 0), state[1]))
            reachable_states.add((self.home_position, (0, 0)))
        else:
            # Non-warning transitions are deterministic. NOTE(review): this
            # calls next_state, which may consume a random draw on enemy
            # positions — confirm that is acceptable for planners.
            reachable_states.add(self.next_state(action=action, state=state))

        # Return all possible states reachable with any action
        return reachable_states

    def is_final(self, state: tuple = None) -> bool:
        """
        Return always false (No episodic task).

        :return:
        """
        return False
def do_iteration(self) -> None: """ Does an iteration (In this case a Sweeps) :return: """ # Increment total sweeps self.total_sweeps += 1 # Do a copy of v2 v2 = self.v.copy() # Removes all items from the dictionary self.v.clear() # For each state available for s in self.environment.states(): # A(state) <- Extract all actions available from position `state` self.environment.current_state = s # Vector of Empty sets t = dict() # Get all actions available actions = self.environment.action_space.copy() # For each a in action_space for a in actions: # Empty set for this a (T(a)) t_a = set() # Get all reachable states for that pair of (state, a) s2_set = self.environment.reachable_states(state=s, action=a) lv = list() for s2 in s2_set: # If this position is unknown return empty set lv.append(v2.get(s2, [Vector(self.initial_q_value)])) # Calc cartesian product of each reachable states cartesian_product = itertools.product(*lv) for product in cartesian_product: summation = self.environment.default_reward.zero_vector for j, s2 in enumerate(s2_set): # Probability to reach that position p = self.environment.transition_probability( state=s, action=a, next_state=s2) # Reward to reach that position r = self.environment.transition_reward(state=s, action=a, next_state=s2) # Get previous value per gamma previous_value = product[j] * self.gamma # Summation summation += (r + previous_value) * p # T(a) <- T(a) U {.....} t_a = t_a.union({summation}) t.update({a: t_a}) # u_t <- U T(a) u_t = set.union(*t.values()) # Remove duplicates and after transform to list u_t = set( map( lambda x: un.round_with_precision(x, Vector. decimal_precision), u_t)) # V(state) <- filter[u_t] self.v.update({s: self.filter_vectors(vectors=u_t)})
class BonusWorld(EnvMesh):
    """
    BonusWorld environment: a 9x9 grid with nine final (treasure) positions,
    obstacle cells, pit cells that send the agent back to the start (clearing
    the bonus), and a bonus cell that doubles both objective rewards once
    visited. A state is a pair ``(position, bonus_activated)``.
    """

    # Possible actions (4-neighbour movement)
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Experiments common hypervolume reference
    hv_reference = Vector([0, 0, -150])

    def __init__(self, initial_state: tuple = ((0, 0), False), default_reward: tuple = (0, 0), seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent
            ((x, y), bonus_activated).
        :param default_reward: (objective 1, objective 2); a -1 time
            component is appended automatically.
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: optional custom action space passed to EnvMesh.
        """

        # List of all treasures and its reward.
        finals = {
            (8, 0): Vector([1, 9]),
            (8, 2): Vector([3, 9]),
            (8, 4): Vector([5, 9]),
            (8, 6): Vector([7, 9]),
            (8, 8): Vector([9, 9]),

            (0, 8): Vector([9, 1]),
            (2, 8): Vector([9, 3]),
            (4, 8): Vector([9, 5]),
            (6, 8): Vector([9, 7]),
        }

        # Define mesh shape
        mesh_shape = (9, 9)

        # Set obstacles
        obstacles = frozenset({(2, 2), (2, 3), (3, 2)})

        # Default reward plus time (objective 1, objective 2, time)
        default_reward += (-1,)
        default_reward = Vector(default_reward)

        # Build the observation space (position (x, y), bonus)
        observation_space = gym.spaces.Tuple(
            (
                gym.spaces.Tuple(
                    (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))
                ),
                spaces.Boolean()
            )
        )

        super().__init__(mesh_shape=mesh_shape, default_reward=default_reward, initial_state=initial_state,
                         finals=finals, obstacles=obstacles, observation_space=observation_space, seed=seed,
                         action_space=action_space)

        # Pit marks which return the agent to the start location.
        self.pits = {
            (7, 1), (7, 3), (7, 5), (1, 7), (3, 7), (5, 7)
        }

        # X2 bonus positions
        self.bonus = [
            (3, 3)
        ]

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step.

        :param action:
        :return: (state, (objective 1, objective 2, time), final, extra)
        """

        # Initialize reward as vector (includes the -1 time component)
        reward = self.default_reward.copy()

        # Unpack next position for reward
        position, bonus = self.next_state(action=action)

        # Get treasure value (or the default objectives when not on a final)
        reward[0], reward[1] = self.finals.get(position, (self.default_reward[0], self.default_reward[1]))

        # If the bonus is activated, double the objective rewards.
        if bonus:
            reward[0] *= 2
            reward[1] *= 2

        # Set extra
        info = {}

        # Update current position
        self.current_state = position, bonus

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Given a state and an action, return the next state.

        :param action:
        :param state: complex state (position, bonus); defaults to the
            current state when omitted.
        :return: (next_position, bonus_activated)
        """

        # Unpack complex position (position, bonus_activated)
        position, bonus = state if state else self.current_state

        # Calc next position
        next_position, is_valid = self.next_position(action=action, position=position)

        # If the next_position isn't valid, reset to the previous position
        if not self.observation_space[0].contains(next_position) or not is_valid:
            next_position = position

        # If agent is in a pit, it is returned to the initial position and
        # the bonus is deactivated.
        if next_position in self.pits:
            next_position, bonus = self.initial_state
            bonus = False
        # Check if the agent has activated the bonus
        elif next_position in self.bonus:
            bonus = True

        # Build next state
        return next_position, bonus

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent is on final position.

        :param state: complex state (position, bonus); defaults to the
            current state when omitted (previously a missing argument raised
            TypeError because None was subscripted directly).
        :return:
        """
        # Fall back to the current state, consistent with next_state().
        state = state if state else self.current_state

        return state[0] in self.finals.keys()

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Given a state, an action and a next state, return the corresponding
        reward (mirrors the reward logic of `step`).

        :param state:
        :param action:
        :param next_state:
        :return:
        """

        # Separate position from bonus_activated
        position, bonus_activated = next_state

        # Default reward
        reward = self.default_reward.copy()

        # Get treasure value
        reward[0], reward[1] = self.finals.get(position, (reward[0], reward[1]))

        # If the bonus is activated, double the objective rewards.
        if bonus_activated:
            reward[0] *= 2
            reward[1] *= 2

        return reward

    def states(self) -> set:
        """
        Return a set with all states of this environment, excluding
        obstacles, finals, pits, and the impossible state of standing on the
        bonus cell with the bonus not activated.

        :return:
        """

        # Unpack spaces
        position, bonus_activate = self.observation_space.spaces
        x_position, y_position = position.spaces

        # Get all positions
        all_positions = {(x, y) for x in range(x_position.n) for y in range(y_position.n)}

        # Get obstacles, finals positions and pits
        finals_obstacles_and_pits = self.obstacles.union(set(self.finals.keys())).union(self.pits)

        # Generate available states
        available_states = set(product(all_positions - finals_obstacles_and_pits, {True, False}))

        # Remove impossible states (standing on the bonus deactivated it off)
        available_states = available_states - {
            ((3, 3), False)
        }

        # Return all available spaces
        return available_states
class PyramidMDP(EnvMesh):
    """
    Pyramid MDP environment: a triangular grid whose anti-diagonal cells are
    final states, with stochastic transitions controlled by `n_transition`.
    """

    # Possible actions (4-neighbour movement)
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal policy vector-values.
    # NOTE(review): declared as a list but reassigned to a set (and shared
    # across all instances) in __init__ below.
    pareto_optimal = []

    # Experiments common hypervolume reference
    hv_reference = Vector((-20, -20))

    def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (-1, -1), seed: int = 0,
                 n_transition: float = 0.95, diagonals: int = 9, action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.
        :param n_transition: if is 1, always do the action indicated.
            (Original is about 0.6)
        :param diagonals: Number of diagonals to be used to build this
            environment (allows experimenting with an identical environment,
            but considering only the first k diagonals) (By default 9 - all).
        """

        # Mesh is (diagonals + 1) square, clamped to [1, 10]: 10x10 is
        # the original full-size environment.
        mesh_shape = (min(max(diagonals + 1, 1), 10), min(max(diagonals + 1, 1), 10))

        # Cells on the anti-diagonal: (0, d), (1, d-1), ..., (d, 0).
        diagonals_states = {
            x for x in zip(range(0, diagonals + 1, 1), range(diagonals, -1, -1))
        }

        # Generate finals states with its reward: ((x + 1) * 10, (y + 1) * 10)
        finals = {
            state: (Vector(state) + 1) * 10
            for state in diagonals_states
        }

        # Pareto optimal vectors (class-level, shared by all instances)
        PyramidMDP.pareto_optimal = {
            Vector(state) + 1
            for state in diagonals_states
        }

        # Obstacles: every non-final cell at or beyond each final's diagonal.
        # NOTE(review): the inner `for y in range(y, ...)` deliberately
        # shadows the unpacked y — confusing, but it walks y from the
        # final's row outward.
        obstacles = frozenset((x, y) for x, y in finals.keys()
                              for y in range(y, diagonals + 1)
                              if (x, y) not in finals)

        # Default reward (objective_1, objective_2)
        default_reward = Vector(default_reward)

        # Transition probability must be a valid probability.
        assert 0 <= n_transition <= 1.

        self.n_transition = n_transition

        super().__init__(mesh_shape=mesh_shape, initial_state=initial_state, default_reward=default_reward,
                         finals=finals, obstacles=obstacles, seed=seed, action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step.

        :param action:
        :return: (position, (time_inverted, treasure_value), final, extra)
        """

        # Get probability action (may replace the requested action randomly)
        action = self.__probability_action(action=action)

        # Initialize rewards as vector
        reward = self.default_reward.copy()

        # Update current position
        self.current_state = self.next_state(action=action)

        # Get treasure value (default reward when not on a final state)
        reward = self.finals.get(self.current_state, reward)

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def __probability_action(self, action: int) -> int:
        """
        Decide probability action after apply probabilistic p_stochastic:
        with probability (1 - n_transition), replace the requested action by
        a uniformly sampled one (which may equal the requested action).

        :param action:
        :return:
        """

        # Get a random uniform number [0., 1.]
        random = self.np_random.uniform()

        # If random is greater than self.n_transition, get a random action
        if random > self.n_transition:
            action = self.action_space.sample()

        return action

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `position` using `action`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """

        # Default reward, overridden by the final-state value if applicable.
        return self.finals.get(next_state, self.default_reward.copy())

    def transition_probability(self, state: tuple, action: int, next_state: tuple) -> float:
        """
        Return probability to reach `next_state` from `position` using
        `action`.

        NOTE(review): the desired transition gets n_transition and every
        other gets (1 - n_transition) / |A|; with __probability_action's
        semantics (resampling may pick the desired action again) these do
        not sum exactly to 1 — confirm whether this approximation is
        intended.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return:
        """

        # Probability of the intended outcome.
        desired_probability = self.n_transition

        # NOTE(review): the RIGHT branch passes `next_position=` while the
        # others pass `next_state=` — verify against the `ue` helper
        # signatures.
        desired_transition = (
            (action == self.actions['UP'] and ue.is_on_up_or_same_position(
                state=state, next_state=next_state)) or
            (action == self.actions['RIGHT'] and ue.is_on_right_or_same_position(
                state=state, next_position=next_state)) or
            (action == self.actions['DOWN'] and ue.is_on_down_or_same_position(
                state=state, next_state=next_state)) or
            (action == self.actions['LEFT'] and ue.is_on_left_or_same_position(
                state=state, next_state=next_state)))

        if not desired_transition:
            desired_probability = (1. - self.n_transition) / self.action_space.n

        return desired_probability

    def reachable_states(self, state: tuple, action: int) -> set:
        """
        Return all reachable states for pair (state, a) given: since any
        action may be randomly substituted, every neighbour is reachable.

        NOTE(review): this mutates self.current_state as a side effect.

        :param state:
        :param action:
        :return:
        """

        # Set current state with state indicated
        self.current_state = state

        # Get all actions available
        actions = self.action_space.copy()

        # Return all possible states reachable with any action
        return {self.next_state(action=a, state=state) for a in actions}