Example #1
    def __init__(self,
                 initial_state: tuple = (0, 0),
                 default_reward: tuple = (-1, -1),
                 seed: int = 0,
                 n_transition: float = 0.95,
                 diagonals: int = 9,
                 action_space: gym.spaces = None):
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.
        :param n_transition: If it is 1, the indicated action is always performed. (The original value is about 0.6.)
        :param diagonals: Number of diagonals used to build this environment (allows experimenting with an
                        identical environment, but considering only the first k diagonals) (By default 9 - all).

        # the original full-size environment.
        mesh_shape = (min(max(diagonals + 1, 1),
                          10), min(max(diagonals + 1, 1), 10))

        # Dictionary with final states as keys, and treasure amounts as values.
        diagonals_states = {
            for x in zip(range(0, diagonals + 1, 1), range(diagonals, -1, -1))

        # Generate finals states with its reward
        finals = {
            state: (Vector(state) + 1) * 10
            for state in diagonals_states

        # Pareto optimal
        PyramidMDP.pareto_optimal = {
            Vector(state) + 1
            for state in diagonals_states

        # Filter obstacles states
        obstacles = frozenset((x, y) for x, y in finals.keys()
                              for y in range(y, diagonals + 1)
                              if (x, y) not in finals)

        # Default reward (objective_1, objective_2)
        default_reward = Vector(default_reward)

        # Transition probability
        assert 0 <= n_transition <= 1.
        self.n_transition = n_transition

Example #2
    def __init__(self, initial_state: tuple = ((0, 0), False), default_reward: tuple = (0, 0), seed: int = 0,
                 action_space: gym.spaces = None):
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.
        :param action_space:

        # List of all treasures and its reward.
        finals = {
            (8, 0): Vector([1, 9]),
            (8, 2): Vector([3, 9]),
            (8, 4): Vector([5, 9]),
            (8, 6): Vector([7, 9]),
            (8, 8): Vector([9, 9]),

            (0, 8): Vector([9, 1]),
            (2, 8): Vector([9, 3]),
            (4, 8): Vector([9, 5]),
            (6, 8): Vector([9, 7]),

        # Define mesh shape
        mesh_shape = (9, 9)

        # Set obstacles
        obstacles = frozenset({(2, 2), (2, 3), (3, 2)})

        # Default reward plus time (objective 1, objective 2, time)
        default_reward += (-1,)
        default_reward = Vector(default_reward)

        # Build the observation space (position (x, y), bonus)
        observation_space = gym.spaces.Tuple(
                    (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))

        super().__init__(mesh_shape=mesh_shape, default_reward=default_reward, initial_state=initial_state,
                         finals=finals, obstacles=obstacles, observation_space=observation_space, seed=seed,

        # Pits marks which returns the agent to the start location.
        self.pits = {
            (7, 1), (7, 3), (7, 5), (1, 7), (3, 7), (5, 7)

        # X2 bonus
        self.bonus = [
            (3, 3)
Example #3
    def evaluate_bounds_are_of_type(self, bounds, type_id):
        """
        Accumulate ``evaluate_tile_type`` over the border tiles of ``bounds``.

        The rectangle is read as half-open, matching the ``range`` calls below:
        columns ``x_min .. x_max - 1`` and rows ``y_min .. y_max - 1``. Every
        border tile is visited exactly once: the horizontal edges include the
        corners, the vertical edges only cover the rows strictly in between.

        :param bounds: object exposing ``x_min``, ``x_max``, ``y_min``, ``y_max``.
        :param type_id: forwarded unchanged to ``evaluate_tile_type``.
        :return: sum of the ``evaluate_tile_type`` results for the border tiles.
        """
        total = 0
        position = Vector(0, 0)

        top = bounds.y_max - 1
        right = bounds.x_max - 1

        # When the rectangle is one tile high (or wide) the two opposite edges
        # coincide; list the shared edge once so its tiles are not counted twice.
        rows = (bounds.y_min,) if top == bounds.y_min else (bounds.y_min, top)
        columns = (bounds.x_min,) if right == bounds.x_min else (bounds.x_min, right)

        # Bottom, then top horizontal bound (corners included)
        for y in rows:
            position.y = y
            for x in range(bounds.x_min, bounds.x_max):
                position.x = x
                total += self.evaluate_tile_type(position, type_id)

        # Left, then right vertical bound (corners excluded, they belong to
        # the horizontal bounds above)
        for x in columns:
            position.x = x
            for y in range(bounds.y_min + 1, top):
                position.y = y
                total += self.evaluate_tile_type(position, type_id)

        return total
Example #4
    def __init__(self,
                 initial_state: tuple = ((0, 0), False),
                 default_reward: tuple = (0, 0),
                 seed: int = 0):
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.

        # Create a bag action space
        action_space = Bag([])


        # Set obstacles
        self.obstacles = frozenset({(2, 2)})

        # PITS are finals states in this variant
        self.finals.update({state: Vector([-50, -50]) for state in self.pits})

        self.pits = list()
Example #5
    def __init__(self, initial_state: tuple = (5, 2), default_reward: tuple = (0, -1), seed: int = 0,
                 action_space: gym.spaces = None):
        """
        Build a 13 x 5 mesh with its final states, asteroid states and radiation states.

        :param initial_state: Initial state where the agent starts.
        :param default_reward: (mission_success, radiation)
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: Forwarded unchanged to the parent constructor.
        """

        # Final states and their reward, grouped by column.
        finals = {
            **{(0, y): 20 for y in range(5)},
            **{(9, y): 10 for y in range(3)},
            **{(12, y): 30 for y in range(5)},
        }

        obstacles = frozenset()
        mesh_shape = (13, 5)
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state, default_reward=default_reward,
                         finals=finals, obstacles=obstacles, action_space=action_space)

        # Asteroid states
        self.asteroids = {
            (5, 0), (4, 1), (6, 1), (3, 2), (7, 2), (4, 3), (6, 3), (5, 4)
        }

        # Radiation states: every row of columns 1, 10 and 11.
        # NOTE(review): the previous comment quoted a -100 penalization for these states; the
        # penalty itself is not applied in this constructor - confirm the value where rewards are computed.
        self.radiations = {(x, y) for x in (1, 10, 11) for y in range(5)}
Example #6
    def test_transition_reward(self):

        # In this environment the initial state does not matter for the reward
        state = self.environment.observation_space.sample()

        # The action does not matter either.
        action = self.environment.action_space.sample()

        # Asteroids states
        for asteroid_state in self.environment.asteroids:
                Vector((-100, -1)),

        # Radiations states
        for radiation_state in self.environment.radiations:
                Vector((0, -11)),

        # Finals states
        for final_state, final_reward in self.environment.finals.items():
                Vector((final_reward, -1)),

        simple_states = self.environment.states() - set(

        for simple_state in simple_states:
                Vector((0, -1)),
Example #7
    def setUp(self):
        # An observation space
        observation_space = gym.spaces.Discrete(7)

        # Default reward
        default_reward = Vector([1, 2, 1])

        # Set initial_seed to 0 to testing.
        self.environment = Environment(observation_space=observation_space,
Example #8
    def evaluate_tiles_in_bounds_are_of_type(self, bounds, type_id):
        """
        Accumulate ``evaluate_tile_type`` over every tile inside ``bounds``.

        Columns ``x_min .. x_max - 1`` and rows ``y_min .. y_max - 1`` are
        visited column by column; a single ``Vector`` is reused as the cursor.

        :param bounds: object exposing ``x_min``, ``x_max``, ``y_min``, ``y_max``.
        :param type_id: forwarded unchanged to ``evaluate_tile_type``.
        :return: sum of the ``evaluate_tile_type`` results.
        """
        cursor = Vector(0, 0)
        total = 0

        for column in range(bounds.x_min, bounds.x_max):
            cursor.x = column
            for row in range(bounds.y_min, bounds.y_max):
                cursor.y = row
                total = total + self.evaluate_tile_type(cursor, type_id)

        return total
Example #9
    def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0,), seed: int = 0, columns: int = 0,
                 action_space: gym.spaces = None):
        """
        Build the mesh, keeping only the first ``columns`` columns of the original 10 x 11 environment.

        :param initial_state: Initial state where the agent starts.
        :param default_reward: (treasure_value, )
        :param seed: Seed used for np.random.RandomState method.
        :param columns: Number of columns to keep; values outside 1..10 select the full environment.
        :param action_space: Forwarded unchanged to the parent constructor.
        """

        original_mesh_shape = (10, 11)
        total_columns, total_rows = original_mesh_shape

        # Out-of-range requests fall back to the full-size environment
        if columns < 1 or columns > total_columns:
            columns = total_columns

        # All treasures (final states) and their reward.
        all_finals = {
            (0, 1): 5,
            (1, 2): 80,
            (2, 3): 120,
            (3, 4): 140,
            (4, 4): 145,
            (5, 4): 150,
            (6, 7): 163,
            (7, 7): 166,
            (8, 9): 173,
            (9, 10): 175,
        }

        # Keep only the final states inside the selected columns
        finals = {state: treasure for state, treasure in all_finals.items() if state[0] < columns}

        # For each column, the first row that is an obstacle; every row from there
        # down to the last row of the mesh is an obstacle too.
        first_obstacle_row = {0: 2, 1: 3, 2: 4, 3: 5, 4: 5, 5: 5, 6: 8, 7: 8, 8: 10}

        # Keep only the obstacles inside the selected columns
        obstacles = frozenset(
            (x, y)
            for x, first_row in first_obstacle_row.items() if x < columns
            for y in range(first_row, total_rows)
        )

        # Resize mesh_shape
        mesh_shape = (columns, total_rows)

        # Default reward plus time and pressure (time_inverted, treasure_value, water_pressure)
        default_reward = (-1,) + tuple(default_reward) + (0,)
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward, initial_state=initial_state,
                         finals=finals, obstacles=obstacles, action_space=action_space)
Example #10
    def test_transition_reward(self):

        # In this environment the initial state does not matter for the reward
        state = self.environment.observation_space.sample()

        # The action does not matter either.
        action = self.environment.action_space.sample()

        # An intermediate state
                                               next_state=(1, 1)),
            Vector((-1, 0, -2)))

        # A final state
                                               next_state=(1, 2)),
            Vector([-1, 80, -3]))
Example #11
    def is_room(self, start_position, area_bounds):
        """
        Check whether a rectangle outlined by wall tiles starts at ``start_position``.

        Starting one tile past ``start_position``, walls are followed along y and
        then along x until the first non-wall tile (or the area limit) is met; the
        last wall tile on each axis gives the opposite corner of the candidate.

        :param start_position: position (``x``, ``y`` attributes) of the first corner.
        :param area_bounds: object exposing ``x_max`` and ``y_max``, the scan limits.
        :return: ``(is_room, next_position, room_bounds)`` where ``next_position`` is
                 the y value just past the vertical wall run and ``room_bounds`` is a
                 ``Bounds`` on success or ``None`` otherwise.
        """
        found_floor = False
        current_position = Vector(start_position.x, start_position.y + 1)

        # Follow the wall along y: advance while on wall tiles, and on the first
        # non-wall tile step back onto the last wall tile and stop.
        while current_position.y <= area_bounds.y_max and not found_floor:
            if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
                found_floor = True
                current_position.y -= 1
            else:
                current_position.y += 1
        room_bounds_y = current_position.y
        next_position = room_bounds_y + 1

        # No end of the wall run inside the area, or a run of length zero
        if not found_floor or room_bounds_y == start_position.y:
            return False, next_position, None

        # Follow the wall along x from the far end of the vertical run
        found_floor = False
        current_position.x += 1

        while current_position.x <= area_bounds.x_max and not found_floor:
            if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
                found_floor = True
                current_position.x -= 1
            else:
                current_position.x += 1

        room_bounds_x = current_position.x
        if not found_floor or room_bounds_x == start_position.x:
            return False, next_position, None

        # The vertical edge at x = room_bounds_x must be made of walls only
        for y in range(start_position.y, room_bounds_y + 1):
            current_position.y = y
            if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
                return False, next_position, None

        # The horizontal edge at y = start_position.y must be made of walls only
        current_position.y = start_position.y
        for x in range(start_position.x, room_bounds_x + 1):
            current_position.x = x
            if not self.is_tile_of_type(current_position, TILE_TYPES["WALL"]):
                return False, next_position, None

        # Two adjacent wall lines leave no interior
        if abs(start_position.x - room_bounds_x) == 1 or \
           abs(start_position.y - room_bounds_y) == 1:
            return False, next_position, None

        room_bounds = Bounds(start_position.x, start_position.y, room_bounds_x,
                             room_bounds_y)
        return True, next_position, room_bounds
Example #12
    def setUp(self):
        # Mesh shape
        mesh_shape = (7, 7)

        # Default reward
        default_reward = Vector([1, 2, 1])

        # Obstacles
        obstacles = frozenset({
            (0, 0), (1, 1)

        # Set initial_seed to 0 to testing.
        self.environment = EnvMesh(mesh_shape=mesh_shape, default_reward=default_reward, seed=0, obstacles=obstacles)
Example #13
    def __init__(self,
                 initial_state: tuple = ((2, 4), (0, 0)),
                 default_reward: tuple = (0, 0, 0),
                 seed: int = 0,
                 p_attack: float = 0.1,
                 mesh_shape: tuple = (5, 5),
                 gold_positions: frozenset = frozenset({(2, 0)}),
                 gem_positions: frozenset = frozenset({(4, 1)}),
                 observation_space: gym.spaces = None):
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (enemy_attack, gold, gems)
        :param seed: Seed used for np.random.RandomState method.
        :param p_attack: Probability that an enemy attacks when the agent is in an enemy position.

        default_reward = Vector(default_reward)

        if observation_space is None:
            # Build the observation space (position(x, y), quantity(gold, gems))
            observation_space = gym.spaces.Tuple(
                     (gym.spaces.Discrete(2), gym.spaces.Discrete(2)))))

        # Define final states
        finals = frozenset()

        # Super constructor call.

        # Positions where there are gold.
        self.gold_positions = gold_positions

        # Positions where there is a gem.
        self.gem_positions = gem_positions

        # States where there are enemies_positions
        self.enemies_positions = {(3, 0), (2, 1)}
        self.p_attack = p_attack
        self.home_position = (2, 4)

        self.checkpoints_states = self._checkpoints_states()
Example #14
    def find_rooms(self, bounds):
        # Scan the tiles inside `bounds` (columns x_min .. x_max - 1, rows
        # y_min .. y_max - 1) and call `is_room` on every wall tile found.
        # The resulting list is stored in `self.rooms` and also returned.
        room_areas = list()

        # Single position object, updated in place while scanning
        position = Vector(0, 0)
        # Evaluate the area for rooms
        next_position = 0
        for x in range(bounds.x_min, bounds.x_max):
            position.x = x
            for y in range(bounds.y_min, bounds.y_max):
                position.y = y
                if self.is_tile_of_type(position, TILE_TYPES["WALL"]):
                    # Found wall tile
                    result, next_position, room_bounds = self.is_room(
                        position, bounds)
                    # NOTE(review): no statement is nested under `if result:` in this
                    # view; presumably `room_bounds` is collected into `room_areas`
                    # here - confirm against the complete file.
                    if result:
        self.rooms = room_areas
        return room_areas
Example #15
    config = configparser.ConfigParser()

    # TODO: add errors handling
    # TODO: move all to the new class

    start_point = Point(
        float(config['START POINT']['x']),
        float(config['START POINT']['y']),
        float(config['START POINT']['z']),

    dimensions = Vector(
        float(config['SCENE DIMENSIONS']['dx']),
        float(config['SCENE DIMENSIONS']['dy']),
        float(config['SCENE DIMENSIONS']['dz']),

    stl_file = config['OTHERS']['STL file path']
    condition = float(config['OTHERS']['minimum volume'])
    result_file_path = config['OTHERS']['result file path']

    stl = STL(stl_file)

    print('> Generate octree...')
    root = Node(start_point, dimensions)
    get_grid(root, condition=condition, object=stl)

    ### NP.ARRAY ###
    # arr = array([], dtype=float)
Example #16
    def __init__(self,
                 seed: int = 0,
                 initial_state: int = 0,
                 default_reward: tuple = (0, 0)):
        :param seed: Initial seed. The same seed is used for _action_space,
                     observation_space, and the random number generator.
        :param initial_state: Start position for all episodes.
        :param default_reward: Default reward returned by the environment when
                               a reward is not defined (objective 1, objective 2).

        # Create the observation space
        observation_space = gym.spaces.Discrete(7)

        # Default reward
        default_reward = Vector(default_reward)

        # Super call constructor

        # Rewards dictionary
        self.rewards_dictionary = {
            0: {
                self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
                self.actions['CLOCKWISE']: Vector([-1, 3])
            1: {
                self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
                self.actions['CLOCKWISE']: Vector([-1, 0])
            2: {
                self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
                self.actions['CLOCKWISE']: Vector([-1, 0])
            3: {
                self.actions['COUNTER_CLOCKWISE']: Vector([3, -1]),
                self.actions['CLOCKWISE']: Vector([-1, 0])
            4: {
                self.actions['CLOCKWISE']: Vector([-1, 3]),
                self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])
            5: {
                self.actions['CLOCKWISE']: Vector([-1, 3]),
                self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])
            6: {
                self.actions['CLOCKWISE']: Vector([-1, 3]),
                self.actions['COUNTER_CLOCKWISE']: Vector([0, -1])

        # Possible p_stochastic from a position to another
        self.possible_transitions = {
            0: {
                self.actions['COUNTER_CLOCKWISE']: 1,
                self.actions['CLOCKWISE']: 4
            1: {
                self.actions['COUNTER_CLOCKWISE']: 2,
                self.actions['CLOCKWISE']: 0
            2: {
                self.actions['COUNTER_CLOCKWISE']: 3,
                self.actions['CLOCKWISE']: 1
            3: {
                self.actions['COUNTER_CLOCKWISE']: 0,
                self.actions['CLOCKWISE']: 2
            4: {
                self.actions['CLOCKWISE']: 5,
                self.actions['COUNTER_CLOCKWISE']: 0
            5: {
                self.actions['CLOCKWISE']: 6,
                self.actions['COUNTER_CLOCKWISE']: 4
            6: {
                self.actions['CLOCKWISE']: 0,
                self.actions['COUNTER_CLOCKWISE']: 5
Example #17
    def __init__(self,
                 initial_state: tuple = (0, 0),
                 default_reward: tuple = (0, ),
                 columns: int = 10,
                 seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (treasure_value, ); the time component is prepended here.
        :param columns: Number of columns to be used to build this environment (allows experimenting with an identical
                        environment, but considering only the first k columns) (By default 10 - all).
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: Specific action space
        """

        # The original full-size environment.
        original_mesh_shape = (10, 11)
        total_columns, total_rows = original_mesh_shape

        # Out-of-range requests fall back to the full-size environment
        if columns < 1 or columns > total_columns:
            columns = total_columns

        # Dictionary with final states as keys, and treasure amounts as values.
        all_finals = {
            (0, 1): 1,
            (1, 2): 2,
            (2, 3): 3,
            (3, 4): 5,
            (4, 4): 8,
            (5, 4): 16,
            (6, 7): 24,
            (7, 7): 50,
            (8, 9): 74,
            (9, 10): 124,
        }

        # Keep only the final states inside the selected columns
        finals = {state: treasure for state, treasure in all_finals.items() if state[0] < columns}

        # For each column, the first row that is an obstacle; every row from there
        # down to the last row of the mesh is an obstacle too.
        first_obstacle_row = {0: 2, 1: 3, 2: 4, 3: 5, 4: 5, 5: 5, 6: 8, 7: 8, 8: 10}

        # Keep only the obstacles inside the selected columns
        obstacles = frozenset(
            (x, y)
            for x, first_row in first_obstacle_row.items() if x < columns
            for y in range(first_row, total_rows)
        )

        # Subspace of the environment to be considered
        mesh_shape = (columns, total_rows)

        # Default reward plus time (time_inverted, treasure_value)
        default_reward = (-1, ) + tuple(default_reward)
        default_reward = Vector(default_reward)

        # NOTE(review): keyword set assumed to match the parent mesh-environment
        # constructor - confirm against its signature.
        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward,
                         initial_state=initial_state, finals=finals, obstacles=obstacles,
                         action_space=action_space)

Example #18
class DeepSeaTreasure(EnvMesh):
    """
    Mesh environment of up to 10 columns by 11 rows whose final states hold treasures.

    Rewards are vectors (time_inverted, treasure_value): every step costs -1 on the
    first component and reaching a final state yields its treasure on the second.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal policy vector-values
    pareto_optimal = [
        Vector([-1, 1]),
        Vector([-3, 2]),
        Vector([-5, 3]),
        Vector([-7, 5]),
        Vector([-8, 8]),
        Vector([-9, 16]),
        Vector([-13, 24]),
        Vector([-14, 50]),
        Vector([-17, 74]),
        Vector([-19, 124])
    ]

    # Experiments common hypervolume reference
    hv_reference = Vector((-25, 0))

    def __init__(self,
                 initial_state: tuple = (0, 0),
                 default_reward: tuple = (0, ),
                 columns: int = 10,
                 seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where the agent starts.
        :param default_reward: (treasure_value, ); the time component is prepended here.
        :param columns: Number of columns to be used to build this environment (allows experimenting with an identical
                        environment, but considering only the first k columns) (By default 10 - all).
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: Specific action space
        """

        # The original full-size environment.
        original_mesh_shape = (10, 11)
        total_columns, total_rows = original_mesh_shape

        # Out-of-range requests fall back to the full-size environment
        if columns < 1 or columns > total_columns:
            columns = total_columns

        # Dictionary with final states as keys, and treasure amounts as values.
        all_finals = {
            (0, 1): 1,
            (1, 2): 2,
            (2, 3): 3,
            (3, 4): 5,
            (4, 4): 8,
            (5, 4): 16,
            (6, 7): 24,
            (7, 7): 50,
            (8, 9): 74,
            (9, 10): 124,
        }

        # Keep only the final states inside the selected columns
        finals = {state: treasure for state, treasure in all_finals.items() if state[0] < columns}

        # For each column, the first row that is an obstacle; every row from there
        # down to the last row of the mesh is an obstacle too.
        first_obstacle_row = {0: 2, 1: 3, 2: 4, 3: 5, 4: 5, 5: 5, 6: 8, 7: 8, 8: 10}

        # Keep only the obstacles inside the selected columns
        obstacles = frozenset(
            (x, y)
            for x, first_row in first_obstacle_row.items() if x < columns
            for y in range(first_row, total_rows)
        )

        # Subspace of the environment to be considered
        mesh_shape = (columns, total_rows)

        # Default reward plus time (time_inverted, treasure_value)
        default_reward = (-1, ) + tuple(default_reward)
        default_reward = Vector(default_reward)

        # NOTE(review): keyword set assumed to match EnvMesh.__init__ - confirm
        # against its signature.
        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward,
                         initial_state=initial_state, finals=finals, obstacles=obstacles,
                         action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (time_inverted, treasure_value), final, extra)
        """

        # Initialize rewards as vector
        reward = self.default_reward.copy()

        # Update current position
        self.current_state = self.next_state(action=action)

        # Get treasure value (the default treasure value when the state is not final)
        reward[1] = self.finals.get(self.current_state, self.default_reward[1])

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def transition_reward(self, state: tuple, action: int,
                          next_state: tuple) -> Vector:
        """
        Given a state, an action and a next state, return the corresponding reward.
        Only `next_state` is used to compute the reward.
        :param state:
        :param action:
        :param next_state:
        :return: (time_inverted, treasure_value)
        """

        # Default reward
        reward = self.default_reward.copy()

        # Get treasure reward (the default treasure value when `next_state` is not final)
        reward[1] = self.finals.get(next_state, self.default_reward[1])

        return reward
Example #19
# Shared pygame objects: a clock and a size-25 font built with SysFont(None, ...)
clock = pygame.time.Clock()
font = pygame.font.SysFont(None, 25)

# RGB colour tuples
WHITE = (255, 255, 255)
BLACK = (0, 0, 0)

RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)

# Window size in pixels, passed to set_mode below
WIDTH = 1000
HEIGHT = 800
# Zero vector; presumably the acceleration applied to the balls - confirm where it is used
GRAVITY = Vector(0, 0)

# Main display surface (creating it opens the window as a module-level side effect)
GAME_DISPLAY = pygame.display.set_mode((WIDTH, HEIGHT))

# Mutable module-level state; NOTE(review): the meaning of the `modify_*` flags
# is not visible in this part of the file
mouse_pos = None
modify_type = None
modify_up = False
modify_down = False
balls = []

def update_balls(balls):
    updated_balls = []
Example #20
class MoPuddleWorld(EnvMesh):
    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Experiments common hypervolume reference
    hv_reference = Vector([-50, -150])

    def __init__(self,
                 default_reward: tuple = (10, 0),
                 penalize_non_goal: float = -1,
                 seed: int = 0,
                 final_state: tuple = (19, 0),
                 action_space: gym.spaces = None):
        :param default_reward: (non_goal_reached, puddle_penalize)
        :param penalize_non_goal: Penalization received on every step in which the agent has not reached the final
                                  position.
        :param seed: Initial seed. The same seed is used for _action_space,
                     observation_space, and the random number generator.
        :param final_state: This environment has a single final position.

        self.final_state = final_state
        mesh_shape = (20, 20)
        default_reward = VectorDecimal(default_reward)


        self.puddles = frozenset()
        self.puddles = self.puddles.union([(x, y) for x in range(0, 11)
                                           for y in range(3, 7)])
        self.puddles = self.puddles.union([(x, y) for x in range(6, 10)
                                           for y in range(2, 14)])
        self.penalize_non_goal = penalize_non_goal

        self.current_state = self.reset()

        # Get free spaces
        self.free_spaces = set(self.states() - self.puddles)

    def step(self, action: int) -> (tuple, VectorDecimal, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (non_goal_reached, puddle_penalize), final, extra)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Move to the position reached with this action
        self.current_state = self.next_state(action=action)

        # Is the agent on the final position?
        final = self.is_final(self.current_state)

        # While the goal is not reached, the first objective is the non-goal penalization
        if not final:
            reward[0] = self.penalize_non_goal

        # If the current position is in a puddle
        if self.current_state in self.puddles:
            # Set penalization per distance
            reward[1] = self.calc_puddle_penalization(state=self.current_state)

        # Set extra
        info = {}

        return self.current_state, reward, final, info

    def calc_puddle_penalization(self, state: tuple) -> float:
        """
        Return the penalization for `state`: the negated Manhattan distance between
        `state` and the nearest state in `self.free_spaces`.

        The distance is measured from the `state` argument, so the method can be
        used for any position and not only for the agent's current one.

        :param state: position to evaluate.
        :return: minus the lowest distance found.
        """
        # Min distance found!
        min_distance = min(
            cityblock(state, free_space) for free_space in self.free_spaces)

        # Set penalization per distance
        return -min_distance

    def reset(self) -> tuple:
        """
        Set the current state to a random non-goal position and return it.
        :return: the new current state.
        """

        # NOTE(review): a "reset to initial seed" step was announced here, but no
        # statement performing it is present - confirm whether the random
        # generator must be reseeded before sampling.

        # Sample until the position differs from the final one
        while True:
            random_space = self.observation_space.sample()
            if random_space != self.final_state:
                break

        self.current_state = random_space
        return self.current_state

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent is on final position
        :param state: position to check.
        :return: True when `state` equals `self.final_state`.
        """
        return state == self.final_state

    def transition_reward(self, state: tuple, action: int,
                          next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.
        Only `next_state` is used to compute the reward.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: (non_goal_reached, puddle_penalize)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # While the goal is not reached, the first objective is the non-goal penalization
        if not self.is_final(next_state):
            reward[0] = self.penalize_non_goal

        # If the next position is in a puddle
        if next_state in self.puddles:
            # Min distance found! (the loop variable has its own name so it
            # does not shadow the `state` parameter)
            min_distance = min(
                cityblock(next_state, free_space) for free_space in self.free_spaces)

            # Set penalization per distance
            reward[1] = -min_distance

        return reward

    def states(self) -> set:
        """
        Return all possible states of this environment: every (x, y) position of
        the observation space except the final state.
        """

        # Unpack spaces
        x_position, y_position = self.observation_space.spaces

        every_position = {(x, y)
                          for x in range(x_position.n)
                          for y in range(y_position.n)}

        return every_position - {self.final_state}
Example #21
    'W_{0.01}': {
        'color': 'c',
        'marker': 'state'
    'W_{0.005}': {
        'color': 'b',
        'marker': 'd'
    'W_{0.001}': {
        'color': 'k',
        'marker': 'o'

vector_reference = Vector((-25, 0))

def pareto_graph(data: dict):
    # Columns
    columns = list(data.keys())[0]

    # Prepare hypervolume to dumps data
    pareto_file = Path(__file__).parent.joinpath(

    # If any parents doesn't exist, make it.
    pareto_file.parent.mkdir(parents=True, exist_ok=True)

    data = data[columns]
Example #22
 def test_vectors(self):
     """The angle between (1, 1) and (0, 12) must be pi / 4."""
     first = Vector([1, 1])
     second = Vector([0, 12])
     expected_angle = math.pi / 4
     self.assertAlmostEqual(first.angle(second), expected_angle)
Example #23
class PressurizedBountifulSeaTreasure(EnvMesh):
    """
    Sea-treasure mesh environment with three objectives.

    Every reward is a vector (time_inverted, treasure_value, water_pressure):
    the time component is -1 per step, the treasure component is the value of
    the treasure cell reached (or the default), and the pressure component is
    minus (y-coordinate + 1) of the cell reached.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal
    pareto_optimal = [
        (-1, 5, -2), (-3, 80, -3), (-5, 120, -4), (-7, 140, -5), (-8, 145, -6), (-9, 150, -6), (-13, 163, -8),
        (-14, 166, -8), (-17, 173, -10), (-19, 175, -11)
    ]

    # Experiments common hypervolume reference
    hv_reference = Vector([-25, 0, -120])

    def __init__(self, initial_state: tuple = (0, 0), default_reward: tuple = (0,), seed: int = 0, columns: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (treasure_value, )
        :param seed: Seed used for np.random.RandomState method.
        :param columns: Number of leftmost columns to keep. Any value outside [1, 10] keeps all 10 columns.
        :param action_space: Forwarded unchanged to the parent constructor.
        """

        original_mesh_shape = (10, 11)
        mesh_height = original_mesh_shape[1]

        # Reduce the number of columns (out-of-range values select the full mesh)
        if not 1 <= columns <= original_mesh_shape[0]:
            columns = original_mesh_shape[0]

        # List of all treasures and its reward.
        all_finals = {
            (0, 1): 5,
            (1, 2): 80,
            (2, 3): 120,
            (3, 4): 140,
            (4, 4): 145,
            (5, 4): 150,
            (6, 7): 163,
            (7, 7): 166,
            (8, 9): 173,
            (9, 10): 175,
        }

        # Filter finals states: keep only treasures inside the selected columns
        finals = {position: value for position, value in all_finals.items() if position[0] < columns}

        # Obstacles are exactly the cells below each treasure, down to the last row. Deriving them from the
        # (already filtered) treasures yields the same set as the original per-column table.
        obstacles = frozenset(
            (x, y)
            for x, treasure_y in finals
            for y in range(treasure_y + 1, mesh_height)
        )

        # Resize mesh_shape
        mesh_shape = (columns, mesh_height)

        # Default reward plus time (time_inverted, treasure_value, water_pressure)
        default_reward = Vector((-1,) + tuple(default_reward) + (0,))

        super().__init__(mesh_shape=mesh_shape, seed=seed, default_reward=default_reward, initial_state=initial_state,
                         finals=finals, obstacles=obstacles, action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (time_inverted, treasure_value, water_pressure), final, extra)
        """

        # Remember where we come from, then move
        previous_state = self.current_state
        self.current_state = self.next_state(action=action)

        # Same reward rule as `transition_reward`, applied to the state just reached
        reward = self.transition_reward(state=previous_state, action=action, next_state=self.current_state)

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.

        Only `next_state` is read by this implementation.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: (time_inverted, treasure_value, water_pressure)
        """
        # Default reward
        reward = self.default_reward.copy()

        # Get treasure reward
        reward[1] = self.finals.get(next_state, self.default_reward[1])

        # Water pressure (y-coordinate)
        reward[2] = -(next_state[1] + 1)

        return reward
Example #24
class SpaceExploration(EnvMesh):
    """
    Mesh environment with 8-neighbour moves and two objectives.

    Rewards are vectors (mission_success, radiation). Reaching an asteroid
    yields -100 in the first component and ends the episode; reaching a final
    cell yields that cell's value; standing on a radiation cell yields -11 in
    the second component.
    """

    # Possible actions
    _actions = {'UP': 0, 'UP RIGHT': 1, 'RIGHT': 2, 'DOWN RIGHT': 3, 'DOWN': 4, 'DOWN LEFT': 5, 'LEFT': 6, 'UP LEFT': 7}

    # Experiments common hypervolume reference
    hv_reference = Vector([-100, -150])

    def __init__(self, initial_state: tuple = (5, 2), default_reward: tuple = (0, -1), seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (mission_success, radiation)
        :param seed: Seed used for np.random.RandomState method.
        :param action_space: Forwarded unchanged to the parent constructor.
        """

        # List of all treasures and its reward.
        finals = {
            **{(0, y): 20 for y in range(5)},
            **{(9, y): 10 for y in range(3)},
            **{(12, y): 30 for y in range(5)},
        }

        obstacles = frozenset()
        mesh_shape = (13, 5)
        default_reward = Vector(default_reward)

        super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state, default_reward=default_reward,
                         finals=finals, obstacles=obstacles, action_space=action_space)

        # Asteroid cells: entering one destroys the ship
        self.asteroids = {
            (5, 0), (4, 1), (6, 1), (3, 2), (7, 2), (4, 3), (6, 3), (5, 4)
        }

        # Define radiations states (If the agent is on any of these, then receive -11 penalization)
        self.radiations = {(x, y) for x in (1, 10, 11) for y in range(5)}

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (position, (mission_success, radiation), final, extra)
        """

        # Remember where we come from, then move
        previous_state = self.current_state
        self.current_state = self.next_state(action=action)

        # Same reward rule as `transition_reward`, applied to the state just reached
        reward = self.transition_reward(state=previous_state, action=action, next_state=self.current_state)

        # Check if is_final
        final = self.is_final(self.current_state)

        # Set extra
        info = {}

        return self.current_state, reward, final, info

    def next_position(self, action: int, position: tuple) -> (tuple, bool):
        """
        Given an action and a position, return the next position reached.
        :param action:
        :param position:
        :return: (next position, True) - this implementation always reports the move as valid.
        """

        # Get my position
        x, y = position

        # Get observations spaces
        observation_space_x, observation_space_y = self.observation_space.spaces

        actions = self.actions

        # Do movement in cyclic mesh. A diagonal move is a vertical move followed by a horizontal one.
        if action in (actions['UP'], actions['UP RIGHT'], actions['UP LEFT']):
            y = ue.move_up(y=y, limit=observation_space_y.n)
        elif action in (actions['DOWN'], actions['DOWN RIGHT'], actions['DOWN LEFT']):
            y = ue.move_down(y=y, limit=observation_space_y.n)

        if action in (actions['RIGHT'], actions['UP RIGHT'], actions['DOWN RIGHT']):
            x = ue.move_right(x=x, limit=observation_space_x.n)
        elif action in (actions['LEFT'], actions['UP LEFT'], actions['DOWN LEFT']):
            x = ue.move_left(x=x, limit=observation_space_x.n)

        # Set next position
        next_position = x, y

        return next_position, True

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Calc next position with current position and action given, in this environment is 8-neighbors.
        :param state: If a position is given, do action from that position.
        :param action: from action_space
        :return: the new (x, y) position, or the starting one if the move is rejected.
        """

        # Get my position (explicit None check: only a missing state falls back to the current one)
        position = self.current_state if state is None else state

        next_position, is_valid = self.next_position(action=action, position=position)

        # Stay in place when the move is invalid or leaves the observation space
        if not is_valid or not self.observation_space.contains(next_position):
            next_position = position

        # Return (x, y) position
        return next_position

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent crash with asteroid or is on final position.
        :param state:
        :return: True when `state` is an asteroid or a final cell.
        """
        return state in self.asteroids or state in self.finals

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.

        Only `next_state` is read by this implementation.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: (mission_success, radiation)
        """
        # Initialize reward as vector
        reward = self.default_reward.copy()

        # If the ship crash with asteroid, the ship is destroyed. else mission success.
        if next_state in self.asteroids:
            reward[0] = -100
        else:
            reward[0] = self.finals.get(next_state, reward[0])

        # If agent is in a radiation position, the penalty is -11, else is default radiation
        if next_state in self.radiations:
            reward[1] = -11

        return reward
Example #25
class ResourceGathering(EnvMesh):
    """
    Mesh environment where the agent collects gold and gems and may be attacked by enemies.

    States are ((x, y), (gold, gems)) pairs and rewards are vectors
    (enemy_attack, gold, gems). The task is not episodic: `is_final` always
    returns False.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Reference
    hv_reference = Vector((-10, -10, -10))

    # Moves that enter an enemy cell: (origin position, action name, enemy cell entered).
    # Single source of truth for `warning_action` and `reachable_states`.
    _enemy_moves = (
        ((3, 1), 'UP', (3, 0)),
        ((3, 0), 'UP', (3, 0)),
        ((3, 1), 'LEFT', (2, 1)),
        ((4, 0), 'LEFT', (3, 0)),
        ((2, 2), 'UP', (2, 1)),
        ((1, 1), 'RIGHT', (2, 1)),
        ((2, 0), 'DOWN', (2, 1)),
        ((2, 0), 'RIGHT', (3, 0)),
    )

    def __init__(self,
                 initial_state: tuple = ((2, 4), (0, 0)),
                 default_reward: tuple = (0, 0, 0),
                 seed: int = 0,
                 p_attack: float = 0.1,
                 mesh_shape: tuple = (5, 5),
                 gold_positions: frozenset = frozenset({(2, 0)}),
                 gem_positions: frozenset = frozenset({(4, 1)}),
                 observation_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (enemy_attack, gold, gems)
        :param seed: Seed used for np.random.RandomState method.
        :param p_attack: Probability that a enemy attacks when agent stay in an enemy position.
        :param mesh_shape: (columns, rows) of the mesh.
        :param gold_positions: Positions where there is gold.
        :param gem_positions: Positions where there is a gem.
        :param observation_space: Custom observation space; built from `mesh_shape` when None.
        """

        default_reward = Vector(default_reward)

        if observation_space is None:
            # Build the observation space (position(x, y), quantity(gold, gems))
            # NOTE(review): the position sub-space was missing from the reviewed text and is reconstructed
            # from `mesh_shape`; confirm.
            observation_space = gym.spaces.Tuple(
                (
                    gym.spaces.Tuple(
                        (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))
                    ),
                    gym.spaces.Tuple(
                        (gym.spaces.Discrete(2), gym.spaces.Discrete(2))
                    )
                )
            )

        # Define final states
        finals = frozenset()

        # Super constructor call.
        # NOTE(review): the call itself was missing from the reviewed text; the keyword set below mirrors the
        # sibling environments - confirm against EnvMesh.__init__.
        super().__init__(mesh_shape=mesh_shape, seed=seed, initial_state=initial_state,
                         default_reward=default_reward, finals=finals, observation_space=observation_space)

        # Positions where there are gold.
        self.gold_positions = gold_positions

        # Positions where there is a gem.
        self.gem_positions = gem_positions

        # States where there are enemies_positions
        self.enemies_positions = {(3, 0), (2, 1)}
        self.p_attack = p_attack
        self.home_position = (2, 4)

        self.checkpoints_states = self._checkpoints_states()

    def _checkpoints_states(self) -> set:
        """
        Return states where the agent will get favorable reward.
        :return: the home position combined with every non-empty (gold, gems) load.
        """
        return set(
            itertools.product({self.home_position}, {(1, 0), (0, 1), (1, 1)}))

    def _enemy_cell(self, position: tuple, action: int):
        """
        Return the enemy cell entered by doing `action` from `position`, or None if the move is safe.
        :param position: (x, y) position the move starts from.
        :param action: from action_space
        """
        for origin, action_name, enemy_cell in self._enemy_moves:
            if position == origin and action == self.actions[action_name]:
                return enemy_cell
        return None

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (state, (enemy_attack, gold, gems), final, extra)
        """

        # Initialize reward as vector
        reward = self.default_reward.copy()

        # Extract previous state
        previous_state = self.current_state

        # Update previous position
        self.current_state = self.next_state(action=action)

        # A risky move that ends at home means the agent was attacked
        attacked = (self.warning_action(state=previous_state, action=action)
                    and self.current_state[0] == self.home_position)

        if attacked:
            reward[0] = -1
        # If we reach any checkpoint
        elif self.current_state in self.checkpoints_states:
            reward[1:3] = self.current_state[1]

        # Set extra
        info = {}

        # In this environment always return False
        final = self.is_final()

        return self.current_state, reward, final, info

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Calc next position with current position and action given. Default is 4-neighbors (UP, LEFT, DOWN, RIGHT)
        :param state: If a position is given, do action from that position.
        :param action: from action_space
        :return: a new position (or old if is invalid action)
        """
        # Unpack complex state (position, objects(gold, gem))
        position, objects = self.current_state if state is None else state

        # Calc next position
        next_position, is_valid = self.next_position(action=action, position=position)

        # If the next_position isn't valid, reset to the previous position
        if not is_valid or not self.observation_space[0].contains(next_position):
            next_position = position

        if next_position in self.gold_positions:
            objects = 1, objects[1]

        elif next_position in self.gem_positions:
            objects = objects[0], 1

        elif next_position in self.enemies_positions and self.p_attack >= self.np_random.uniform():
            # Attacked: the load is reset to the initial one and the agent is sent home
            _, objects = self.initial_state
            next_position = self.home_position

        return next_position, objects

    def reset(self) -> tuple:
        """
        Reset environment to zero.
        :return: the initial state.
        """

        # Reset to initial seed
        # NOTE(review): no reseeding call is visible in the reviewed text although the comment above announces
        # one; confirm whether the base class must be reseeded here.

        self.current_state = self.initial_state

        return self.current_state

    def states(self) -> set:
        """
        Return all states from this environment
        :return: set of ((x, y), (gold, gems)) states.
        """

        # Unpack spaces
        x_position, y_position = self.observation_space[0]

        # Calc basic states
        basic_states = set(
            (x, y) for x in range(x_position.n)
            for y in range(y_position.n)).difference(self.obstacles)

        # Calc product of basic states with objects
        # NOTE(review): the calls joining the pieces below were missing from the reviewed text; the chain of
        # `difference` calls is reconstructed from the comments - confirm.
        states = set(
            itertools.product(basic_states, {
                (0, 0), (0, 1), (1, 0), (1, 1)
            })
        ).difference(
            # Cannot be in gold positions without gold.
            itertools.product(self.gold_positions, {
                (0, 0), (0, 1)
            })
        ).difference(
            # Cannot be in gem positions without gem.
            itertools.product(self.gem_positions, {
                (0, 0), (1, 0)
            })
        ).difference(
            # Cannot be in home position with gem and/or gold.
            self.checkpoints_states
        )

        # Return all spaces
        return states

    def warning_action(self, state: tuple, action: int) -> bool:
        """
        Check if that in that state the agent can be attacked.
        :param state:
        :param action:
        :return: True when doing `action` from `state` enters an enemy cell.
        """
        return self._enemy_cell(position=state[0], action=action) is not None

    def transition_reward(self, state: tuple, action: int,
                          next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: (enemy_attack, gold, gems)
        """
        # Initialize reward as vector
        reward = self.default_reward.copy()

        # A risky move that ends at home means the agent was attacked
        attacked = (self.warning_action(state=state, action=action)
                    and next_state[0] == self.home_position)

        if attacked:
            reward[:] = -1, 0, 0
        elif next_state in self.checkpoints_states:
            reward[1:3] = next_state[1]

        return reward

    def transition_probability(self, state: tuple, action: int,
                               next_state: tuple) -> float:
        """
        Return probability to reach `next_state` from `state` using `action`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: 1. for safe moves; for risky moves, `p_attack` when `next_state` is at home and
            `1 - p_attack` otherwise.
        """
        if not self.warning_action(state=state, action=action):
            return 1.

        if next_state[0] == self.home_position:
            return self.p_attack

        return 1. - self.p_attack

    def reachable_states(self, state: tuple, action: int) -> set:
        """
        Return all reachable states for pair (state, action) given.
        :param state:
        :param action:
        :return: set of states that can follow `state` when doing `action`.
        """
        # If current state is on checkpoints (in home position with any resource) then reset resources
        if state in self.checkpoints_states:
            state = (state[0], (0, 0))

        enemy_cell = self._enemy_cell(position=state[0], action=action)

        # Safe move: the only outcome is the regular successor.
        # NOTE(review): the `else:` guarding this case was missing from the reviewed text - confirm.
        if enemy_cell is None:
            return {self.next_state(action=action, state=state)}

        # Risky move: either the agent enters the enemy cell keeping its load, or it is sent home empty-handed
        return {(enemy_cell, state[1]), (self.home_position, (0, 0))}

    def is_final(self, state: tuple = None) -> bool:
        """
        Return always false (No episodic task)
        :param state: ignored.
        """
        return False
Example #26
    def do_iteration(self) -> None:
        """
        Does an iteration (In this case a Sweeps)

        For every state of the environment, and for every action available in
        it, this combines one stored vector per reachable state in every
        possible way. Each combination becomes the probability-weighted sum of
        (transition reward + gamma * stored vector). The rounded, de-duplicated
        candidates are passed through `filter_vectors` and stored in `self.v`.
        """

        # Increment total sweeps
        self.total_sweeps += 1

        # Do a copy of v2
        v2 = self.v.copy()

        # Removes all items from the dictionary
        # NOTE(review): the statement was missing from the reviewed text; `clear()` is what the comment describes.
        self.v.clear()

        # For each state available
        for s in self.environment.states():

            # A(state) <- Extract all actions available from position `state`
            self.environment.current_state = s

            # Vector of Empty sets
            t = dict()

            # Get all actions available
            actions = self.environment.action_space.copy()

            # For each a in action_space
            for a in actions:

                # Empty set for this a (T(a))
                t_a = set()

                # Get all reachable states for that pair of (state, a)
                s2_set = self.environment.reachable_states(state=s, action=a)

                # One list of candidate vectors per reachable state; unknown states contribute the initial value
                lv = [v2.get(s2, [Vector(self.initial_q_value)]) for s2 in s2_set]

                # Calc cartesian product of each reachable states
                for product in itertools.product(*lv):

                    summation = self.environment.default_reward.zero_vector

                    # `s2_set` is not modified, so it iterates in the same order used to build `lv`
                    for s2, candidate in zip(s2_set, product):
                        # Probability to reach that position
                        p = self.environment.transition_probability(
                            state=s, action=a, next_state=s2)

                        # Reward to reach that position
                        r = self.environment.transition_reward(
                            state=s, action=a, next_state=s2)

                        # Get previous value per gamma
                        previous_value = candidate * self.gamma

                        # Summation (a new object is built on purpose: an in-place `+=` could mutate the
                        # environment's zero vector if that object is shared)
                        summation = summation + (r + previous_value) * p

                    # T(a) <- T(a) U {.....}
                    t_a.add(summation)

                # Only actions that produced at least one candidate are recorded
                if t_a:
                    t[a] = t_a

            # u_t <- U T(a)
            u_t = set.union(*t.values())

            # Remove duplicates: vectors that round to the same value collapse into one
            u_t = {
                un.round_with_precision(vector, Vector.decimal_precision)
                for vector in u_t
            }

            # V(state) <- filter[u_t]
            self.v.update({s: self.filter_vectors(vectors=u_t)})
Example #27
class BonusWorld(EnvMesh):
    """
    Mesh environment with treasures, pits and a bonus cell.

    States are ((x, y), bonus_activated) pairs and rewards are vectors
    (objective 1, objective 2, time). Stepping on a pit sends the agent back
    to the initial position with the bonus deactivated; stepping on a bonus
    cell activates the bonus, which doubles the two treasure objectives.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Experiments common hypervolume reference
    hv_reference = Vector([0, 0, -150])

    def __init__(self, initial_state: tuple = ((0, 0), False), default_reward: tuple = (0, 0), seed: int = 0,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.
        :param action_space:
        """

        # List of all treasures and its reward.
        finals = {
            (8, 0): Vector([1, 9]),
            (8, 2): Vector([3, 9]),
            (8, 4): Vector([5, 9]),
            (8, 6): Vector([7, 9]),
            (8, 8): Vector([9, 9]),

            (0, 8): Vector([9, 1]),
            (2, 8): Vector([9, 3]),
            (4, 8): Vector([9, 5]),
            (6, 8): Vector([9, 7]),
        }

        # Define mesh shape
        mesh_shape = (9, 9)

        # Set obstacles
        obstacles = frozenset({(2, 2), (2, 3), (3, 2)})

        # Default reward plus time (objective 1, objective 2, time)
        default_reward = Vector(tuple(default_reward) + (-1,))

        # Build the observation space (position (x, y), bonus)
        # NOTE(review): the bonus sub-space was missing from the reviewed text; a two-valued Discrete space is
        # assumed because the bonus is a boolean flag - confirm.
        observation_space = gym.spaces.Tuple(
            (
                gym.spaces.Tuple(
                    (gym.spaces.Discrete(mesh_shape[0]), gym.spaces.Discrete(mesh_shape[1]))
                ),
                gym.spaces.Discrete(2)
            )
        )

        super().__init__(mesh_shape=mesh_shape, default_reward=default_reward, initial_state=initial_state,
                         finals=finals, obstacles=obstacles, observation_space=observation_space, seed=seed,
                         action_space=action_space)

        # Pits marks which returns the agent to the start location.
        self.pits = {
            (7, 1), (7, 3), (7, 5), (1, 7), (3, 7), (5, 7)
        }

        # X2 bonus
        self.bonus = [
            (3, 3)
        ]

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step
        :param action:
        :return: (state, (objective 1, objective 2, time), final, extra)
        """

        # Remember where we come from, then move
        previous_state = self.current_state
        self.current_state = self.next_state(action=action)

        # Same reward rule as `transition_reward`, applied to the state just reached
        reward = self.transition_reward(state=previous_state, action=action, next_state=self.current_state)

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def next_state(self, action: int, state: tuple = None) -> tuple:
        """
        Given a state and an action, return the next state
        :param action:
        :param state: If a state is given, do action from that state; otherwise from the current one.
        :return: (next position, bonus_activated)
        """

        # Unpack complex position (position, bonus_activated)
        position, bonus = self.current_state if state is None else state

        # Calc next position
        next_position, is_valid = self.next_position(action=action, position=position)

        # If the next_position isn't valid, reset to the previous position
        if not is_valid or not self.observation_space[0].contains(next_position):
            next_position = position

        # If agent is in pit, it's returned at initial position and deactivate the bonus.
        if next_position in self.pits:
            next_position, _ = self.initial_state
            bonus = False

        # Check if the agent has activated the bonus
        elif next_position in self.bonus:
            bonus = True

        # Build next position
        return next_position, bonus

    def is_final(self, state: tuple = None) -> bool:
        """
        Is final if agent is on final position.
        :param state: State to check; the current state is used when None.
        """
        # The default None used to crash on `state[0]`; fall back to the current state instead
        position, _ = self.current_state if state is None else state
        return position in self.finals

    def transition_reward(self, state: tuple, action: int, next_state: tuple) -> Vector:
        """
        Given a state, an action and a next state, return the corresponding reward.

        Only `next_state` is read by this implementation.

        :param state:
        :param action:
        :param next_state:
        :return: (objective 1, objective 2, time)
        """

        # Separate position from bonus_activated
        position, bonus_activated = next_state

        # Default reward
        reward = self.default_reward.copy()

        # Get treasure value
        reward[0], reward[1] = self.finals.get(position, (reward[0], reward[1]))

        # If the bonus is activated, double the reward.
        if bonus_activated:
            reward[0] *= 2
            reward[1] *= 2

        return reward

    def states(self) -> set:
        """
        Return a set with all states of this environment
        :return: set of ((x, y), bonus_activated) states, excluding obstacles, finals and pits.
        """

        # Unpack spaces
        position_space, _ = self.observation_space.spaces
        x_position, y_position = position_space.spaces

        # Get all positions
        all_positions = {(x, y) for x in range(x_position.n) for y in range(y_position.n)}

        # Get obstacles, finals positions and pits
        finals_obstacles_and_pits = self.obstacles.union(set(self.finals.keys())).union(self.pits)

        # Generate available states
        available_states = set(product(all_positions - finals_obstacles_and_pits, {True, False}))

        # Remove impossible states: standing on a bonus cell always activates the bonus
        impossible_states = {(bonus_position, False) for bonus_position in self.bonus}

        # Return all available spaces
        return available_states - impossible_states
Example #28
class PyramidMDP(EnvMesh):
    """
    Mesh environment whose final states are the cells ``(x, diagonals - x)``.

    Each final state ``s`` yields the reward ``(Vector(s) + 1) * 10``; every
    other transition yields the default reward. Actions are stochastic: with
    probability ``1 - n_transition`` the requested action is replaced by one
    sampled from the action space.
    """

    # Possible actions
    _actions = {'UP': 0, 'RIGHT': 1, 'DOWN': 2, 'LEFT': 3}

    # Pareto optimal policy vector-values (rebuilt on the class by __init__)
    pareto_optimal = []

    # Experiments common hypervolume reference
    hv_reference = Vector((-20, -20))

    def __init__(self,
                 initial_state: tuple = (0, 0),
                 default_reward: tuple = (-1, -1),
                 seed: int = 0,
                 n_transition: float = 0.95,
                 diagonals: int = 9,
                 action_space: gym.spaces = None):
        """
        :param initial_state: Initial state where start the agent.
        :param default_reward: (objective 1, objective 2)
        :param seed: Seed used for np.random.RandomState method.
        :param n_transition: if is 1, always do the action indicated. (Original is about 0.6)
        :param diagonals: Number of diagonals to be used to build this environment (allows experimenting with an
                        identical environment, but considering only the first k diagonals) (By default 9 - all).
        :param action_space: action space forwarded to the parent class.
        """

        # Side of the square mesh, clamped to [1, 10]; 10 is the side of
        # the original full-size environment.
        side = min(max(diagonals + 1, 1), 10)
        mesh_shape = (side, side)

        # Cells (x, diagonals - x): the positions of the final states.
        diagonals_states = set(
            zip(range(0, diagonals + 1, 1), range(diagonals, -1, -1)))

        # Generate finals states with its reward
        finals = {
            state: (Vector(state) + 1) * 10
            for state in diagonals_states
        }

        # Pareto optimal (stored on the class, shared by all instances)
        PyramidMDP.pareto_optimal = {
            Vector(state) + 1
            for state in diagonals_states
        }

        # Filter obstacles states: for every final (x, y_final), each cell
        # (x, y) with y_final <= y <= diagonals that is not itself a final.
        obstacles = frozenset((x, y) for x, y_final in finals
                              for y in range(y_final, diagonals + 1)
                              if (x, y) not in finals)

        # Default reward (objective_1, objective_2)
        default_reward = Vector(default_reward)

        # Transaction
        assert 0 <= n_transition <= 1.
        self.n_transition = n_transition

        # Hand the built pieces to the parent; step() relies on the
        # self.finals / self.default_reward / self.current_state it sets up.
        # NOTE(review): EnvMesh.__init__ is not visible from here; the
        # keyword names below are assumed - confirm against its signature.
        super().__init__(mesh_shape=mesh_shape,
                         default_reward=default_reward,
                         initial_state=initial_state,
                         finals=finals,
                         obstacles=obstacles,
                         seed=seed,
                         action_space=action_space)

    def step(self, action: int) -> (tuple, Vector, bool, dict):
        """
        Given an action, do a step.

        :param action: requested action; it may be replaced by a random one
            (see ``__probability_action``).
        :return: (current state, reward, final, info), where ``info`` is
            always an empty dict.
        """

        # Get probability action
        action = self.__probability_action(action=action)

        # Update current position
        self.current_state = self.next_state(action=action)

        # Get treasure value, or a copy of the default reward when the
        # new state is not a final one
        reward = self.finals.get(self.current_state,
                                 self.default_reward.copy())

        # Set extra
        info = {}

        # Check is_final
        final = self.is_final(self.current_state)

        return self.current_state, reward, final, info

    def __probability_action(self, action: int) -> int:
        """
        Decide the action actually performed after applying stochasticity.

        :param action: requested action.
        :return: ``action`` itself, or an action sampled from
            ``self.action_space`` when the uniform draw exceeds
            ``self.n_transition``.
        """

        # Get a random uniform number from self.np_random
        draw = self.np_random.uniform()

        # If the draw is greater than self.n_transition, get a random action
        if draw > self.n_transition:
            action = self.action_space.sample()

        return action

    def transition_reward(self, state: tuple, action: int,
                          next_state: tuple) -> Vector:
        """
        Return reward for reach `next_state` from `state` using `action`.

        Only ``next_state`` is used to compute the reward.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: the ``self.finals`` entry for ``next_state`` if there is
            one, otherwise a copy of the default reward.
        """
        return self.finals.get(next_state, self.default_reward.copy())

    def transition_probability(self, state: tuple, action: int,
                               next_state: tuple) -> float:
        """
        Return probability to reach `next_state` from `state` using `action`.

        :param state: initial position
        :param action: action to do
        :param next_state: next position reached
        :return: ``self.n_transition`` when the ``ue.is_on_*_or_same_position``
            helper matching ``action`` accepts the move, otherwise
            ``(1 - self.n_transition) / self.action_space.n``.
        """
        # NOTE(review): the RIGHT helper is called with ``next_position=``
        # while the other three use ``next_state=``; kept as found - confirm
        # against the helper signatures in ``ue``.
        # NOTE(review): step() can also pick the requested action through the
        # random sample, so these values may not add up to 1 - confirm intent.
        desired_transition = (
            (action == self.actions['UP']
             and ue.is_on_up_or_same_position(
                 state=state, next_state=next_state))
            or (action == self.actions['RIGHT']
                and ue.is_on_right_or_same_position(
                    state=state, next_position=next_state))
            or (action == self.actions['DOWN']
                and ue.is_on_down_or_same_position(
                    state=state, next_state=next_state))
            or (action == self.actions['LEFT']
                and ue.is_on_left_or_same_position(
                    state=state, next_state=next_state)))

        if desired_transition:
            return self.n_transition

        return (1. - self.n_transition) / self.action_space.n

    def reachable_states(self, state: tuple, action: int) -> set:
        """
        Return all reachable states for pair (state, a) given.

        Every action of the action space is tried, so ``action`` itself is
        not read. As a side effect ``self.current_state`` is set to ``state``.

        :param state: state to start from.
        :param action: unused.
        :return: set with the next state produced by each available action.
        """
        # Set current state with state indicated
        self.current_state = state

        # Get all actions available
        actions = self.action_space.copy()

        # Return all possible states reachable with any action
        return {self.next_state(action=a, state=state) for a in actions}