Example 1
class IdentityEnv(Env):
    def __init__(self, dim, ep_length=100):
        """
        Identity environment for testing purposes

        :param dim: (int) the size of the dimension you want to learn
        :param ep_length: (int) the length of each episode, in timesteps
        """
        self.action_space = Discrete(dim)
        self.observation_space = self.action_space
        self.ep_length = ep_length
        self.current_step = 0
        self.dim = dim
        self.reset()

    def reset(self):
        self.current_step = 0
        self._choose_next_state()
        return self.state

    def step(self, action):
        reward = self._get_reward(action)
        self._choose_next_state()
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.state, reward, done, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, action):
        return 1 if np.all(self.state == action) else 0

    def render(self, mode='human'):
        pass
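
A minimal rollout sketch for the IdentityEnv above (assuming the imports the example relies on: from gym import Env, from gym.spaces import Discrete, import numpy as np, and the pre-0.26 gym step/reset API used throughout these examples). The reward is 1 exactly when the action matches the previously returned observation.

env = IdentityEnv(dim=4)
obs = env.reset()
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()        # random policy
    obs, reward, done, _ = env.step(action)   # reward == 1 iff action matched the last obs
    total_reward += reward
print(total_reward)                           # ~25 on average for dim=4, ep_length=100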
Example 2
class IdentityEnv(Env):
    def __init__(
            self,
            dim,
            ep_length=100,
    ):

        self.action_space = Discrete(dim)
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.observation_space = self.action_space

        return self.state

    def step(self, actions):
        rew = self._get_reward(actions)
        self._choose_next_state()
        return self.state, rew, False, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, actions):
        return 1 if self.state == actions else 0
Example 3
class IdentityEnv(Env):
    def __init__(
        self,
        dim,
        ep_length=100,
    ):

        self.action_space = Discrete(dim)
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.observation_space = self.action_space

        return self.state

    def step(self, actions):
        rew = self._get_reward(actions)
        self._choose_next_state()
        return self.state, rew, False, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, actions):
        return 1 if self.state == actions else 0
Example 4
class ValueFunction:
    # This example uses Sutton's tile-coding software (tiles3) instead of implementing tiling by hand.
    # Tiling is simply a map from (state, action) to a set of indices; the indices themselves
    # carry no meaning, as long as the map satisfies the required properties.
    # See the following webpage for more information:
    # http://incompleteideas.net/sutton/tiles/tiles3.html
    # @max_size: the maximum number of indices
    def __init__(self, alpha, n_actions, num_of_tilings=8, max_size=2048):
        self.action_space = Discrete(n_actions)
        self.max_size = max_size
        self.num_of_tilings = num_of_tilings

        # divide step size equally to each tiling
        self.step_size = alpha / num_of_tilings

        self.hash_table = IHT(max_size)

        # weight for each tile
        self.weights = np.zeros(max_size)

        # position and velocity need scaling to suit the tile-coding software
        self.position_scale = self.num_of_tilings / (POSITION_MAX -
                                                     POSITION_MIN)
        self.velocity_scale = self.num_of_tilings / (VELOCITY_MAX -
                                                     VELOCITY_MIN)

    # get indices of active tiles for given state and action
    def _get_active_tiles(self, position, velocity, action):
        # position_scale * (position - POSITION_MIN) would be the proper normalization,
        # but position_scale * POSITION_MIN is a constant, so it is safe to drop it.
        active_tiles = tiles(
            self.hash_table, self.num_of_tilings,
            [self.position_scale * position, self.velocity_scale * velocity],
            [action])
        return active_tiles

    # estimate the value of given state and action
    def __call__(self, state, action):
        position, velocity = tuple(state)
        if position == POSITION_MAX:
            return 0.0
        active_tiles = self._get_active_tiles(position, velocity, action)
        return np.sum(self.weights[active_tiles])

    # learn with given state, action and target
    def update(self, target, state, action):
        active_tiles = self._get_active_tiles(state[0], state[1], action)
        estimation = np.sum(self.weights[active_tiles])
        delta = self.step_size * (target - estimation)
        for active_tile in active_tiles:
            self.weights[active_tile] += delta

    def act(self, state, epsilon=0):
        if np.random.random() < epsilon:
            return self.action_space.sample()
        return np.argmax(
            [self(state, action) for action in range(self.action_space.n)])
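
A hedged usage sketch for the tile-coded value function above, assuming the Mountain Car constants (POSITION_MIN/MAX, VELOCITY_MIN/MAX) and the IHT/tiles helpers from tiles3 are in scope, and using the pre-0.26 gym API that these examples follow.

import gym

env = gym.make("MountainCar-v0")
vf = ValueFunction(alpha=0.3, n_actions=env.action_space.n)
state = env.reset()
for _ in range(200):
    action = vf.act(state, epsilon=0.1)             # epsilon-greedy over the tile-coded Q
    next_state, reward, done, _ = env.step(action)
    next_action = vf.act(next_state)
    target = reward + vf(next_state, next_action)   # one-step SARSA target (gamma = 1)
    vf.update(target, state, action)
    state = next_state
    if done:
        break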
Example 5
def test_qagent_1():
    agent = QLearningAgent(
        action_space=Discrete(3),
        obs_space=Discrete(3),
        gamma=0.99,
        alpha=1.0,
        epsilon=0.9,
    )
    space = Discrete(10)
    action = space.sample()
    obs = space.sample()
    reward = 0.0
    obs_n = space.sample()

    agent.learn(obs, action, reward, obs_n, False)

    assert (obs, action) in agent.q_table
    assert type(agent.q_table[(obs, action)]) == float
Example 6
def test_space_utils():
    # Box
    box = Box(-1.0, 1.0, shape=[2, 3], dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 3
    assert flatten(box, sample).shape == (2 * 3, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    x = np.array([[1.0, 1.0], [1.0, 1.0]])
    box = Box(low=-x, high=x, dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 2
    assert flatten(box, sample).shape == (2 * 2, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    # Discrete
    discrete = Discrete(5)
    sample = discrete.sample()
    assert flatdim(discrete) == 5
    assert flatten(discrete, sample).shape == (5, )
    assert sample == unflatten(discrete, flatten(discrete, sample))

    # Tuple
    S = Tuple([
        Discrete(5),
        Box(-1.0, 1.0, shape=(2, 3), dtype=np.float32),
        Dict({
            'success': Discrete(2),
            'velocity': Box(-1, 1, shape=(1, 3), dtype=np.float32)
        })
    ])
    sample = S.sample()
    assert flatdim(S) == 5 + 2 * 3 + 2 + 3
    assert flatten(S, sample).shape == (16, )
    _sample = unflatten(S, flatten(S, sample))
    assert sample[0] == _sample[0]
    assert np.allclose(sample[1], _sample[1])
    assert sample[2]['success'] == _sample[2]['success']
    assert np.allclose(sample[2]['velocity'], _sample[2]['velocity'])

    # Dict
    D0 = Dict({
        'position': Box(-100, 100, shape=(3, ), dtype=np.float32),
        'velocity': Box(-1, 1, shape=(4, ), dtype=np.float32)
    })
    D = Dict({'sensors': D0, 'score': Discrete(100)})
    sample = D.sample()
    assert flatdim(D) == 3 + 4 + 100
    assert flatten(D, sample).shape == (107, )
    _sample = unflatten(D, flatten(D, sample))
    assert sample['score'] == _sample['score']
    assert np.allclose(sample['sensors']['position'],
                       _sample['sensors']['position'])
    assert np.allclose(sample['sensors']['velocity'],
                       _sample['sensors']['velocity'])
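
As the assertions above imply, flatten one-hot encodes a Discrete sample. A small sketch using the same gym.spaces utilities assumed by the test:

d = Discrete(5)
v = flatten(d, 3)
assert v.shape == (5,) and v[3] == 1 and v.sum() == 1   # one-hot at index 3
assert unflatten(d, v) == 3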
Example 7
    def test_trajectory(self):
        """Tests the Trajectory class."""

        buffer_size = 5

        # Small trajectory object for testing purposes.
        trajectory = Trajectory(buffer_size=buffer_size)
        self.assertEqual(trajectory.cursor, 0)
        self.assertEqual(trajectory.timestep, 0)
        self.assertEqual(trajectory.sample_batch_offset, 0)
        assert not trajectory.buffers
        observation_space = Box(-1.0, 1.0, shape=(3, ))
        action_space = Discrete(2)
        trajectory.add_init_obs(env_id=0,
                                agent_id="agent",
                                policy_id="policy",
                                init_obs=observation_space.sample())
        self.assertEqual(trajectory.cursor, 0)
        self.assertEqual(trajectory.initial_obs.shape, observation_space.shape)

        # Fill up the buffer and make it extend if it hits the limit.
        cur_buffer_size = buffer_size
        for i in range(buffer_size + 1):
            trajectory.add_action_reward_next_obs(
                env_id=0,
                agent_id="agent",
                policy_id="policy",
                values=dict(
                    t=i,
                    actions=action_space.sample(),
                    rewards=1.0,
                    dones=i == buffer_size,
                    new_obs=observation_space.sample(),
                    action_logp=-0.5,
                    action_dist_inputs=np.array([[0.5, 0.5]]),
                ))
            self.assertEqual(trajectory.cursor, i + 1)
            self.assertEqual(trajectory.timestep, i + 1)
            self.assertEqual(trajectory.sample_batch_offset, 0)
            if i == buffer_size - 1:
                cur_buffer_size *= 2
            self.assertEqual(len(trajectory.buffers["new_obs"]),
                             cur_buffer_size)
            self.assertEqual(len(trajectory.buffers["rewards"]),
                             cur_buffer_size)

        # Create a SampleBatch from the Trajectory and reset it.
        batch = trajectory.get_sample_batch_and_reset()
        self.assertEqual(batch.count, buffer_size + 1)
        # Make sure the Trajectory was reset properly.
        self.assertEqual(trajectory.cursor, buffer_size + 1)
        self.assertEqual(trajectory.timestep, 0)
        self.assertEqual(trajectory.sample_batch_offset, buffer_size + 1)
Example 8
class IntegerSphere(gym.Env):
    #An integer/discrete form of the sphere function
    def __init__(self):
        lb = -100
        ub = 100
        self.nx = 5
        self.action_space = Discrete(201)
        self.real_actions = list(range(lb, ub + 1))
        self.observation_space = Box(low=min(self.real_actions),
                                     high=max(self.real_actions),
                                     shape=(self.nx, ),
                                     dtype=int)
        self.episode_length = 50
        self.reset()
        self.done = False
        self.counter = 0

    def step(self, action):
        individual = [self.real_actions[action]] * self.nx
        reward = self.fit(individual=individual)
        self.counter += 1
        if self.counter == self.episode_length:
            self.done = True
            self.counter = 0

        return individual, reward, self.done, {'x': individual}

    def fit(self, individual):
        """Sphere test objective function.

        F(x) = sum_{i=1}^d x_i^2,  d = 1, 2, 3, ...
        Range: [-100, 100]
        Minimum: 0
        """
        #-1 is used to convert minimization to maximization
        return -sum(x**2 for x in individual)

    def reset(self):
        self.done = False
        ac = self.action_space.sample()
        individual = [self.real_actions[ac]] * self.nx
        return individual

    def render(self, mode='human'):
        pass
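
A quick, purely illustrative check of the negated sphere objective described in the fit docstring above:

env = IntegerSphere()
assert env.fit(individual=[3, 4]) == -25      # -(3**2 + 4**2)
obs, reward, done, info = env.step(env.action_space.sample())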
Example 9
def test_seed_Dict():
    test_space = Dict(
        {
            "a": Box(low=0, high=1, shape=(3, 3)),
            "b": Dict(
                {
                    "b_1": Box(low=-100, high=100, shape=(2,)),
                    "b_2": Box(low=-1, high=1, shape=(2,)),
                }
            ),
            "c": Discrete(5),
        }
    )

    seed_dict = {
        "a": 0,
        "b": {
            "b_1": 1,
            "b_2": 2,
        },
        "c": 3,
    }

    test_space.seed(seed_dict)

    # "Unpack" the dict sub-spaces into individual spaces
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    for i in range(10):
        test_s = test_space.sample()
        a_s = a.sample()
        assert (test_s["a"] == a_s).all()
        b_1_s = b_1.sample()
        assert (test_s["b"]["b_1"] == b_1_s).all()
        b_2_s = b_2.sample()
        assert (test_s["b"]["b_2"] == b_2_s).all()
        c_s = c.sample()
        assert test_s["c"] == c_s
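
The same reproducibility holds for a single space; a minimal sketch under the same gym version assumed by the test above:

s1, s2 = Discrete(5), Discrete(5)
s1.seed(42)
s2.seed(42)
assert [s1.sample() for _ in range(10)] == [s2.sample() for _ in range(10)]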
Example 10
class TestEnv(Env):
    def __init__(self, action_size=5, observation_size=5, max_depth=20):
        self.action_space = Discrete(action_size)
        self.observation_space = Discrete(observation_size)
        self.max_depth = max_depth
        self._discount = .95
        self._reward_range = 1

    def reset(self):
        self.state = 0

    def _get_init_state(self):
        # self.state = 0
        return 0  # self.state

    def _set_state(self, state):
        self.state = state

    def step(self, action):
        if self.state < self.max_depth and action == 0:
            rw = 1.0
        else:
            rw = 0.0
        ob = self.observation_space.sample()
        self.state += 1
        return ob, rw, False, {"state": self.state, "p_ob": 1.}

    def optimal_value(self):
        discount = 1.
        total_rw = 0.
        for n in range(self.max_depth):
            total_rw += discount
            discount *= self._discount

        return total_rw

    def mean_value(self):
        discount = 1.
        total_rw = 0.
        for n in range(self.max_depth):
            total_rw += discount / self.action_space.n
            discount *= self._discount
        return total_rw
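
A hedged sanity check: optimal_value() above is just the finite geometric series sum_{n=0}^{max_depth-1} 0.95**n, and mean_value() divides each term by the number of actions.

env = TestEnv(max_depth=20)
expected = (1 - 0.95 ** 20) / (1 - 0.95)
assert abs(env.optimal_value() - expected) < 1e-9
assert abs(env.mean_value() - expected / env.action_space.n) < 1e-9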
Example 11
class Player(object):
    """Abstract player that samples actions from a uniform random distribution."""
    def __init__(self):
        self.action_space = Discrete(7)
        self.mode = PlayerModes.batting

    def reset(self, mode=PlayerModes.batting):
        """
        Use reset to set the player's mode.
        Args:
            mode: Some mode to which to reset.
        """
        self.mode = mode

    def __call__(self, obs, reward, done, info):
        """ Player will consume everything that the previous step provided."""
        return self.action_space.sample()

    def __repr__(self):
        return '{} instance'.format(type(self).__name__)
Example 12
class ElFarolEnv(Env):

    metadata = {'render.modes': ['human']}

    def __init__(self, n_agents=100, threshold=60, g=10, s=5, b=1):
        if g < s or s < b:
            raise Exception("rewards must be ordered g > s > b")

        self.n_agents = n_agents
        self.action_space = Discrete(2)
        # observe 0 if the agent did not attend, otherwise the number of agents who attended
        self.observation_space = Discrete(n_agents)
        self.reward_range = (b, g)

        def reward_func(action, n_attended):
            if action == 0:
                return s
            elif n_attended <= threshold:
                return g
            else:
                return b

        self.reward_func = reward_func
        self.prev_action = [
            self.action_space.sample() for _ in range(n_agents)
        ]

    def _step(self, action):
        n_attended = sum(action)
        observation = [n_attended if a else 0 for a in action]
        reward = [self.reward_func(a, n_attended) for a in action]

        self.prev_action = action
        return observation, reward, False, {}

    def _reset(self):
        pass

    def _render(self, mode='human', close=False):
        if mode == 'human':
            print(str(sum(self.prev_action)))
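
A hedged sketch of one joint step for the environment above, calling the old-style _step directly (in newer gym versions it would be exposed through step):

env = ElFarolEnv(n_agents=4, threshold=2)
joint_action = [env.action_space.sample() for _ in range(env.n_agents)]
obs, rewards, done, info = env._step(joint_action)
# each agent observes 0 if it stayed home, otherwise the attendance count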
Example 13
class DiscreteMaskEnv(gym.Env):
    metadata = {'render.modes': ['human', 'system', 'none']}

    def __init__(self):
        self.action_space = Discrete(5)
        self.observation_space = Discrete(3)
        self.current_step = 0
        self._action_mask = torch.ones(self.action_space.n)

    def reset(self):
        self.current_step = 0
        self._action_mask = torch.ones(self.action_space.n)
        self._choose_next_state()
        return self.state

    def step(self, action: int):
        action_mask = torch.ones(self.action_space.n)
        if self.action_mask[action] == 0:
            raise Exception("Invalid action was selected! Valid actions: {}, "
                            "action taken: {}".format(self.action_mask,
                                                      action))
        action_mask[action] = 0

        self.current_step += 1
        self._action_mask = action_mask
        self._choose_next_state()
        return self.state, 0, self.finish(), {"action_mask": self.action_mask}

    def render(self, mode='human'):
        pass

    def finish(self):
        return self.current_step == 250

    def _choose_next_state(self):
        self.state = torch.tensor(self.observation_space.sample(),
                                  dtype=torch.long)

    @property
    def action_mask(self):
        return self._action_mask
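
A hedged sketch of respecting the action mask returned by the environment above (assumes import torch and import numpy as np, as in the example):

env = DiscreteMaskEnv()
obs = env.reset()
valid_actions = torch.nonzero(env.action_mask).flatten().tolist()
action = int(np.random.choice(valid_actions))    # never pick the masked-out action
obs, reward, done, info = env.step(action)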
Example 14
class PocEnv(Env):
    def __init__(self, maze):
        self.board = select_maze(maze)
        self.grid = PocGrid(board=self.board["_maze"])
        self._get_init_state()
        self.action_space = Discrete(4)
        self.observation_space = Discrete(1 << 10)  # 1024
        # self.observation_space = Discrete(14)
        self._reward_range = 100
        self._discount = .95

    def seed(self, seed=None):
        np.random.seed(seed)

    def is_power(self, idx):
        return self.board['_maze'][idx] == 3

    def is_passable(self, idx):
        return self.board['_maze'][idx] != 0

    def _is_valid(self):

        assert self.grid.is_inside(self.state.agent_pos)
        assert self.is_passable(self.state.agent_pos)
        for ghost in self.state.ghosts:
            assert self.grid.is_inside(ghost.pos)
            assert self.is_passable(ghost.pos)

    def _set_state(self, state):
        self.done = False
        self.state = state

    def _generate_legal(self):
        actions = []
        for action in Action:
            if self.grid.is_inside(self.state.agent_pos +
                                   Moves.get_coord(action.value)):
                actions.append(action.value)
        return actions

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False

        reward = -1
        next_pos = self._next_pos(self.state.agent_pos, action)
        if next_pos.is_valid():
            self.state.agent_pos = next_pos
        else:
            reward += -25

        if self.state.power_step > 0:
            self.state.power_step -= 1

        hit_ghost = -1
        for g, ghost in enumerate(self.state.ghosts):
            if ghost.pos == self.state.agent_pos:
                hit_ghost = g
            # move ghost
            self._move_ghost(g, ghost_range=self.board["_ghost_range"])

        if hit_ghost >= 0:
            if self.state.power_step > 0:
                reward += 25
                self.state.ghosts[hit_ghost].reset()
            else:
                reward += -100
                self.done = True

        ob = self._make_ob(action)

        if self.state.food_pos[self.grid.get_index(self.state.agent_pos)]:
            if sum(self.state.food_pos) == 0:
                reward += 1000
                self.done = True
            if self.is_power(self.state.agent_pos):
                self.state.power_step = config["_power_steps"]
            reward += 10

        return ob, reward, self.done, {"state": self.state}

    def _make_ob(self, action):
        # TODO fix me
        ob = 0
        for d in range(self.action_space.n):
            if self._see_ghost(action) > 0:
                ob = set_flags(ob, d)
            next_pos = self._next_pos(self.state.agent_pos, direction=d)
            if next_pos.is_valid() and self.is_passable(next_pos):
                ob = set_flags(ob, d + self.action_space.n)
        if self._smell_food():
            ob = set_flags(ob, 8)
        if self._hear_ghost(self.state):
            ob = set_flags(ob, 9)
        return ob

    def _encode_state(self, state):
        poc_idx = self.grid.get_index(state.agent_pos)
        ghosts = [(self.grid.get_index(ghost.pos), ghost.direction)
                  for ghost in state.ghosts]
        return np.concatenate([[poc_idx], *ghosts, state.food_pos,
                               [state.power_step]])

    def _decode_state(self, state):
        poc_state = PocState(Coord(*self.grid.get_coord(state[0])))
        ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1)
        for g in ghosts:
            poc_state.ghosts.append(
                Ghost(pos=self.grid.get_coord(g[0]), direction=g[1]))
        poc_state.power_step = state[-1]
        poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist()
        return poc_state

    def _compute_prob(self, action, next_state, ob):
        return int(ob == self._make_ob(action))

    def _see_ghost(self, action):
        eye_pos = self.state.agent_pos + Moves.get_coord(action)
        while True:
            for g, ghost in enumerate(self.state.ghosts):
                if ghost.pos == eye_pos:
                    return g
            eye_pos += Moves.get_coord(action)
            if not self.grid.is_inside(eye_pos) or not self.is_passable(
                    eye_pos):
                break
        return -1

    def _smell_food(self, smell_range=1):
        for x in range(-smell_range, smell_range + 1):
            for y in range(-smell_range, smell_range + 1):
                smell_pos = Coord(x, y)
                idx = self.grid.get_index(self.state.agent_pos + smell_pos)
                if self.grid.is_inside(self.state.agent_pos +
                                       smell_pos) and self.state.food_pos[idx]:
                    return True
        return False

    @staticmethod
    def _hear_ghost(poc_state, hear_range=2):
        for ghost in poc_state.ghosts:
            if Grid.manhattan_distance(ghost.pos,
                                       poc_state.agent_pos) <= hear_range:
                return True
        return False

    def render(self, mode='human', close=False):
        pass

    def reset(self):
        self.t = 0
        self.done = False
        self._get_init_state()
        return 0

    def close(self):
        pass

    def _get_init_state(self):
        # create walls
        # for tile in self.grid:
        #     value = config["maze"][tile.key[0]]
        #     self.grid.set_value(value, coord=tile.key)

        self.state = PocState()
        self.state.agent_pos = Coord(*self.board["_poc_home"])
        ghost_home = Coord(*self.board["_ghost_home"])

        for g in range(self.board["_num_ghosts"]):
            pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2)
            self.state.ghosts.append(Ghost(pos, direction=-1))

        self.state.food_pos = np.random.binomial(1,
                                                 config["_food_prob"],
                                                 size=self.grid.n_tiles + 1)
        self.state.power_step = 0
        return self.state

    def _next_pos(self, pos, direction):
        direction = Moves.get_coord(direction)
        if pos.x == 0 and pos.y == self.board[
                '_passage_y'] and direction == Moves.EAST:
            next_pos = Coord(self.grid.x_size - 1, pos.y)
        elif pos.x == self.grid.x_size - 1 and pos.y == self.board[
                '_passage_y'] and direction == Moves.WEST:
            next_pos = Coord(0, pos.y)
        else:
            next_pos = pos + direction

        if self.grid.is_inside(next_pos) and self.is_passable(next_pos):
            return next_pos
        else:
            return Coord(-1, -1)

    def _move_ghost(self, g, ghost_range):
        if Grid.manhattan_distance(self.state.agent_pos,
                                   self.state.ghosts[g].pos) < ghost_range:
            if self.state.power_step > 0:
                self._move_defensive(g)
            else:
                self._move_aggressive(g)
        else:
            self._move_random(g)

    def _move_aggressive(self, g, chase_prob=.75):
        if not np.random.binomial(1, p=chase_prob):
            return self._move_random(g)

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)
            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist <= best_dist and new_pos.is_valid() and can_move(
                    self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_defensive(self, g, defensive_prob=.5):
        if np.random.binomial(
                1, defensive_prob) and self.state.ghosts[g].direction >= 0:
            self.state.ghosts[g].direction = -1

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)
            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist >= best_dist and new_pos.is_valid() and can_move(
                    self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_random(self, g):
        # there are no dead ends
        # never switch to opposite direction
        ghost_pos = self.state.ghosts[g].pos
        while True:
            d = self.action_space.sample()
            next_pos = self._next_pos(ghost_pos, d)
            if next_pos.is_valid() and can_move(self.state.ghosts[g], d):
                break

        self.state.ghosts[g].update(next_pos, d)
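
The observation built by _make_ob above is a 10-bit integer. A hedged sketch of the kind of bit-flag helper it relies on; the real set_flags is imported by the example and may differ:

def set_flags(ob: int, bit: int) -> int:
    # set the given bit in the integer observation
    return ob | (1 << bit)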
Example 15
 def test_discrete_space_encode(self):
     discrete_space = Discrete(100)
     value = discrete_space.sample()
     encoded_value = gym_spaces_utils.gym_space_encode(
         discrete_space, value)
     self.assertListEqual([value], encoded_value)
Example 16
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta):
    def __init__(
        self,
        model_name,
        frame_skip=5,
        hand_low=(-0.2, 0.55, 0.05),
        hand_high=(0.2, 0.75, 0.3),
        mocap_low=None,
        mocap_high=None,
        action_scale=1. / 100,
        action_rot_scale=1.,
    ):
        super().__init__(model_name, frame_skip=frame_skip)
        self.action_scale = action_scale
        self.action_rot_scale = action_rot_scale
        self.hand_low = np.array(hand_low)
        self.hand_high = np.array(hand_high)
        if mocap_low is None:
            mocap_low = hand_low
        if mocap_high is None:
            mocap_high = hand_high
        self.mocap_low = np.hstack(mocap_low)
        self.mocap_high = np.hstack(mocap_high)
        self.goal_space = Discrete(1)  # OVERRIDE ME
        self.curr_path_length = 0

        # We use continuous goal space by default and
        # can discretize the goal space by calling
        # the `discretize_goal_space` method.
        self.discrete_goal_space = None
        self.discrete_goals = []
        self.active_discrete_goal = None

    def set_xyz_action(self, action):
        action = np.clip(action, -1, 1)
        pos_delta = action * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]

        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def discretize_goal_space(self, goals):
        assert len(goals) >= 1
        self.discrete_goals = goals
        # update the goal_space to a Discrete space
        self.discrete_goal_space = Discrete(len(self.discrete_goals))

    # Below are methods for using the new wrappers.
    # `sample_goals` is implemented across sawyer_xyz
    # as sampling from the task lists. This will be done
    # with the new `discrete_goals`. Once all the algorithms
    # conform to this API (i.e. use the new wrapper), we can
    # simply remove the trailing underscore from these method signatures.
    def sample_goals_(self, batch_size):
        if self.discrete_goal_space is not None:
            return [
                self.discrete_goal_space.sample() for _ in range(batch_size)
            ]
        else:
            return [self.goal_space.sample() for _ in range(batch_size)]

    def set_goal_(self, goal):
        if self.discrete_goal_space is not None:
            self.active_discrete_goal = goal
            self.goal = self.discrete_goals[goal]
            self._state_goal_idx = np.zeros(len(self.discrete_goals))
            self._state_goal_idx[goal] = 1.
        else:
            self.goal = goal

    def _set_obj_xyz(self, pos):
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)

    def get_site_pos(self, siteName):
        _id = self.model.site_names.index(siteName)
        return self.data.site_xpos[_id].copy()

    def reset(self):
        self.curr_path_length = 0
        return super().reset()
Example 17
 def action(self, state: Box, action_space: Discrete) -> int:
     if self._exploration_policy.should_explore():
         return action_space.sample()
     else:
         predict = self._model.predict(np.array([state]))
         return np.argmax(predict).item()
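
A hedged sketch of the kind of exploration policy the action method above expects; EpsilonGreedy here is hypothetical and not taken from any particular library:

import numpy as np

class EpsilonGreedy:
    def __init__(self, epsilon: float = 0.1):
        self.epsilon = epsilon

    def should_explore(self) -> bool:
        # explore with probability epsilon
        return np.random.random() < self.epsilon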
Example 18
        # wrap the chosen default model with our new model API class
        # (ContActionQModel or TorchContActionQModel). This way, both `forward`
        # and `get_single_q_value` are available in the returned class.
        model_interface=ContActionQModel
        if args.framework != "torch" else TorchContActionQModel,
        name="cont_action_q_model",
    )
    # __sphinx_doc_model_construct_end__

    batch_size = 10
    input_ = np.array([obs_space.sample() for _ in range(batch_size)])

    # Note that for PyTorch, you will have to provide torch tensors here.
    if args.framework == "torch":
        input_ = torch.from_numpy(input_)

    input_dict = {
        "obs": input_,
        "is_training": False,
    }
    # Note that for PyTorch, you will have to provide torch tensors here.
    out, state_outs = my_cont_action_q_model(input_dict=input_dict)
    assert out.shape == (10, 256)
    # Pass `out` and an action into `my_cont_action_q_model`
    action = np.array([action_space.sample() for _ in range(batch_size)])
    if args.framework == "torch":
        action = torch.from_numpy(action)

    q_value = my_cont_action_q_model.get_single_q_value(out, action)
    assert q_value.shape == (10, 1)
Example 19
class Expando(Env):
    """Gym environment wrapping the expando game. For details on the game, check the ExpandoGame class.

    Action-space:
        MultiDiscrete: (move_direction, action_type) with move_direction in {0, ..., 2 * n_axis}, where 0 .. n_axis
        represents movement along an axis in the positive direction and n_axis .. 2 * n_axis in the negative direction,
        and action_type is in {piece_type_0, ..., piece_type_n}, i.e. there is a placement action for each type of piece.

        If `multi_discrete_actions` is set to False, a single discrete action space over all items in the Cartesian
        product of move_direction and action_type is used, i.e. Discrete(n_move_directions * n_piece_types). The order
        of the action pairs is then the same as returned by `itertools.product()`.

    Observation-space description:
        A Box space where each observation has dimensions (axis_0 x axis_1 ... x axis_n x n_one_hot x n_scores)
        where axis_k is the length of the k-th axis of the game board grid,
        n_one_hot = 1 + n_players * (n_piece_types - 1) is the dimension of the piece one-hot encoding and n_scores = 3
        is the number of additional normalized features regarding the player: is_cursor_position, room, population.
        Note that n_one_hot accounts for the empty piece_type which doesn't belong to a player.

        If `flat_observations` is set to True, the box observations are going to be
        (axis_0 * axis_1 * ... * axis_n * n_one_hot + n_scores)-dimensional vectors, where n_scores = 3 + n_axis, since
        the cursor's position is no longer represented as a bit but as normalized (x, y, ...) coordinates.
    """
    def __init__(self,
                 grid_size: tuple,
                 n_players: int = 2,
                 max_turns=100,
                 final_reward=100,
                 piece_types=None,
                 policies_other=None,
                 observe_all=False,
                 multi_discrete_actions=False,
                 flat_observations=False,
                 render=False,
                 cell_size=50,
                 padding=5,
                 ui_font_size=14,
                 seed=None):
        """

        :param grid_size: tuple specifying the dimensions of the game's board.
        :param n_players: number of players participating in the game.
        :param max_turns: maximum number of turns per episode.
        :param final_reward: amount of final reward given to the winner and taken from the losers.
        :param piece_types: list of dict configs describing the possible pieces.
        :param policies_other: list of policies to use for the opponent players.
        :param observe_all: whether to return observations on `step()` for all players in the info dict or not.
        :param multi_discrete_actions: whether to use a multi-discrete action space.
        :param flat_observations: whether to flatten the observations or return them as a tensor.
        :param render: enables rendering when calling `render()`.
        :param cell_size: width/height of a cell when rendering.
        :param padding: padding between cells when rendering.
        :param ui_font_size: size of the ui font when rendering.
        :param seed: random seed.
        """
        grid_size = tuple(grid_size)
        if policies_other is not None:
            assert n_players - 1 == len(
                policies_other), 'please provide a policy for each opponent.'

        self.n_players = n_players
        self.policies_other = policies_other
        self.observe_all = observe_all

        if piece_types is None:
            self.piece_types = self._get_default_piece_types()
        else:
            self.piece_types = piece_types
        n_piece_types = len(self.piece_types)

        # actions: (cursor move direction, piece_type)
        # where (cursor move direction) encodes +1 or -1 movement along an axis and 0 for no movement.
        n_move_directions = 1 + 2 * len(grid_size)
        if multi_discrete_actions:
            self.action_space = MultiDiscrete(
                [n_move_directions, n_piece_types])
        else:
            self.action_space = Discrete(n_move_directions * n_piece_types)

        # observation space:
        # (d_0 * ... * d_n * piece_type * player
        # + cursor_d_0 + ... + cursor_d_n + population + room)
        k_cursor_features = len(grid_size) if flat_observations else 1
        obs_dims = grid_size + (1 + (n_piece_types - 1) * n_players, )
        self.observation_space = OneHotBox(OneHot(obs_dims),
                                           Box(0.0,
                                               1.0,
                                               shape=(2 +
                                                      k_cursor_features, )),
                                           flatten=flat_observations)

        self.game = ExpandoGame(grid_size,
                                n_players,
                                max_turns,
                                final_reward=final_reward,
                                piece_types=self.piece_types,
                                seed=seed)
        self.observation_format = 'flat' if flat_observations else 'grid'
        self.do_render = render
        if self.do_render:
            self.renderer = GameRenderer(self.game, cell_size, padding,
                                         ui_font_size)

        self.seed(seed)

    def step(self, action, other_actions=None):
        """Perform each player's turn.

        :param action: action to take as player 0
        :param other_actions: optional list of actions to take for the other players. Sampled from the action_space
            if not provided.
        :return: obs_0, reward_0, done, info
        """
        if self.policies_other is not None:
            assert other_actions is None, 'other actions are already defined by the policies passed at initialization'

        # other player actions passed as argument
        if other_actions is not None:
            assert len(
                other_actions
            ) + 1 == self.n_players, 'please provide an action for each player'
            rewards_other = [
                self.game.take_turn(action, i)
                for i, action in enumerate(other_actions, start=1)
            ]
        # other player actions defined by policies passed to constructor
        elif self.policies_other is not None:
            other_obs = [
                self.game.get_observation(i, self.observation_format)
                for i in range(1, self.n_players)
            ]
            actions_other = [
                policy.predict(obs)[0][0]
                for obs, policy in zip(other_obs, self.policies_other)
            ]
            rewards_other = [
                self.game.take_turn(a, i)
                for i, a in enumerate(actions_other, start=1)
            ]
        # no other player actions provided: sample
        else:
            rewards_other = [
                self.game.take_turn(self.action_space.sample(), i)
                for i in range(1, self.n_players)
            ]

        info = {}
        if self.observe_all:
            other_obs_new = [
                self.game.get_observation(i, self.observation_format)
                for i in range(1, self.n_players)
            ]
            info = {'rewards_other': rewards_other, 'obs_other': other_obs_new}

        reward_0 = self.game.take_turn(action, player_id=0)
        obs_0 = self.game.get_observation(player_id=0,
                                          formatting=self.observation_format)
        done = self.game.is_done

        if done:
            self.game.reset()

        return obs_0, reward_0, done, info

    def seed(self, seed=None):
        """Set the seeds of all random number generators. Note that pseudo-random actions are performed at
        initialization, so to seed those actions as well you need to pass a seed to the constructor.

        :param seed: seed to set
        """
        self.observation_space.seed(seed)
        self.action_space.seed(seed)
        self.game.seed(seed)

    def reset(self, player_id=0):
        """Reset the environment.

        :param player_id: id of the player to get the first observation from.
        :return: observation of player with player_id or a list of all observations if `observe_all` was set.
        """
        self.game.reset()
        if self.observe_all:
            return [
                self.game.get_observation(i, self.observation_format)
                for i in range(self.n_players)
            ]
        return self.game.get_observation(player_id, self.observation_format)

    def render(self, mode='human'):
        """Render a pyglet visualization. Only works with 2D grids.
        """
        assert len(
            self.game.grid_size
        ) < 3, 'Only 2D grids are supported for rendering at the moment.'
        if self.do_render:
            self.renderer.step()

    @staticmethod
    def from_config(file_path):
        """Load environment using a yaml configuration file or a composable hydra config

        :param file_path: path to the config file
        :return: A configured Expando environment
        """
        file_path = to_absolute_path(file_path)
        conf_dir, file_name = os.path.split(file_path)
        with initialize_config_dir(conf_dir):
            cfg = compose(config_name=file_name)

        env = Expando(**cfg)
        return env

    @staticmethod
    def _get_default_piece_types():
        """Load the default piece types from default_config/

        :return: DictConfig containing piece_types
        """
        this_file_dir = os.path.split(relpath(__file__))[0]
        path = os.path.join(this_file_dir, 'default_config/piece_types.yaml')
        return OmegaConf.load(path).piece_types
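
The docstring above states that, with multi_discrete_actions=False, flat action indices follow the itertools.product() ordering over (move_direction, piece_type). A hedged decoding sketch with illustrative sizes (not taken from any config):

import itertools

n_move_directions, n_piece_types = 5, 3
pairs = list(itertools.product(range(n_move_directions), range(n_piece_types)))
flat_action = 7
move_direction, piece_type = pairs[flat_action]   # (2, 1) under this ordering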
Example 20
class SimpleFetchMdp(GoalEnv):
    def __init__(self, x_dim=5, y_dim=5, **kwargs):
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.num_states = x_dim * y_dim
        # Right, Up, Left, Down, Grab
        self.action_space = Discrete(5)
        self.observation_space = Dict(
            dict(
                desired_goal=Discrete(self.num_states),  # Goal Position
                achieved_goal=Discrete(self.num_states),  # block position
                observation=MultiDiscrete([self.num_states,
                                           2])  #arm position, object in air
            ))

        self._location_space = Discrete(self.num_states)
        self._goal_location = self._location_space.sample()
        self._block_location = self._location_space.sample()
        self._arm_location = self._location_space.sample()
        self._picked_up_block = False

        self.action_handlers = [
            self._move_function(lambda s: s - 1,
                                lambda s: s % self.x_dim == 0),  # right
            self._move_function(lambda s: s - self.x_dim,
                                lambda s: s < self.x_dim),  # up
            self._move_function(lambda s: s + 1, lambda s:
                                (s + 1) % self.x_dim == 0),  # left
            self._move_function(
                lambda s: s + self.x_dim,
                lambda s: s + self.x_dim >= self.x_dim * self.y_dim),  # down
            self._grab
        ]

    def reset(self):
        # Pick a random goal and block location
        self._goal_location = self._location_space.sample()
        self._block_location = self._location_space.sample()
        while self._block_location == self._goal_location:
            self._block_location = self._location_space.sample()
        self._arm_location = self._location_space.sample()
        self._picked_up_block = False
        return self._get_obs()

    def render(self, mode='human'):
        pass

    def close(self):
        pass

    def seed(self, seed=None):
        pass

    def step(self, action):
        self.action_handlers[action]()
        obs = self._get_obs()
        reward = self.compute_reward()
        done = reward == 1.
        info = {}
        return obs, reward, done, info

    # Shortcut for setting the state and getting the output of the action
    def step_for(self, state, action, obs_format='dict'):
        self._goal_location = state[0]
        self._arm_location = state[2]
        if state[1] == -1:
            self._block_location = self._arm_location
            self._picked_up_block = True
        else:
            self._block_location = state[1]
            self._picked_up_block = False

        result = self.step(action)
        if obs_format == 'dict':
            return result
        return ([
            result[0]['desired_goal'], result[0]['achieved_goal'],
            result[0]['observation']
        ], ) + result[1:]

    def compute_reward(self):
        if self._arm_location == self._goal_location and self._picked_up_block:
            return 1.
        return 0.

    def _get_obs(self):
        return dict(desired_goal=self._goal_location,
                    achieved_goal=-1
                    if self._picked_up_block else self._block_location,
                    observation=self._arm_location)

    def _grab(self):
        if self._arm_location == self._block_location:
            self._picked_up_block = True

    def _move_function(self, displace, no_move_if):
        def ret():
            if no_move_if(self._arm_location):
                return
            self._arm_location = displace(self._arm_location)
            if self._picked_up_block:
                self._block_location = displace(self._block_location)

        return ret
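
A brief, hedged check of the observation convention used by _get_obs above: achieved_goal is the block's cell index until the block is grabbed, after which it becomes -1.

env = SimpleFetchMdp()
obs = env.reset()
assert set(obs) == {"desired_goal", "achieved_goal", "observation"}
assert obs["achieved_goal"] == env._block_location   # block not yet picked up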
Example 21
class TigerEnv(gym.Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, seed=0, correct_prob=.85):
        self.correct_prob = correct_prob
        self.action_space = Discrete(len(Action))
        self.state_space = Discrete(len(State))
        self.observation_space = Discrete(len(Obs))
        self._discount = .95
        self._reward_range = (-float(100), float(10))
        self._query = 0
        self.seed(seed)

    def reset(self):
        self.done = False
        self.t = 0
        self._query = 0
        self.state = self.state_space.sample()
        self.last_action = Action.LISTEN.value
        return Obs.NULL.value

    def seed(self, seed=1234):
        np.random.seed(seed)
        return [seed]

    def step(self, action):

        assert self.action_space.contains(action)
        assert self.done is False
        self.t += 1
        self._query += 1
        self.last_action = action

        rw = TigerEnv._compute_rw(self.state, action)
        if TigerEnv._is_terminal(self.state, action):
            self.done = True
            return self.state, rw, self.done, {'state': self.state}

        self._sample_state(action)
        ob = TigerEnv._sample_ob(action, self.state)
        self.done = False
        return ob, rw, self.done, {"state": self.state}

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            if not hasattr(self, "gui"):
                self.gui = TigerGui()
            msg = "A: " + action_to_str(
                self.last_action) + " S: " + state_to_str(self.state)
            self.gui.render(state=(self.last_action, self.state), msg=msg)
        elif mode == "ansi":
            print("Current step: {}, tiger is in state: {}, action taken: {}".
                  format(self.t, self.state, self.last_action))
        else:
            raise NotImplementedError()

    def close(self):
        self.render(close=True)

    def _set_state(self, state):
        self.state = state
        self.done = False

    def _generate_legal(self):
        return list(range(self.action_space.n))

    def _generate_preferred(self, history):
        return self._generate_legal()

    def _sample_state(self, action):
        if action == Action.RIGHT.value or action == Action.LEFT.value:
            self.state = self.state_space.sample()

    def _get_init_state(self):
        # fix initial belief to be exact
        return self.state_space.sample()

    @staticmethod
    def _compute_prob(action, next_state, ob, correct_prob=.85):
        p_ob = 0.0
        if action == Action.LISTEN.value and ob != Obs.NULL.value:
            if (next_state == State.LEFT.value and ob == Obs.LEFT.value) or (
                    next_state == State.RIGHT.value and ob == Obs.RIGHT.value):
                p_ob = correct_prob
            else:
                p_ob = 1 - correct_prob
        elif action != Action.LISTEN.value and ob == Obs.NULL.value:
            p_ob = 1.

        assert p_ob >= 0.0 and p_ob <= 1.0
        return p_ob

    @staticmethod
    def _sample_ob(action, next_state, correct_prob=.85):
        ob = Obs.NULL.value
        p = np.random.uniform()
        if action == Action.LISTEN.value:
            if next_state == State.LEFT.value:
                ob = Obs.RIGHT.value if p > correct_prob else Obs.LEFT.value
            else:
                ob = Obs.LEFT.value if p > correct_prob else Obs.RIGHT.value
        return ob

    @staticmethod
    def _local_move(state, last_action, last_ob):
        raise NotImplementedError()

    @staticmethod
    def _is_terminal(state, action):
        is_terminal = False
        if action != Action.LISTEN.value:
            is_terminal = (
                (action == Action.LEFT.value and state == State.LEFT.value) or
                (action == Action.RIGHT.value and state == State.RIGHT.value))
        return is_terminal

    @staticmethod
    def _compute_rw(state, action):
        if action == Action.LISTEN.value:
            reward = -1
        elif not TigerEnv._is_terminal(state, action):
            reward = 10
        else:
            reward = -100
        return reward
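
A hedged rollout sketch for the Tiger POMDP above, assuming the Action and Obs enums imported by the example (LISTEN/LEFT/RIGHT and NULL members):

env = TigerEnv(seed=0)
ob = env.reset()
for _ in range(3):
    ob, reward, done, info = env.step(Action.LISTEN.value)   # each listen costs -1
# opening a door yields +10 if the tiger is behind the other door,
# otherwise -100 and the episode terminates
ob, reward, done, info = env.step(Action.LEFT.value)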
Example 22
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta):
    def __init__(
        self,
        model_name,
        frame_skip=5,
        hand_low=(-0.2, 0.55, 0.05),
        hand_high=(0.2, 0.75, 0.3),
        mocap_low=None,
        mocap_high=None,
        action_scale=1. / 100,
        action_rot_scale=1.,
    ):
        super().__init__(model_name, frame_skip=frame_skip)
        self.random_init = True
        self.action_scale = action_scale
        self.action_rot_scale = action_rot_scale
        self.hand_low = np.array(hand_low)
        self.hand_high = np.array(hand_high)
        if mocap_low is None:
            mocap_low = hand_low
        if mocap_high is None:
            mocap_high = hand_high
        self.mocap_low = np.hstack(mocap_low)
        self.mocap_high = np.hstack(mocap_high)
        self.curr_path_length = 0
        self._freeze_rand_vec = True
        self._last_rand_vec = None

        # We use continuous goal space by default and
        # can discretize the goal space by calling
        # the `discretize_goal_space` method.
        self.discrete_goal_space = None
        self.discrete_goals = []
        self.active_discrete_goal = None

        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([+1, +1, +1, +1]),
        )

        self._pos_obj_max_len = 6
        self._pos_obj_possible_lens = (3, 6)

        self._set_task_called = False
        self._partially_observable = True

        self._state_goal = None  # OVERRIDE ME

    def _set_task_inner(self):
        # Doesn't absorb "extra" kwargs, to ensure nothing's missed.
        pass

    def set_task(self, task):
        self._set_task_called = True
        data = pickle.loads(task.data)
        assert isinstance(self, data['env_cls'])
        del data['env_cls']
        self._last_rand_vec = data['rand_vec']
        self._freeze_rand_vec = True
        self._last_rand_vec = data['rand_vec']
        del data['rand_vec']
        self._partially_observable = data['partially_observable']
        del data['partially_observable']
        self._set_task_inner(**data)

    def set_xyz_action(self, action):
        action = np.clip(action, -1, 1)
        pos_delta = action * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]

        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def discretize_goal_space(self, goals):
        assert False
        assert len(goals) >= 1
        self.discrete_goals = goals
        # update the goal_space to a Discrete space
        self.discrete_goal_space = Discrete(len(self.discrete_goals))

    # Below are methods for using the new wrappers.
    # `sample_goals` is implemented across sawyer_xyz
    # as sampling from the task lists. This will be done
    # with the new `discrete_goals`. Once all the algorithms
    # conform to this API (i.e. use the new wrapper), we can
    # simply remove the trailing underscore from these method signatures.
    def sample_goals_(self, batch_size):
        assert False
        if self.discrete_goal_space is not None:
            return [
                self.discrete_goal_space.sample() for _ in range(batch_size)
            ]
        else:
            return [self.goal_space.sample() for _ in range(batch_size)]

    def set_goal_(self, goal):
        assert False
        if self.discrete_goal_space is not None:
            self.active_discrete_goal = goal
            self.goal = self.discrete_goals[goal]
            self._state_goal_idx = np.zeros(len(self.discrete_goals))
            self._state_goal_idx[goal] = 1.
        else:
            self.goal = goal

    def _set_obj_xyz(self, pos):
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)

    def get_site_pos(self, siteName):
        _id = self.model.site_names.index(siteName)
        return self.data.site_xpos[_id].copy()

    def _get_pos_objects(self):
        """Retrieves object position(s) from mujoco properties or instance vars

        Returns:
            np.ndarray: Flat array (usually 3 elements) representing the
                object(s)' position(s)
        """
        # Throw error rather than making this an @abc.abstractmethod so that
        # V1 environments don't have to implement it
        raise NotImplementedError

    def _get_pos_goal(self):
        """Retrieves goal position from mujoco properties or instance vars

        Returns:
            np.ndarray: Flat array (3 elements) representing the goal position
        """
        assert isinstance(self._state_goal, np.ndarray)
        assert self._state_goal.ndim == 1
        return self._state_goal

    def _get_obs(self):
        """Combines positions of the end effector, object(s) and goal into a
        single flat observation

        Returns:
            np.ndarray: The flat observation array (12 elements)
        """
        pos_hand = self.get_endeff_pos()

        pos_obj_padded = np.zeros(self._pos_obj_max_len)
        pos_obj = self._get_pos_objects()
        assert len(pos_obj) in self._pos_obj_possible_lens
        pos_obj_padded[:len(pos_obj)] = pos_obj

        pos_goal = self._get_pos_goal()
        if self._partially_observable:
            pos_goal = np.zeros_like(pos_goal)

        return np.hstack((pos_hand, pos_obj_padded, pos_goal))

    def _get_obs_dict(self):
        obs = self._get_obs()
        return dict(
            state_observation=obs,
            state_desired_goal=self._get_pos_goal(),
            state_achieved_goal=obs[3:-3],
        )

    def reset(self):
        self.curr_path_length = 0
        return super().reset()

    def _get_state_rand_vec(self):
        if self._freeze_rand_vec:
            assert self._last_rand_vec is not None
            return self._last_rand_vec
        else:
            rand_vec = np.random.uniform(self.obj_and_goal_space.low,
                                         self.obj_and_goal_space.high,
                                         size=self.obj_and_goal_space.low.size)
            self._last_rand_vec = rand_vec
            return rand_vec

    def sample_tasks(self, num_tasks):
        directions = 2 * self.np_random.binomial(1, p=0.5,
                                                 size=(num_tasks, )) - 1
        tasks = [{'direction': direction} for direction in directions]
        return tasks

    def reset_task(self, task):
        self._task = task
        self._goal_dir = task['direction']
Example 23
 def action(self, state: Box, action_space: Discrete) -> int:
     return action_space.sample()
Example 24
    def test_spaces(self):
        experiment_name = "test_spaces"
        module_name = "module"

        logger = ModuleLogger(
            output_path=Path(self.temp_dir.name),
            experiment_name=experiment_name,
            module=module_name,
            step_write_frequency=None,
            episode_write_frequency=None,
        )
        seed = 3

        # Discrete
        space = Discrete(n=3)
        space.seed(seed)
        logger.log_space("Discrete", space.sample())

        # MultiDiscrete
        space = MultiDiscrete(np.array([3, 2]))
        space.seed(seed)
        logger.log_space("MultiDiscrete", space.sample())

        # Dict
        space = Dict({
            "predictiveChangeVarDiscountedAverage":
            spaces.Box(low=-np.inf, high=np.inf, shape=(1, )),
            "predictiveChangeVarUncertainty":
            spaces.Box(low=0, high=np.inf, shape=(1, )),
            "lossVarDiscountedAverage":
            spaces.Box(low=-np.inf, high=np.inf, shape=(1, )),
            "lossVarUncertainty":
            spaces.Box(low=0, high=np.inf, shape=(1, )),
            "currentLR":
            spaces.Box(low=0, high=1, shape=(1, )),
            "trainingLoss":
            spaces.Box(low=0, high=np.inf, shape=(1, )),
            "validationLoss":
            spaces.Box(low=0, high=np.inf, shape=(1, )),
        })
        space.seed(seed)
        logger.log_space("Dict", space.sample())

        space = Box(np.array([0, 0]), np.array([2, 2]))
        space.seed(seed)
        logger.log_space("Box", space.sample())
        logger.close()

        with open(logger.get_logfile(), "r") as log_file:
            logs = list(map(json.loads, log_file))

        wide = log2dataframe(logs, wide=True)
        long = log2dataframe(logs, drop_columns=None)

        self.assertEqual(len(wide), 1)
        first_row = wide.iloc[0]

        # Discrete
        self.assertTrue(not np.isnan(first_row.Discrete))

        # MultiDiscrete
        self.assertTrue(not np.isnan(first_row.MultiDiscrete_0))
        self.assertTrue(not np.isnan(first_row.MultiDiscrete_1))
        simultaneous_logged = long[(long.name == "MultiDiscrete_0") |
                                   (long.name == "MultiDiscrete_1")]
        self.assertEqual(len(simultaneous_logged.time.unique()), 1)

        # Dict
        expected_columns = [
            "Dict_currentLR_0",
            "Dict_lossVarDiscountedAverage_0",
            "Dict_lossVarUncertainty_0",
            "Dict_predictiveChangeVarDiscountedAverage_0",
            "Dict_predictiveChangeVarUncertainty_0",
            "Dict_trainingLoss_0",
        ]

        for expected_column in expected_columns:
            self.assertTrue(not np.isnan(first_row[expected_column]))

        simultaneous_logged = long[long.name.isin(expected_columns)]
        self.assertEqual(len(simultaneous_logged.time.unique()), 1)

        # Box
        self.assertTrue(not np.isnan(first_row.Box_0))
        self.assertTrue(not np.isnan(first_row.Box_1))

        simultaneous_logged = long[(long.name == "Box_0") |
                                   (long.name == "Box_1")]
        self.assertEqual(len(simultaneous_logged.time.unique()), 1)
Example n. 25
0
class PocEnv(Env):
    def __init__(self, maze, obs_array=False):
        self.board = select_maze(maze)
        self.grid = PocGrid(board=self.board["_maze"])
        self._get_init_state()
        self.action_space = Discrete(4)
        self.observation_space = Discrete(1 << 10)  # 1024
        # self.observation_space = Discrete(14)
        self._reward_range = 100
        self._discount = .95
        self.done = False

        self.gui = None

        if obs_array:
            self._set_flags = set_flags_array
            self._zero = lambda: [0] * 10
        else:
            self._set_flags = set_flags
            self._zero = lambda: 0

    def seed(self, seed=None):
        np.random.seed(seed)

    def is_power(self, idx):
        return self.board['_maze'][idx] == 3

    def is_passable(self, idx):
        return self.board['_maze'][idx] != 0

    def _is_valid(self):

        assert self.grid.is_inside(self.state.agent_pos)
        assert self.is_passable(self.state.agent_pos)
        for ghost in self.state.ghosts:
            assert self.grid.is_inside(ghost.pos)
            assert self.is_passable(ghost.pos)

    def _set_state(self, state):
        self.done = False
        self.state = state

    def _generate_legal(self):
        actions = []
        for action in range(self.action_space.n):
            if self.grid.is_inside(self.state.agent_pos +
                                   Moves.get_coord(action)):
                actions.append(action)
        return actions

    def step(self, action):
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg
        assert self.done is False

        self.state.action = action

        reward = -1
        next_pos = self._next_pos(self.state.agent_pos, action)
        if next_pos.is_valid():
            self.state.agent_pos = next_pos
        else:
            reward += -25

        if self.state.power_step > 0:
            self.state.power_step -= 1

        hit_ghost = -1
        for g, ghost in enumerate(self.state.ghosts):
            if ghost.pos == self.state.agent_pos:
                hit_ghost = g
            else:
                # move ghost
                self._move_ghost(g, ghost_range=self.board["_ghost_range"])
                if ghost.pos == self.state.agent_pos:
                    hit_ghost = g

        if hit_ghost >= 0:
            if self.state.power_step > 0:
                reward += 25
                self.state.ghosts[hit_ghost].reset()
            else:
                reward += -100
                self.done = True
        # don't eat power up when hit by a ghost already
        elif self.is_power(self.state.agent_pos):
            self.state.power_step = config["_power_steps"]
            reward += 10
        # same for food
        elif self.state.food_pos[self.grid.get_index(self.state.agent_pos)]:
            self.state.food_pos[self.grid.get_index(self.state.agent_pos)] = 0
            if sum(self.state.food_pos) == 0:
                reward += 1000
                self.done = True

        obs = self._make_ob()

        return obs, reward, self.done, {"state": self.state}

    def _make_ob(self):
        obs = self._zero()
        for d in range(self.action_space.n):
            if self._see_ghost(d) >= 0:
                obs = self._set_flags(obs, d)
            next_pos = self._next_pos(self.state.agent_pos, direction=d)
            if next_pos.is_valid() and self.is_passable(next_pos):
                obs = self._set_flags(obs, d + self.action_space.n)
        if self._smell_food():
            obs = self._set_flags(obs, 8)
        if self._hear_ghost(self.state):
            obs = self._set_flags(obs, 9)
        return obs
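        # Observation flag layout (matches Discrete(1 << 10) in __init__):
        #   flags 0-3: a ghost is visible along direction d (Moves ordering)
        #   flags 4-7: the adjacent cell in direction d is passable
        #   flag  8:   food is smelled within smell_range
        #   flag  9:   a ghost is heard within hear_range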

    def _encode_state(self, state):
        poc_idx = self.grid.get_index(state.agent_pos)
        ghosts = [(self.grid.get_index(ghost.pos), ghost.direction)
                  for ghost in state.ghosts]

        return np.concatenate([[poc_idx], *ghosts, state.food_pos,
                               [state.power_step]])

    def _decode_state(self, state):
        poc_state = PocState(Coord(*self.grid.get_coord(state[0])))
        # each ghost is encoded as (grid index, direction), i.e. 2 entries
        num_ghosts = self.board["_num_ghosts"]
        ghosts = np.split(state[1:1 + num_ghosts * 2], num_ghosts)
        for g in ghosts:
            poc_state.ghosts.append(
                Ghost(pos=self.grid.get_coord(g[0]), direction=g[1]))
        poc_state.power_step = state[-1]
        poc_state.food_pos = state[1 + num_ghosts * 2:-1].tolist()
        return poc_state

    def _see_ghost(self, action):
        eye_pos = self.state.agent_pos + Moves.get_coord(action)
        while True:
            for g, ghost in enumerate(self.state.ghosts):
                if ghost.pos == eye_pos:
                    return g
            eye_pos += Moves.get_coord(action)
            if not (self.grid.is_inside(eye_pos)
                    and self.is_passable(eye_pos)):
                break
        return -1

    def _smell_food(self, smell_range=1):
        for x in range(-smell_range, smell_range + 1):
            for y in range(-smell_range, smell_range + 1):
                smell_pos = Coord(x, y)
                idx = self.grid.get_index(self.state.agent_pos + smell_pos)
                if self.grid.is_inside(self.state.agent_pos + smell_pos) and\
                        self.state.food_pos[idx]:
                    return True
        return False

    @staticmethod
    def _hear_ghost(poc_state, hear_range=2):
        for ghost in poc_state.ghosts:
            if Grid.manhattan_distance(ghost.pos,
                                       poc_state.agent_pos) <= hear_range:
                return True
        return False

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == 'human':
            if self.gui is None:
                self.gui = PocGui(board_size=self.grid.get_size,
                                  maze=self.board["_maze"],
                                  state=self.state)
            else:
                self.gui.render(state=self.state)

    def reset(self):
        self.done = False
        self._get_init_state()
        return self._make_ob()

    def close(self):
        pass

    def _get_init_state(self):
        self.state = PocState()
        self.state.agent_pos = Coord(*self.board["_poc_home"])
        ghost_home = Coord(*self.board["_ghost_home"])

        for g in range(self.board["_num_ghosts"]):
            pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2)
            self.state.ghosts.append(Ghost(pos, direction=-1))

        self.state.food_pos = np.random.binomial(1,
                                                 config["_food_prob"],
                                                 size=self.grid.n_tiles)
        # only make free space food
        idx = (self.board["_maze"] > 0) &\
              (self.state.food_pos.reshape(self.board["_maze"].shape) > 0)
        self.board["_maze"][idx] = 4
        self.state.power_step = 0
        return self.state

    def _next_pos(self, pos, direction):
        direction = Moves.get_coord(direction)
        if pos.x == 0 and pos.y == self.board['_passage_y'] and\
                direction == Moves.EAST:
            next_pos = Coord(self.grid.x_size - 1, pos.y)
        elif pos.x == self.grid.x_size - 1 and\
                pos.y == self.board['_passage_y'] and direction == Moves.WEST:
            next_pos = Coord(0, pos.y)
        else:
            next_pos = pos + direction

        if self.grid.is_inside(next_pos) and self.is_passable(next_pos):
            return next_pos
        else:
            return Coord(-1, -1)

    def _move_ghost(self, g, ghost_range):
        if Grid.manhattan_distance(self.state.agent_pos,
                                   self.state.ghosts[g].pos) < ghost_range:
            if self.state.power_step > 0:
                self._move_defensive(g)
            else:
                self._move_aggressive(g)
        else:
            self._move_random(g)

    def _move_aggressive(self, g):
        if not np.random.binomial(1, p=config["_chase_prob"]):
            return self._move_random(g)

        best_dist = self.grid.x_size + self.grid.y_size
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)

            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist <= best_dist and new_pos.is_valid() and\
                    can_move(self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_defensive(self, g, defensive_prob=.5):
        if np.random.binomial(1, defensive_prob) and\
                self.state.ghosts[g].direction >= 0:
            self.state.ghosts[g].direction = -1
            return

        best_dist = 0
        best_pos = self.state.ghosts[g].pos
        best_dir = -1
        for d in range(self.action_space.n):
            dist = Grid.directional_distance(self.state.agent_pos,
                                             self.state.ghosts[g].pos, d)

            new_pos = self._next_pos(self.state.ghosts[g].pos, d)
            if dist >= best_dist and new_pos.is_valid() and\
                    can_move(self.state.ghosts[g], d):
                best_pos = new_pos
                best_dist = dist
                best_dir = d

        self.state.ghosts[g].update(best_pos, best_dir)

    def _move_random(self, g):
        # the maze has dead ends, so a randomly moving ghost can get stuck;
        # only allow reversing direction after 10 failed samples (hack)
        ghost_pos = self.state.ghosts[g].pos
        i = 0
        while True:
            d = self.action_space.sample()
            next_pos = self._next_pos(ghost_pos, d)
            # normal map has dead ends:
            if next_pos.is_valid() and (can_move(self.state.ghosts[g], d)
                                        or i > 10):
                break
            i += 1

        self.state.ghosts[g].update(next_pos, d)
Example n. 26
0
class UR3DualXYZEnv(UR3MocapBase, metaclass=abc.ABCMeta):
    def __init__(self,
                 *args,
                 hand_low=(-0.2, 0.55, 0.05),
                 hand_high=(0.2, 0.75, 0.3),
                 second_hand_low=(-0.2, 0.55, 0.05),
                 second_hand_high=(0.2, 0.75, 0.3),
                 mocap_low=None,
                 mocap_high=None,
                 second_mocap_low=None,
                 second_mocap_high=None,
                 action_scale=2. / 100,
                 action_rot_scale=1.,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.action_scale = action_scale
        self.action_rot_scale = action_rot_scale
        self.hand_low = np.array(hand_low)
        self.hand_high = np.array(hand_high)
        self.second_hand_low = np.array(second_hand_low)
        self.second_hand_high = np.array(second_hand_high)
        if mocap_low is None:
            mocap_low = hand_low
        if mocap_high is None:
            mocap_high = hand_high
        if second_mocap_low is None:
            second_mocap_low = second_hand_low
        if second_mocap_high is None:
            second_mocap_high = second_hand_high
        self.mocap_low = np.hstack(mocap_low)
        self.mocap_high = np.hstack(mocap_high)
        self.second_mocap_low = np.hstack(second_mocap_low)
        self.second_mocap_high = np.hstack(second_mocap_high)
        # We use continuous goal space by default and
        # can discretize the goal space by calling
        # the `discretize_goal_space` method.
        self.discrete_goal_space = None
        self.discrete_goals = []
        self.active_discrete_goal = None

    def set_xyz_action(self, action):
        action = np.clip(action, -1, 1)
        pos_delta = action * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]

        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        if self.rotMode == 'vertical_fixed':
            # relative to the ref frame: rotate 180 deg about x, then 90 deg about z
            quat = quat_mul(quat_create(np.array([1., 0, 0]), np.pi),
                            quat_create(np.array([0, 0, 1.]), np.pi / 2))
        elif self.rotMode == 'horizontal_fixed':
            # relative to the ref frame: rotate 180 deg about z, then 90 deg about y
            quat = quat_mul(quat_create(np.array([0, 0, 1.]), np.pi),
                            quat_create(np.array([0, 1., 0]), np.pi / 2))

        # quaternion order appears to be (w, then vector part)
        self.data.set_mocap_quat('mocap', quat)
        # self.data.set_mocap_quat('mocap', np.array([1, 0, 0, 0]))

    def set_xyz_action_rot(self, action):
        action[:3] = np.clip(action[:3], -1, 1)
        pos_delta = action[:3] * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]
        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        rot_axis = action[4:] / np.linalg.norm(action[4:])
        action[3] = action[3] * self.action_rot_scale
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        # replace this with learned rotation

        quat = quat_mul(
            quat_create(np.array([0, 1., 0]), np.pi),
            quat_create(np.array(rot_axis).astype(np.float64), action[3]))
        self.data.set_mocap_quat('mocap', quat)
        # self.data.set_mocap_quat('mocap', np.array([np.cos(action[3]/2), np.sin(action[3]/2)*rot_axis[0], np.sin(action[3]/2)*rot_axis[1], np.sin(action[3]/2)*rot_axis[2]]))
        # self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def set_xyz_action_rotz(self, action):
        action[:3] = np.clip(action[:3], -1, 1)
        pos_delta = action[:3] * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]
        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        zangle_delta = action[3] * self.action_rot_scale
        new_mocap_zangle = ur3_quat_to_zangle(
            self.data.mocap_quat[0]) + zangle_delta

        # new_mocap_zangle = action[3]
        new_mocap_zangle = np.clip(
            new_mocap_zangle,
            -3.0,
            3.0,
        )
        if new_mocap_zangle < 0:
            new_mocap_zangle += 2 * np.pi
        self.data.set_mocap_quat('mocap', ur3_zangle_to_quat(new_mocap_zangle))

    def set_xy_action(self, xy_action, fixed_z):
        delta_z = fixed_z - self.data.mocap_pos[0, 2]
        xyz_action = np.hstack((xy_action, delta_z))
        self.set_xyz_action(xyz_action)

    def discretize_goal_space(self, goals=None):
        if goals is None:
            self.discrete_goals = [self.default_goal]
        else:
            assert len(goals) >= 1
            self.discrete_goals = goals
        # update the goal_space to a Discrete space
        self.discrete_goal_space = Discrete(len(self.discrete_goals))
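
    # Usage sketch (goal values below are illustrative assumptions):
    #   env.discretize_goal_space([np.array([0., .6, .1]),
    #                              np.array([0., .7, .2])])
    #   env.sample_goals_(4)   # e.g. [1, 0, 0, 1]
    #   env.set_goal_(1)       # selects discrete_goals[1], one-hot _state_goal_idx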

    # Below are methods for using the new wrappers.
    # `sample_goals` is implemented across the sawyer_xyz
    # environments as sampling from the task lists. This will be done
    # with the new `discrete_goals`. After all the algorithms
    # conform to this API (i.e. using the new wrapper), we can
    # just remove the underscore in all method signatures.
    def sample_goals_(self, batch_size):
        if self.discrete_goal_space is not None:
            return [
                self.discrete_goal_space.sample() for _ in range(batch_size)
            ]
        else:
            return [self.goal_space.sample() for _ in range(batch_size)]

    def set_goal_(self, goal):
        if self.discrete_goal_space is not None:
            self.active_discrete_goal = goal
            self.goal = self.discrete_goals[goal]
            self._state_goal_idx = np.zeros(len(self.discrete_goals))
            self._state_goal_idx[goal] = 1.
        else:
            self.goal = goal

    def set_init_config(self, config):
        assert isinstance(config, dict)
        for key, val in config.items():
            self.init_config[key] = val

    '''
    Functions that are copied and pasted everywhere and seem
    to be unused.
    '''

    def sample_goals(self, batch_size):
        '''Note: should be replaced by sample_goals_ if not used'''
        # Required by HER-TD3
        goals = self.sample_goals_(batch_size)
        if self.discrete_goal_space is not None:
            goals = [self.discrete_goals[g].copy() for g in goals]
        return {
            'state_desired_goal': goals,
        }

    def sample_task(self):
        '''Note: this can be replaced by sample_goals_(batch_size=1)'''
        goal = self.sample_goals_(1)[0]
        if self.discrete_goal_space is not None:
            return self.discrete_goals[goal]
        else:
            return goal

    def _set_obj_xyz_quat(self, pos, angle):
        quat = quat_create(np.array([0, 0, .1]), angle)
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qpos[12:16] = quat.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)

    def _set_obj_xyz(self, pos):
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)
Example n. 27
0
class FDEnvSelHeur(Env):
    def __init__(self,
                 num_heuristics: int,
                 host: str = '',
                 port: int = 12345,
                 num_steps=None,
                 state_type: Union[int, StateType] = StateType.RAW,
                 seed: int = 12345,
                 max_rand_steps: int = 0,
                 config_dir: str = '.',
                 port_file_id=None,
                 use_general_state_info: bool = True,
                 time_step_limit: int = -1):
        """
        Initialize environment
        """

        self._heuristic_state_features = [
            'Average Value',  # 'Dead Ends Reliable',
            'Max Value',
            'Min Value',
            'Open List Entries',
            'Varianz'
        ]
        self.action_space = Discrete(num_heuristics)
        self._general_state_features = [  #'evaluated_states', 'evaluations', 'expanded_states',
            # 'generated_ops',
            #'generated_states', 'num_variables',
            #'registered_states', 'reopened_states',
            #"cg_num_eff_to_eff", "cg_num_eff_to_pre", "cg_num_pre_to_eff"
        ]

        total_state_features = (num_heuristics *
                                len(self._heuristic_state_features))
        self._use_gsi = use_general_state_info
        if use_general_state_info:
            total_state_features += len(self._general_state_features)
        self.observation_space = Box(
            low=np.array([-np.inf for _ in range(total_state_features)]),
            high=np.array([np.inf for _ in range(total_state_features)]),
            dtype=np.float32)
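        # e.g. num_heuristics=2 -> 2 * 5 heuristic features = 10 entries
        # (plus len(self._general_state_features), currently 0, when
        # use_general_state_info is True), so the Box has shape (10,)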

        self.__skip_transform = [False for _ in range(total_state_features)]
        if use_general_state_info:
            self.__skip_transform[4] = True  # skip num_variables transform
            self.__skip_transform[7] = True
            self.__skip_transform[8] = True
            self.__skip_transform[9] = True

        self.__num_heuristics = num_heuristics
        self.host = host
        self.port = port

        self.socket = None
        self.conn = None

        self._prev_state = None
        self.num_steps = num_steps
        self.time_step_limit = time_step_limit

        self.__state_type = StateType(state_type)
        self.__norm_vals = []
        self._config_dir = config_dir
        self._port_file_id = port_file_id

        self._transformation_func = None
        # create the state transformation function with inputs
        # (current state, previous state, normalization values, skip flag)
        if self.__state_type == StateType.DIFF:
            self._transformation_func = lambda x, y, z, skip: x - y if not skip else x
        elif self.__state_type == StateType.ABSDIFF:
            self._transformation_func = lambda x, y, z, skip: abs(
                x - y) if not skip else x
        elif self.__state_type == StateType.NORMAL:
            self._transformation_func = lambda x, y, z, skip: FDEnvSelHeur._save_div(
                x, z) if not skip else x
        elif self.__state_type == StateType.NORMDIFF:
            self._transformation_func = lambda x, y, z, skip: \
                FDEnvSelHeur._save_div(x, z) - FDEnvSelHeur._save_div(y, z) if not skip else x
        elif self.__state_type == StateType.NORMABSDIFF:
            self._transformation_func = lambda x, y, z, skip:\
                abs(FDEnvSelHeur._save_div(x, z) - FDEnvSelHeur._save_div(y, z)) if not skip else x
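
        # Worked example with current x=4.0, previous y=2.0, norm value z=8.0:
        #   DIFF        -> 4.0 - 2.0           = 2.0
        #   ABSDIFF     -> |4.0 - 2.0|         = 2.0
        #   NORMAL      -> 4.0 / 8.0           = 0.5
        #   NORMDIFF    -> 4.0/8.0 - 2.0/8.0   = 0.25
        #   NORMABSDIFF -> |4.0/8.0 - 2.0/8.0| = 0.25
        # Entries flagged in __skip_transform are passed through unchanged.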

        self.rng = np.random.RandomState(seed=seed)
        self.max_rand_steps = max_rand_steps
        self.__step = 0
        self.__start_time = None
        self.done = True  # Starts as true as the expected behavior is that before normal resets an episode was done.

    @staticmethod
    def _save_div(a, b):
        return np.divide(a, b, out=np.zeros_like(a), where=b != 0)
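        # e.g. _save_div(np.array([1., 2.]), np.array([2., 0.])) -> [0.5, 0.]
        # (division by zero yields 0 instead of inf/nan)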

    def send_msg(self, msg: bytes):
        """
        Send message and prepend the message size

        Based on comment from SO see [1]
        [1] https://stackoverflow.com/a/17668009

        :param msg: The message as bytes
        """
        # Prefix each message with a 4-byte length (network byte order)
        msg = str.encode("{:>04d}".format(len(msg))) + msg
        self.conn.sendall(msg)
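        # Framing example: send_msg(b"3,50") transmits b"00043,50", i.e. a
        # zero-padded 4-character ASCII length prefix followed by the payload;
        # recv_msg below reads that prefix first, then the body.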

    def recv_msg(self):
        """
        Receive a whole message. The message has to be prepended with its total size
        Based on comment from SO see [1]
        """
        # Read message length and unpack it into an integer
        raw_msglen = self.recvall(4)
        if not raw_msglen:
            return None
        msglen = int(raw_msglen.decode())
        # Read the message data
        return self.recvall(msglen)

    def recvall(self, n: int):
        """
        Given we know the size we want to receive, we can receive that amount of bytes.
        Based on comment from SO see [1]

        :param n: Number of bytes to expect in the data
        """
        # Helper function to recv n bytes or return None if EOF is hit
        data = b''
        while len(data) < n:
            packet = self.conn.recv(n - len(data))
            if not packet:
                return None
            data += packet
        return data

    def _process_data(self):
        """
        Split the received message into state, reward and done
        :return:
        """
        msg = self.recv_msg().decode()
        #print("----------------------------")
        #print(msg)
        #print("=>")
        msg = msg.replace('-inf', '0')
        msg = msg.replace('inf', '0')
        #print(msg)
        data = eval(msg)
        r = data['reward']
        done = data['done']
        del data['reward']
        del data['done']

        state = []

        if self._use_gsi:
            for feature in self._general_state_features:
                state.append(data[feature])
        for heuristic_id in range(
                self.__num_heuristics):  # process heuristic data
            for feature in self._heuristic_state_features:
                state.append(data["%d" % heuristic_id][feature])

        if self._prev_state is None:
            self.__norm_vals = deepcopy(state)
            self._prev_state = deepcopy(state)
        if self.__state_type != StateType.RAW:  # Transform state to DIFF state or normalize
            tmp_state = state
            state = list(
                map(self._transformation_func, state, self._prev_state,
                    self.__norm_vals, self.__skip_transform))
            self._prev_state = tmp_state
        return np.array(state), r, done
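        # The planner side is expected to send a Python-literal dict such as
        # (feature values below are illustrative assumptions):
        #   {'reward': -1, 'done': False,
        #    '0': {'Average Value': 12.3, 'Max Value': 20.0, 'Min Value': 4.0,
        #          'Open List Entries': 132, 'Varianz': 3.1},
        #    '1': {...}}
        # with one sub-dict per heuristic, keyed by its index as a string.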

    def step(self, action: typing.Union[int, typing.List[int]]):
        """
        Play RL-Action
        :param action:
        :return:
        """
        self.__step += 1
        if not np.issubdtype(
                type(action),
                np.integer):  # check for core int and any numpy-int
            try:
                action = action[0]
            except IndexError as e:
                print(type(action))
                raise e
        if self.num_steps:
            msg = ','.join([str(action), str(self.num_steps)])
        else:
            msg = str(action)
        self.send_msg(str.encode(msg))
        s, r, d = self._process_data()
        info = {}
        if d:
            self.done = True
            self.kill_connection()
        if 0 < self.time_step_limit < self.__step:
            info['needs_reset'] = True
            self.send_msg(str.encode("END"))
            self.kill_connection()
            self.done = True
        return s, r, d, info

    def reset(self):
        """
        Initialize FD
        :return:
        """
        self._prev_state = None
        self.__step = 0
        self.__start_time = time.time()
        if not self.done:  # This means we interrupt FD before a plan was found
            # Inform FD about imminent shutdown of the connection
            self.send_msg(str.encode("END"))
        self.done = False
        if self.conn:
            self.conn.shutdown(2)
            self.conn.close()
            self.conn = None
        if not self.socket:
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self.socket.bind((self.host, self.port))

        # write down port such that FD can potentially read where to connect to
        if self._port_file_id:
            fp = joinpath(self._config_dir,
                          'port_{:d}.txt'.format(self._port_file_id))
        else:
            fp = joinpath(self._config_dir, 'port.txt')
        with open(fp, 'w') as portfh:
            portfh.write(str(self.port))
        print(fp)

        self.socket.listen()
        self.conn, address = self.socket.accept()
        s, _, _ = self._process_data()
        if self.max_rand_steps > 1:
            for _ in range(self.rng.randint(1, self.max_rand_steps + 1)):
                s, _, _, _ = self.step(self.action_space.sample())
        else:
            s, _, _, _ = self.step(0)  # hard coded to zero as initial step

        # remove the port file so there is no chance of loading the old port
        remove(fp)
        return s

    def kill_connection(self):
        """Kill the connection"""
        if self.conn:
            self.conn.shutdown(2)
            self.conn.close()
            self.conn = None
        if self.socket:
            self.socket.shutdown(2)
            self.socket.close()
            self.socket = None

    def close(self):
        """
        Needs to "kill" the environment
        :return:
        """
        self.kill_connection()

    def render(self, mode: str = 'human') -> None:
        """
        Required by gym.Env but not implemented
        :param mode:
        :return: None
        """
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]