import gym
import numpy as np
from gym.spaces import Box, Discrete


class BitFlippingEnv(gym.Env):
    """Bit-flipping environment: https://arxiv.org/abs/1707.01495

    * The environment has an n-bit state.
    * The initial state and the goal state are randomly selected.
    * An action is one of 0, ..., n-1 and flips a single bit.
    * The reward is 0 if state == goal, otherwise -1 (sparse binary reward).

    Simple RL algorithms tend to fail for large ``n``, e.g. ``n > 40``.
    """

    def __init__(self, n):
        # Spawn independent seed sequences for the environment RNG and both spaces.
        seeds = np.random.SeedSequence().spawn(3)
        self.np_random = np.random.default_rng(seeds[0])
        self.observation_space = Box(low=0, high=1, shape=(n,), dtype=int)
        self.action_space = Discrete(n)
        self.observation_space.seed(seeds[1].entropy)
        self.action_space.seed(seeds[2].entropy)

    def step(self, action):
        action = int(action)
        self.bit[action] = 1 - self.bit[action]  # flip the selected bit
        done = (self.bit == self.goal).all()
        rew = 0 if done else -1
        return self.bit.copy(), rew, done, {}

    def reset(self):
        # Draw a fresh random state and goal (values in {0, 1}).
        self.bit = self.np_random.integers(
            low=0, high=1, size=self.action_space.n, endpoint=True, dtype=int)
        self.goal = self.np_random.integers(
            low=0, high=1, size=self.action_space.n, endpoint=True, dtype=int)
        return self.bit.copy()
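# A minimal usage sketch (not part of the original snippet): roll out one episode of
# BitFlippingEnv with random actions, capped at a fixed number of steps. The helper
# name `_run_random_episode` is illustrative only.
def _run_random_episode(n=8, max_steps=200):
    env = BitFlippingEnv(n)
    obs = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        obs, reward, done, _info = env.step(env.action_space.sample())
        total_reward += reward
        if done:
            break
    return total_reward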
def test_seed_Dict():
    test_space = Dict(
        {
            "a": Box(low=0, high=1, shape=(3, 3)),
            "b": Dict(
                {
                    "b_1": Box(low=-100, high=100, shape=(2,)),
                    "b_2": Box(low=-1, high=1, shape=(2,)),
                }
            ),
            "c": Discrete(5),
        }
    )

    seed_dict = {
        "a": 0,
        "b": {
            "b_1": 1,
            "b_2": 2,
        },
        "c": 3,
    }
    test_space.seed(seed_dict)

    # "Unpack" the dict sub-spaces into individual spaces
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    for i in range(10):
        test_s = test_space.sample()

        a_s = a.sample()
        assert (test_s["a"] == a_s).all()
        b_1_s = b_1.sample()
        assert (test_s["b"]["b_1"] == b_1_s).all()
        b_2_s = b_2.sample()
        assert (test_s["b"]["b_2"] == b_2_s).all()
        c_s = c.sample()
        assert test_s["c"] == c_s
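# A minimal sketch building on the test above (assumes a gym version whose Dict.seed
# accepts a per-key seed dict, as exercised there): re-seeding a Dict space with the
# same seed dict reproduces the same sample sequence. The helper name is illustrative.
def _demo_dict_seed_reproducibility():
    space = Dict({"x": Box(low=0, high=1, shape=(2,)), "y": Discrete(4)})
    space.seed({"x": 0, "y": 1})
    first = [space.sample() for _ in range(3)]
    space.seed({"x": 0, "y": 1})
    second = [space.sample() for _ in range(3)]
    for s1, s2 in zip(first, second):
        assert (s1["x"] == s2["x"]).all()
        assert s1["y"] == s2["y"]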
# Note: `ec`, `Engine`, `rng`, and `seeding` below are project-level dependencies of
# the original snippet and are assumed to be imported elsewhere.
class MouseEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb']}

    def __init__(self):
        # Turn left 45°, Move forward, Turn right 45°
        self.action_space = Discrete(3)
        self._done = False
        self.viewer = None
        self.engine = None
        self.max_step = 100
        self.cur_step = 0
        self.image_size = (720, 720)
        self.seed()
        # 3 Continuous Inputs from both eyes
        self.observation_space = Dict({
            'Right': Box(0, 255, shape=(100, ec.CacheNum, 3), dtype=np.uint8),
            'Left': Box(0, 255, shape=(100, ec.CacheNum, 3), dtype=np.uint8),
        })

    def step(self, action):
        assert self.engine is not None, 'Reset first before starting env'
        if self._done:
            print('The game is already done. Continuing may cause unexpected'
                  ' behaviors')
        if action == 0:
            trans_action = ((0, 0), np.pi / 4)
        elif action == 1:
            trans_action = ((10, 0), 0)
        elif action == 2:
            trans_action = ((0, 0), -np.pi / 4)
        observation, reward, done, info = self.engine.update(trans_action)
        if done:
            self._done = True
        # Check if max_step has been reached
        self.cur_step += 1
        if self.cur_step >= self.max_step:
            self._done = True
            done = True
        return observation, reward, done, info

    def reset(self):
        """Reset the environment and return the initial observation."""
        self._done = False
        self.cur_step = 0
        self.engine = self._new_engine()
        initial_observation = self.engine.initial_observation()
        return initial_observation

    def render(self, mode='human'):
        assert self.engine is not None, 'Reset first before starting env'
        if 'human' in mode:
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer(maxwidth=720)
            self.viewer.imshow(self.engine.image)
        elif 'rgb' in mode:
            return self.engine.image

    def seed(self, seed=None):
        np_random, seed = seeding.np_random(seed)
        rng.np_random = np_random
        self.action_space.seed(seed)

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

    def _new_engine(self):
        return Engine(*self.image_size)
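# For comparison, a minimal sketch of the conventional gym seeding pattern: keep the
# RNG on the env instance and return the seed list. The snippet above instead stores
# the RNG on a module-level `rng` object used by its game engine. The class name
# `_SeededEnvSketch` is illustrative only.
import gym
from gym.spaces import Discrete
from gym.utils import seeding


class _SeededEnvSketch(gym.Env):
    def __init__(self):
        self.action_space = Discrete(3)
        self.np_random = None
        self.seed()

    def seed(self, seed=None):
        # seeding.np_random returns a (rng, seed) pair; seed the action space too so
        # that sampled actions are reproducible.
        self.np_random, seed = seeding.np_random(seed)
        self.action_space.seed(seed)
        return [seed]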
class Expando(Env):
    """Gym environment wrapping the expando game.

    For details on the game, check the ExpandoGame class.

    Action-space:
        MultiDiscrete: (move_direction, action_type) with move_direction in
        {0, ..., 2 * n_axis}, where values up to n_axis represent movement along an
        axis in the positive direction and values above n_axis movement in the
        negative direction, and action_type in {piece_type_0, ..., piece_type_n},
        i.e. there is a placement action for each type of piece.
        If `multi_discrete_actions` is set to False, a single discrete action space
        over the cartesian product of move_direction and action_type is used instead,
        i.e. Discrete(n_move_directions * n_piece_types). The order of the action
        pairs is the same as returned by `itertools.product()`.

    Observation-space description:
        A Box space where each observation has dimensions
        (axis_0 x axis_1 ... x axis_n x n_one_hot x n_scores), where axis_k is the
        length of the k-th axis of the game board grid,
        n_one_hot = 1 + n_players * (n_piece_types - 1) is the dimension of the
        pieces' one-hot encodings, and n_scores = 3 is the number of additional
        normalized features regarding the player: is_cursor_position, room,
        population. Note that n_one_hot accounts for the empty piece_type, which
        doesn't belong to any player.
        If `flat_observations` is set to True, the box observations are
        (axis_0 * axis_1 ... * axis_n * n_one_hot + n_scores)-dimensional vectors,
        where n_scores = 3 + n_axis, since the cursor's position is no longer
        represented as a bit but as normalized (x, y, ...) coordinates.
    """

    def __init__(self, grid_size: tuple, n_players: int = 2, max_turns=100,
                 final_reward=100, piece_types=None, policies_other=None,
                 observe_all=False, multi_discrete_actions=False,
                 flat_observations=False, render=False, cell_size=50, padding=5,
                 ui_font_size=14, seed=None):
        """
        :param grid_size: tuple specifying the dimensions of the game's board.
        :param n_players: number of players participating in the game.
        :param max_turns: maximum number of turns per episode.
        :param final_reward: amount of final reward given to the winner and taken from the losers.
        :param piece_types: list of dict configs describing the possible pieces.
        :param policies_other: list of policies to use for the opponent players.
        :param observe_all: whether to return observations on `step()` for all players in the info dict or not.
        :param multi_discrete_actions: whether to use a multi-discrete action space.
        :param flat_observations: whether to flatten the observations or return them as a tensor.
        :param render: enables rendering when calling `render()`.
        :param cell_size: width/height of a cell when rendering.
        :param padding: padding between cells when rendering.
        :param ui_font_size: size of the ui font when rendering.
        :param seed: random seed.
        """
        grid_size = tuple(grid_size)
        if policies_other is not None:
            assert n_players - 1 == len(policies_other), \
                'please provide a policy for each opponent.'
        self.n_players = n_players
        self.policies_other = policies_other
        self.observe_all = observe_all

        if piece_types is None:
            self.piece_types = self._get_default_piece_types()
        else:
            self.piece_types = piece_types
        n_piece_types = len(self.piece_types)

        # actions: (cursor move direction, piece_type)
        # where (cursor move direction) encodes +1 or -1 movement along an axis and 0 for no movement.
        n_move_directions = 1 + 2 * len(grid_size)
        if multi_discrete_actions:
            self.action_space = MultiDiscrete([n_move_directions, n_piece_types])
        else:
            self.action_space = Discrete(n_move_directions * n_piece_types)

        # observation space:
        # (d_0 * ... * d_n * piece_type * player
        #  + cursor_d_0 + ... + cursor_d_n + population + room)
        k_cursor_features = len(grid_size) if flat_observations else 1
        obs_dims = grid_size + (1 + (n_piece_types - 1) * n_players,)
        self.observation_space = OneHotBox(OneHot(obs_dims),
                                           Box(0.0, 1.0, shape=(2 + k_cursor_features,)),
                                           flatten=flat_observations)

        self.game = ExpandoGame(grid_size, n_players, max_turns,
                                final_reward=final_reward,
                                piece_types=self.piece_types, seed=seed)
        self.observation_format = 'flat' if flat_observations else 'grid'

        self.do_render = render
        if self.do_render:
            self.renderer = GameRenderer(self.game, cell_size, padding, ui_font_size)

        self.seed(seed)

    def step(self, action, other_actions=None):
        """Perform each player's turn.

        :param action: action to take as player 0.
        :param other_actions: optional list of actions to take for the other players.
            Sampled from the action space if not provided.
        :return: obs_0, reward_0, done, info
        """
        if self.policies_other is not None:
            assert other_actions is None, \
                'other actions are already defined by the policies passed at initialization'

        # other player actions passed as argument
        if other_actions is not None:
            assert len(other_actions) + 1 == self.n_players, \
                'please provide an action for each player'
            rewards_other = [self.game.take_turn(action, i)
                             for i, action in enumerate(other_actions, start=1)]
        # other player actions defined by policies passed to the constructor
        elif self.policies_other is not None:
            other_obs = [self.game.get_observation(i, self.observation_format)
                         for i in range(1, self.n_players)]
            actions_other = [policy.predict(obs)[0][0]
                             for obs, policy in zip(other_obs, self.policies_other)]
            rewards_other = [self.game.take_turn(a, i)
                             for i, a in enumerate(actions_other, start=1)]
        # no other player actions provided: sample
        else:
            rewards_other = [self.game.take_turn(self.action_space.sample(), i)
                             for i in range(1, self.n_players)]

        info = {}
        if self.observe_all:
            other_obs_new = [self.game.get_observation(i, self.observation_format)
                             for i in range(1, self.n_players)]
            info = {'rewards_other': rewards_other, 'obs_other': other_obs_new}

        reward_0 = self.game.take_turn(action, player_id=0)
        obs_0 = self.game.get_observation(player_id=0, formatting=self.observation_format)

        done = self.game.is_done
        if done:
            self.game.reset()

        return obs_0, reward_0, done, info

    def seed(self, seed=None):
        """Set the seeds of all random number generators.

        Note that pseudo-random actions are performed at initialization, so in order
        to seed these actions as well you need to pass a seed to the constructor.

        :param seed: seed to set
        """
        self.observation_space.seed(seed)
        self.action_space.seed(seed)
        self.game.seed(seed)

    def reset(self, player_id=0):
        """Reset the environment.

        :param player_id: id of the player to get the first observation from.
        :return: observation of the player with player_id, or a list of all
            observations if `observe_all` was set.
        """
        self.game.reset()
        if self.observe_all:
            return [self.game.get_observation(i, self.observation_format)
                    for i in range(self.n_players)]
        return self.game.get_observation(player_id, self.observation_format)

    def render(self, mode='human'):
        """Render a pyglet visualization. Only works with 2D grids."""
        assert len(self.game.grid_size) < 3, \
            'Only 2D grids are supported for rendering at the moment.'
        if self.do_render:
            self.renderer.step()

    @staticmethod
    def from_config(file_path):
        """Load the environment from a yaml configuration file or a composable hydra config.

        :param file_path: path to the config file
        :return: a configured Expando environment
        """
        file_path = to_absolute_path(file_path)
        conf_dir, file_name = os.path.split(file_path)
        with initialize_config_dir(conf_dir):
            cfg = compose(config_name=file_name)
        env = Expando(**cfg)
        return env

    @staticmethod
    def _get_default_piece_types():
        """Load the default piece types from default_config/.

        :return: DictConfig containing piece_types
        """
        this_file_dir = os.path.split(relpath(__file__))[0]
        path = os.path.join(this_file_dir, 'default_config/piece_types.yaml')
        return OmegaConf.load(path).piece_types
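# A minimal sketch (not part of the Expando source): decoding a flat Discrete action
# back into a (move_direction, piece_type) pair, assuming the pair order matches
# `itertools.product()` as stated in the docstring above. The helper name
# `decode_flat_action` is illustrative only.
import itertools


def decode_flat_action(flat_action, n_move_directions, n_piece_types):
    pairs = list(itertools.product(range(n_move_directions), range(n_piece_types)))
    return pairs[flat_action]


# Example: a 2D grid gives n_move_directions = 1 + 2 * 2 = 5; with 3 piece types,
# flat action 7 corresponds to moving in direction 2 and placing piece type 1.
assert decode_flat_action(0, 5, 3) == (0, 0)
assert decode_flat_action(7, 5, 3) == (2, 1)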
def test_spaces(self):
    experiment_name = "test_spaces"
    module_name = "module"
    logger = ModuleLogger(
        output_path=Path(self.temp_dir.name),
        experiment_name=experiment_name,
        module=module_name,
        step_write_frequency=None,
        episode_write_frequency=None,
    )
    seed = 3

    # Discrete
    space = Discrete(n=3)
    space.seed(seed)
    logger.log_space("Discrete", space.sample())

    # MultiDiscrete
    space = MultiDiscrete(np.array([3, 2]))
    space.seed(seed)
    logger.log_space("MultiDiscrete", space.sample())

    # Dict
    space = Dict({
        "predictiveChangeVarDiscountedAverage": spaces.Box(low=-np.inf, high=np.inf, shape=(1,)),
        "predictiveChangeVarUncertainty": spaces.Box(low=0, high=np.inf, shape=(1,)),
        "lossVarDiscountedAverage": spaces.Box(low=-np.inf, high=np.inf, shape=(1,)),
        "lossVarUncertainty": spaces.Box(low=0, high=np.inf, shape=(1,)),
        "currentLR": spaces.Box(low=0, high=1, shape=(1,)),
        "trainingLoss": spaces.Box(low=0, high=np.inf, shape=(1,)),
        "validationLoss": spaces.Box(low=0, high=np.inf, shape=(1,)),
    })
    space.seed(seed)
    logger.log_space("Dict", space.sample())

    # Box
    space = Box(np.array([0, 0]), np.array([2, 2]))
    space.seed(seed)
    logger.log_space("Box", space.sample())

    logger.close()

    with open(logger.get_logfile(), "r") as log_file:
        logs = list(map(json.loads, log_file))

    wide = log2dataframe(logs, wide=True)
    long = log2dataframe(logs, drop_columns=None)

    self.assertEqual(len(wide), 1)
    first_row = wide.iloc[0]

    # Discrete
    self.assertTrue(not np.isnan(first_row.Discrete))

    # MultiDiscrete
    self.assertTrue(not np.isnan(first_row.MultiDiscrete_0))
    self.assertTrue(not np.isnan(first_row.MultiDiscrete_1))
    simultaneous_logged = long[(long.name == "MultiDiscrete_0")
                               | (long.name == "MultiDiscrete_1")]
    self.assertEqual(len(simultaneous_logged.time.unique()), 1)

    # Dict
    expected_columns = [
        "Dict_currentLR_0",
        "Dict_lossVarDiscountedAverage_0",
        "Dict_lossVarUncertainty_0",
        "Dict_predictiveChangeVarDiscountedAverage_0",
        "Dict_predictiveChangeVarUncertainty_0",
        "Dict_trainingLoss_0",
    ]
    for expected_column in expected_columns:
        self.assertTrue(not np.isnan(first_row[expected_column]))
    simultaneous_logged = long[long.name.isin(expected_columns)]
    self.assertEqual(len(simultaneous_logged.time.unique()), 1)

    # Box
    self.assertTrue(not np.isnan(first_row.Box_0))
    self.assertTrue(not np.isnan(first_row.Box_1))
    simultaneous_logged = long[(long.name == "Box_0") | (long.name == "Box_1")]
    self.assertEqual(len(simultaneous_logged.time.unique()), 1)
class MouseEnv_cl(gym.Env):
    """MouseEnv_cl-v2

    This version no longer computes the mouse-apple distance. Instead, it can have
    multiple apples.
    """
    metadata = {'render.modes': ['human', 'rgb']}

    def __init__(self, **kwargs):
        """
        kwargs
        ------
        apple_num : int
            Number of apples in a map. Default is 1.
        eat_apple : float
            Reward given when an apple is eaten. Default is 1.0.
        hit_wall : float
            Punishment (or reward) given when hitting a wall. Default is 0.
        """
        # Turn left 45°, Move forward, Turn right 45°
        self.action_space = Discrete(3)
        self._done = False
        self.viewer = None
        self.engine = None
        kwargs.setdefault('apple_num', 1)
        kwargs.setdefault('eat_apple', 1.0)
        kwargs.setdefault('hit_wall', 0)
        self._options = kwargs
        self.max_step = 1000
        self.cur_step = 0
        self.image_size = (720, 720)
        self.seed()
        # 3 Continuous Inputs from both eyes
        self.observation_space = Dict({
            'Right': Box(0, 255, shape=(100, ec.CacheNum, 3), dtype=np.uint8),
            'Left': Box(0, 255, shape=(100, ec.CacheNum, 3), dtype=np.uint8),
        })

    def step(self, action):
        assert self.engine is not None, 'Reset first before starting env'
        if self._done:
            print('The game is already done. Continuing may cause unexpected'
                  ' behaviors')
        if action == 0:
            trans_action = ((0, 0), np.pi / 4)
        elif action == 1:
            trans_action = ((10, 0), 0)
        elif action == 2:
            trans_action = ((0, 0), -np.pi / 4)
        observation, reward, done, info = self.engine.update(trans_action)
        if done:
            self._done = True
        # Check if max_step has been reached
        self.cur_step += 1
        if self.cur_step >= self.max_step:
            self._done = True
            done = True
        return observation, reward, done, info

    def reset(self):
        """Reset the environment and return the initial observation."""
        self._done = False
        self.cur_step = 0
        self.engine = self._new_engine()
        initial_observation = self.engine.initial_observation()
        return initial_observation

    def render(self, mode='human'):
        assert self.engine is not None, 'Reset first before starting env'
        if 'human' in mode:
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer(maxwidth=720)
            self.viewer.imshow(self.engine.image)
        elif 'rgb' in mode:
            return self.engine.image

    def seed(self, seed=None):
        np_random, seed = seeding.np_random(seed)
        rng.np_random = np_random
        self.action_space.seed(seed)

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

    def _new_engine(self):
        return Engine(self.image_size, **self._options)