class IdentityEnv(Env):
    def __init__(self, dim, ep_length=100):
        """
        Identity environment for testing purposes

        :param dim: (int) the size of the dimensions you want to learn
        :param ep_length: (int) the length of each episode in timesteps
        """
        self.action_space = Discrete(dim)
        self.observation_space = self.action_space
        self.ep_length = ep_length
        self.current_step = 0
        self.dim = dim
        self.reset()

    def reset(self):
        self.current_step = 0
        self._choose_next_state()
        return self.state

    def step(self, action):
        reward = self._get_reward(action)
        self._choose_next_state()
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.state, reward, done, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, action):
        return 1 if np.all(self.state == action) else 0

    def render(self, mode='human'):
        pass
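# A minimal smoke test for IdentityEnv, assuming `gym.Env`, `gym.spaces.Discrete`
# and `numpy` are imported as in the snippet above; `dim=4` and `ep_length=10`
# are illustrative values, not taken from the original code.
env = IdentityEnv(dim=4, ep_length=10)
obs = env.reset()
done, episode_return = False, 0
while not done:
    action = env.action_space.sample()
    next_obs, reward, done, _ = env.step(action)
    # The reward is 1 exactly when the guessed action matches the previous observation.
    assert reward == (1 if action == obs else 0)
    obs = next_obs
    episode_return += reward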
class IdentityEnv(Env):
    def __init__(
        self,
        dim,
        ep_length=100,
    ):
        self.action_space = Discrete(dim)
        self.reset()

    def reset(self):
        self._choose_next_state()
        self.observation_space = self.action_space
        return self.state

    def step(self, actions):
        rew = self._get_reward(actions)
        self._choose_next_state()
        return self.state, rew, False, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, actions):
        return 1 if self.state == actions else 0
class ValueFunction:
    # In this example I use the tiling software instead of implementing standard tiling by myself
    # One important thing is that tiling is only a map from (state, action) to a series of indices
    # It doesn't matter whether the indices have meaning, only whether this map satisfies some property
    # View the following webpage for more information
    # http://incompleteideas.net/sutton/tiles/tiles3.html
    # @max_size: the maximum # of indices
    def __init__(self, alpha, n_actions, num_of_tilings=8, max_size=2048):
        self.action_space = Discrete(n_actions)
        self.max_size = max_size
        self.num_of_tilings = num_of_tilings

        # divide step size equally to each tiling
        self.step_size = alpha / num_of_tilings

        self.hash_table = IHT(max_size)

        # weight for each tile
        self.weights = np.zeros(max_size)

        # position and velocity need scaling to satisfy the tile software
        self.position_scale = self.num_of_tilings / (POSITION_MAX - POSITION_MIN)
        self.velocity_scale = self.num_of_tilings / (VELOCITY_MAX - VELOCITY_MIN)

    # get indices of active tiles for given state and action
    def _get_active_tiles(self, position, velocity, action):
        # I think positionScale * (position - position_min) would be a good normalization.
        # However positionScale * position_min is a constant, so it's ok to ignore it.
        active_tiles = tiles(
            self.hash_table, self.num_of_tilings,
            [self.position_scale * position, self.velocity_scale * velocity],
            [action])
        return active_tiles

    # estimate the value of given state and action
    def __call__(self, state, action):
        position, velocity = tuple(state)
        if position == POSITION_MAX:
            return 0.0
        active_tiles = self._get_active_tiles(position, velocity, action)
        return np.sum(self.weights[active_tiles])

    # learn with given state, action and target
    def update(self, target, state, action):
        active_tiles = self._get_active_tiles(state[0], state[1], action)
        estimation = np.sum(self.weights[active_tiles])
        delta = self.step_size * (target - estimation)
        for active_tile in active_tiles:
            self.weights[active_tile] += delta

    def act(self, state, epsilon=0):
        if np.random.random() < epsilon:
            return self.action_space.sample()
        return np.argmax(
            [self(state, action) for action in range(self.action_space.n)])
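# A sketch of how ValueFunction could drive one episode of semi-gradient SARSA,
# assuming `env` is a MountainCar-style gym environment whose observations are
# (position, velocity) pairs; `epsilon` and `gamma` are illustrative defaults.
def run_sarsa_episode(env, value_function, epsilon=0.1, gamma=1.0):
    state = env.reset()
    action = value_function.act(state, epsilon)
    done = False
    while not done:
        next_state, reward, done, _ = env.step(action)
        if done:
            target = reward
        else:
            next_action = value_function.act(next_state, epsilon)
            target = reward + gamma * value_function(next_state, next_action)
        value_function.update(target, state, action)
        if not done:
            state, action = next_state, next_action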
def test_qagent_1():
    agent = QLearningAgent(
        action_space=Discrete(3),
        obs_space=Discrete(3),
        gamma=0.99,
        alpha=1.0,
        epsilon=0.9,
    )
    space = Discrete(10)
    action = space.sample()
    obs = space.sample()
    reward = 0.0
    obs_n = space.sample()
    agent.learn(obs, action, reward, obs_n, False)
    assert (obs, action) in agent.q_table
    assert type(agent.q_table[(obs, action)]) == float
def test_space_utils():
    # Box
    box = Box(-1.0, 1.0, shape=[2, 3], dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 3
    assert flatten(box, sample).shape == (2 * 3, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    x = np.array([[1.0, 1.0], [1.0, 1.0]])
    box = Box(low=-x, high=x, dtype=np.float32)
    sample = box.sample()
    assert flatdim(box) == 2 * 2
    assert flatten(box, sample).shape == (2 * 2, )
    assert np.allclose(sample, unflatten(box, flatten(box, sample)))

    # Discrete
    discrete = Discrete(5)
    sample = discrete.sample()
    assert flatdim(discrete) == 5
    assert flatten(discrete, sample).shape == (5, )
    assert sample == unflatten(discrete, flatten(discrete, sample))

    # Tuple
    S = Tuple([
        Discrete(5),
        Box(-1.0, 1.0, shape=(2, 3), dtype=np.float32),
        Dict({
            'success': Discrete(2),
            'velocity': Box(-1, 1, shape=(1, 3), dtype=np.float32)
        })
    ])
    sample = S.sample()
    assert flatdim(S) == 5 + 2 * 3 + 2 + 3
    assert flatten(S, sample).shape == (16, )
    _sample = unflatten(S, flatten(S, sample))
    assert sample[0] == _sample[0]
    assert np.allclose(sample[1], _sample[1])
    assert sample[2]['success'] == _sample[2]['success']
    assert np.allclose(sample[2]['velocity'], _sample[2]['velocity'])

    # Dict
    D0 = Dict({
        'position': Box(-100, 100, shape=(3, ), dtype=np.float32),
        'velocity': Box(-1, 1, shape=(4, ), dtype=np.float32)
    })
    D = Dict({'sensors': D0, 'score': Discrete(100)})
    sample = D.sample()
    assert flatdim(D) == 3 + 4 + 100
    assert flatten(D, sample).shape == (107, )
    _sample = unflatten(D, flatten(D, sample))
    assert sample['score'] == _sample['score']
    assert np.allclose(sample['sensors']['position'],
                       _sample['sensors']['position'])
    assert np.allclose(sample['sensors']['velocity'],
                       _sample['sensors']['velocity'])
def test_trajectory(self):
    """Tests the Trajectory class."""
    buffer_size = 5
    # Small trajectory object for testing purposes.
    trajectory = Trajectory(buffer_size=buffer_size)
    self.assertEqual(trajectory.cursor, 0)
    self.assertEqual(trajectory.timestep, 0)
    self.assertEqual(trajectory.sample_batch_offset, 0)
    assert not trajectory.buffers
    observation_space = Box(-1.0, 1.0, shape=(3, ))
    action_space = Discrete(2)
    trajectory.add_init_obs(env_id=0,
                            agent_id="agent",
                            policy_id="policy",
                            init_obs=observation_space.sample())
    self.assertEqual(trajectory.cursor, 0)
    self.assertEqual(trajectory.initial_obs.shape, observation_space.shape)

    # Fill up the buffer and make it extend if it hits the limit.
    cur_buffer_size = buffer_size
    for i in range(buffer_size + 1):
        trajectory.add_action_reward_next_obs(
            env_id=0,
            agent_id="agent",
            policy_id="policy",
            values=dict(
                t=i,
                actions=action_space.sample(),
                rewards=1.0,
                dones=i == buffer_size,
                new_obs=observation_space.sample(),
                action_logp=-0.5,
                action_dist_inputs=np.array([[0.5, 0.5]]),
            ))
        self.assertEqual(trajectory.cursor, i + 1)
        self.assertEqual(trajectory.timestep, i + 1)
        self.assertEqual(trajectory.sample_batch_offset, 0)
        if i == buffer_size - 1:
            cur_buffer_size *= 2
        self.assertEqual(len(trajectory.buffers["new_obs"]), cur_buffer_size)
        self.assertEqual(len(trajectory.buffers["rewards"]), cur_buffer_size)

    # Create a SampleBatch from the Trajectory and reset it.
    batch = trajectory.get_sample_batch_and_reset()
    self.assertEqual(batch.count, buffer_size + 1)
    # Make sure the Trajectory was reset properly.
    self.assertEqual(trajectory.cursor, buffer_size + 1)
    self.assertEqual(trajectory.timestep, 0)
    self.assertEqual(trajectory.sample_batch_offset, buffer_size + 1)
class IntegerSphere(gym.Env):
    # An integer/discrete form of the sphere function
    def __init__(self):
        lb = -100
        ub = 100
        self.nx = 5
        self.action_space = Discrete(201)
        self.real_actions = list(range(lb, ub + 1))
        self.observation_space = Box(low=min(self.real_actions),
                                     high=max(self.real_actions),
                                     shape=(self.nx, ),
                                     dtype=int)
        self.episode_length = 50
        self.reset()
        self.done = False
        self.counter = 0

    def step(self, action):
        individual = [self.real_actions[action]] * self.nx
        reward = self.fit(individual=individual)
        self.counter += 1
        if self.counter == self.episode_length:
            self.done = True
            self.counter = 0
        return individual, reward, self.done, {'x': individual}

    def fit(self, individual):
        """Sphere test objective function.

        F(x) = sum_{i=1}^d xi^2
        d = 1, 2, 3, ...
        Range: [-100, 100]
        Minima: 0
        """
        # -1 is used to convert minimization to maximization
        return -sum(x**2 for x in individual)

    def reset(self):
        self.done = False
        ac = self.action_space.sample()
        individual = [self.real_actions[ac]] * self.nx
        return individual

    def render(self, mode='human'):
        pass
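# Hedged usage check for IntegerSphere: the discrete action is an index into
# `real_actions`, so the reward is the negated sphere value of the chosen point.
env = IntegerSphere()
obs = env.reset()
action = env.action_space.sample()
individual, reward, done, info = env.step(action)
assert individual == [env.real_actions[action]] * env.nx
assert reward == -sum(x ** 2 for x in individual)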
def test_seed_Dict():
    test_space = Dict({
        "a": Box(low=0, high=1, shape=(3, 3)),
        "b": Dict({
            "b_1": Box(low=-100, high=100, shape=(2,)),
            "b_2": Box(low=-1, high=1, shape=(2,)),
        }),
        "c": Discrete(5),
    })
    seed_dict = {
        "a": 0,
        "b": {
            "b_1": 1,
            "b_2": 2,
        },
        "c": 3,
    }
    test_space.seed(seed_dict)

    # "Unpack" the dict sub-spaces into individual spaces
    a = Box(low=0, high=1, shape=(3, 3))
    a.seed(0)
    b_1 = Box(low=-100, high=100, shape=(2,))
    b_1.seed(1)
    b_2 = Box(low=-1, high=1, shape=(2,))
    b_2.seed(2)
    c = Discrete(5)
    c.seed(3)

    for i in range(10):
        test_s = test_space.sample()

        a_s = a.sample()
        assert (test_s["a"] == a_s).all()

        b_1_s = b_1.sample()
        assert (test_s["b"]["b_1"] == b_1_s).all()

        b_2_s = b_2.sample()
        assert (test_s["b"]["b_2"] == b_2_s).all()

        c_s = c.sample()
        assert test_s["c"] == c_s
class TestEnv(Env):
    def __init__(self, action_size=5, observation_size=5, max_depth=20):
        self.action_space = Discrete(action_size)
        self.observation_space = Discrete(observation_size)
        self.max_depth = max_depth
        self._discount = .95
        self._reward_range = 1

    def reset(self):
        self.state = 0

    def _get_init_state(self):
        # self.state = 0
        return 0  # self.state

    def _set_state(self, state):
        self.state = state

    def step(self, action):
        if self.state < self.max_depth and action == 0:
            rw = 1.0
        else:
            rw = 0.0
        ob = self.observation_space.sample()
        self.state += 1
        return ob, rw, False, {"state": self.state, "p_ob": 1.}

    def optimal_value(self):
        discount = 1.
        total_rw = 0.
        for n in range(self.max_depth):
            total_rw += discount
            discount *= self._discount
        return total_rw

    def mean_value(self):
        discount = 1.
        total_rw = 0.
        for n in range(self.max_depth):
            total_rw += discount / self.action_space.n
            discount *= self._discount
        return total_rw
class Player(object):
    """Abstract player that samples actions from a uniform random distribution."""

    def __init__(self):
        self.action_space = Discrete(7)
        self.mode = PlayerModes.batting

    def reset(self, mode=PlayerModes.batting):
        """Use reset to set the player's mode.

        Args:
            mode: Some mode to which to reset.
        """
        self.mode = mode

    def __call__(self, obs, reward, done, info):
        """Player will consume everything that the previous step provided."""
        return self.action_space.sample()

    def __repr__(self):
        return '{} instance'.format(type(self).__name__)
class ElFarolEnv(Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, n_agents=100, threshold=60, g=10, s=5, b=1):
        if g < s or s < b:
            raise Exception("rewards must be ordered g > s > b")

        self.n_agents = n_agents
        self.action_space = Discrete(2)
        # observe 0 if the agent did not attend, otherwise the number of agents who attended
        self.observation_space = Discrete(n_agents)
        self.reward_range = (b, g)

        def reward_func(action, n_attended):
            if action == 0:
                return s
            elif n_attended <= threshold:
                return g
            else:
                return b

        self.reward_func = reward_func
        self.prev_action = [
            self.action_space.sample() for _ in range(n_agents)
        ]

    def _step(self, action):
        n_attended = sum(action)
        observation = [n_attended if a else 0 for a in action]
        reward = [self.reward_func(a, n_attended) for a in action]
        self.prev_action = action
        return observation, reward, False, ()

    def _reset(self):
        pass

    def _render(self, mode='human', close=False):
        if mode == 'human':
            print(str(sum(self.prev_action)))
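# Hedged usage sketch for ElFarolEnv: every agent submits a binary attend/stay
# decision, and the environment returns per-agent observations and rewards.
# The constructor arguments below are illustrative.
env = ElFarolEnv(n_agents=10, threshold=6)
joint_action = [env.action_space.sample() for _ in range(env.n_agents)]
obs, rewards, done, _ = env._step(joint_action)
assert len(obs) == env.n_agents and len(rewards) == env.n_agents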
class DiscreteMaskEnv(gym.Env):
    metadata = {'render.modes': ['human', 'system', 'none']}

    def __init__(self):
        self.action_space = Discrete(5)
        self.observation_space = Discrete(3)
        self.current_step = 0
        self._action_mask = torch.ones(self.action_space.n)

    def reset(self):
        self.current_step = 0
        self._action_mask = torch.ones(self.action_space.n)
        self._choose_next_state()
        return self.state

    def step(self, action: int):
        action_mask = torch.ones(self.action_space.n)
        if self.action_mask[action] == 0:
            raise Exception("Invalid action was selected! Valid actions: {}, "
                            "action taken: {}".format(self.action_mask, action))
        action_mask[action] = 0
        self.current_step += 1
        self._action_mask = action_mask
        self._choose_next_state()
        return self.state, 0, self.finish(), {"action_mask": self.action_mask}

    def render(self, mode='human'):
        pass

    def finish(self):
        return self.current_step == 250

    def _choose_next_state(self):
        self.state = torch.tensor(self.observation_space.sample(),
                                  dtype=torch.long)

    @property
    def action_mask(self):
        return self._action_mask
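# A hedged sketch of an agent that respects the mask exposed by DiscreteMaskEnv:
# it samples uniformly among the actions the current mask still allows. The
# helper `sample_valid_action` is illustrative and not part of the environment.
def sample_valid_action(action_mask: torch.Tensor) -> int:
    valid = torch.nonzero(action_mask, as_tuple=False).flatten()
    return valid[torch.randint(len(valid), (1,))].item()

env = DiscreteMaskEnv()
obs = env.reset()
done = False
while not done:
    action = sample_valid_action(env.action_mask)
    obs, reward, done, info = env.step(action)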
class PocEnv(Env): def __init__(self, maze): self.board = select_maze(maze) self.grid = PocGrid(board=self.board["_maze"]) self._get_init_state() self.action_space = Discrete(4) self.observation_space = Discrete(1 << 10) # 1024 # self.observation_space = Discrete(14) self._reward_range = 100 self._discount = .95 def seed(self, seed=None): np.random.seed(seed) def is_power(self, idx): return self.board['_maze'][idx] == 3 def is_passable(self, idx): return self.board['_maze'][idx] != 0 def _is_valid(self): assert self.grid.is_inside(self.state.agent_pos) assert self.is_passable(self.state.agent_pos) for ghost in self.state.ghosts: assert self.grid.is_inside(ghost.pos) assert self.is_passable(ghost.pos) def _set_state(self, state): self.done = False self.state = state def _generate_legal(self): actions = [] for action in Action: if self.grid.is_inside(self.state.agent_pos + Moves.get_coord(action.value)): actions.append(action.value) return actions def step(self, action): assert self.action_space.contains(action) assert self.done is False reward = -1 next_pos = self._next_pos(self.state.agent_pos, action) if next_pos.is_valid(): self.state.agent_pos = next_pos else: reward += -25 if self.state.power_step > 0: self.state.power_step -= 1 hit_ghost = -1 for g, ghost in enumerate(self.state.ghosts): if ghost.pos == self.state.agent_pos: hit_ghost = g # move ghost self._move_ghost(g, ghost_range=self.board["_ghost_range"]) if hit_ghost >= 0: if self.state.power_step > 0: reward += 25 self.state.ghosts[hit_ghost].reset() else: reward += -100 self.done = True ob = self._make_ob(action) if self.state.food_pos[self.grid.get_index(self.state.agent_pos)]: if sum(self.state.food_pos) == 0: reward += 1000 self.done = True if self.is_power(self.state.agent_pos): self.state.power_step = config["_power_steps"] reward += 10 return ob, reward, self.done, {"state": self.state} def _make_ob(self, action): # TODO fix me ob = 0 for d in range(self.action_space.n): if self._see_ghost(action) > 0: ob = set_flags(ob, d) next_pos = self._next_pos(self.state.agent_pos, direction=d) if next_pos.is_valid() and self.is_passable(next_pos): ob = set_flags(ob, d + self.action_space.n) if self._smell_food(): ob = set_flags(ob, 8) if self._hear_ghost(self.state): ob = set_flags(ob, 9) return ob def _encode_state(self, state): poc_idx = self.grid.get_index(state.agent_pos) ghosts = [(self.grid.get_index(ghost.pos), ghost.direction) for ghost in state.ghosts] return np.concatenate([[poc_idx], *ghosts, state.food_pos, [state.power_step]]) def _decode_state(self, state): poc_state = PocState(Coord(*self.grid.get_coord(state[0]))) ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1) for g in ghosts: poc_state.ghosts.append( Ghost(pos=self.grid.get_coord(g[0]), direction=g[1])) poc_state.power_step = state[-1] poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist() return poc_state def _compute_prob(self, action, next_state, ob): return int(ob == self._make_ob(action)) def _see_ghost(self, action): eye_pos = self.state.agent_pos + Moves.get_coord(action) while True: for g, ghost in enumerate(self.state.ghosts): if ghost.pos == eye_pos: return g eye_pos += Moves.get_coord(action) if not self.grid.is_inside(eye_pos) or not self.is_passable( eye_pos): break return -1 def _smell_food(self, smell_range=1): for x in range(-smell_range, smell_range + 1): for y in range(-smell_range, smell_range + 1): smell_pos = Coord(x, y) idx = self.grid.get_index(self.state.agent_pos + smell_pos) if 
self.grid.is_inside(self.state.agent_pos + smell_pos) and self.state.food_pos[idx]: return True return False @staticmethod def _hear_ghost(poc_state, hear_range=2): for ghost in poc_state.ghosts: if Grid.manhattan_distance(ghost.pos, poc_state.agent_pos) <= hear_range: return True return False def render(self, mode='human', close=False): pass def reset(self): self.t = 0 self.done = False self._get_init_state() return 0 def close(self): pass def _get_init_state(self): # create walls # for tile in self.grid: # value = config["maze"][tile.key[0]] # self.grid.set_value(value, coord=tile.key) self.state = PocState() self.state.agent_pos = Coord(*self.board["_poc_home"]) ghost_home = Coord(*self.board["_ghost_home"]) for g in range(self.board["_num_ghosts"]): pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2) self.state.ghosts.append(Ghost(pos, direction=-1)) self.state.food_pos = np.random.binomial(1, config["_food_prob"], size=self.grid.n_tiles + 1) self.state.power_step = 0 return self.state def _next_pos(self, pos, direction): direction = Moves.get_coord(direction) if pos.x == 0 and pos.y == self.board[ '_passage_y'] and direction == Moves.EAST: next_pos = Coord(self.grid.x_size - 1, pos.y) elif pos.x == self.grid.x_size - 1 and pos.y == self.board[ '_passage_y'] and direction == Moves.WEST: next_pos = Coord(0, pos.y) else: next_pos = pos + direction if self.grid.is_inside(next_pos) and self.is_passable(next_pos): return next_pos else: return Coord(-1, -1) def _move_ghost(self, g, ghost_range): if Grid.manhattan_distance(self.state.agent_pos, self.state.ghosts[g].pos) < ghost_range: if self.state.power_step > 0: self._move_defensive(g) else: self._move_aggressive(g) else: self._move_random(g) def _move_aggressive(self, g, chase_prob=.75): if not np.random.binomial(1, p=chase_prob): return self._move_random(g) best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist <= best_dist and new_pos.is_valid() and can_move( self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_defensive(self, g, defensive_prob=.5): if np.random.binomial( 1, defensive_prob) and self.state.ghosts[g].direction >= 0: self.state.ghosts[g].direction = -1 best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist >= best_dist and new_pos.is_valid() and can_move( self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_random(self, g): # there are no dead ends # never switch to opposite direction ghost_pos = self.state.ghosts[g].pos while True: d = self.action_space.sample() next_pos = self._next_pos(ghost_pos, d) if next_pos.is_valid() and can_move(self.state.ghosts[g], d): break self.state.ghosts[g].update(next_pos, d)
def test_discrete_space_encode(self):
    discrete_space = Discrete(100)
    value = discrete_space.sample()
    encoded_value = gym_spaces_utils.gym_space_encode(discrete_space, value)
    self.assertListEqual([value], encoded_value)
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta): def __init__( self, model_name, frame_skip=5, hand_low=(-0.2, 0.55, 0.05), hand_high=(0.2, 0.75, 0.3), mocap_low=None, mocap_high=None, action_scale=1. / 100, action_rot_scale=1., ): super().__init__(model_name, frame_skip=frame_skip) self.action_scale = action_scale self.action_rot_scale = action_rot_scale self.hand_low = np.array(hand_low) self.hand_high = np.array(hand_high) if mocap_low is None: mocap_low = hand_low if mocap_high is None: mocap_high = hand_high self.mocap_low = np.hstack(mocap_low) self.mocap_high = np.hstack(mocap_high) self.goal_space = Discrete(1) # OVERRIDE ME self.curr_path_length = 0 # We use continuous goal space by default and # can discretize the goal space by calling # the `discretize_goal_space` method. self.discrete_goal_space = None self.discrete_goals = [] self.active_discrete_goal = None def set_xyz_action(self, action): action = np.clip(action, -1, 1) pos_delta = action * self.action_scale new_mocap_pos = self.data.mocap_pos + pos_delta[None] new_mocap_pos[0, :] = np.clip( new_mocap_pos[0, :], self.mocap_low, self.mocap_high, ) self.data.set_mocap_pos('mocap', new_mocap_pos) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def discretize_goal_space(self, goals): assert len(goals) >= 1 self.discrete_goals = goals # update the goal_space to a Discrete space self.discrete_goal_space = Discrete(len(self.discrete_goals)) # Belows are methods for using the new wrappers. # `sample_goals` is implmented across the sawyer_xyz # as sampling from the task lists. This will be done # with the new `discrete_goals`. After all the algorithms # conform to this API (i.e. using the new wrapper), we can # just remove the underscore in all method signature. def sample_goals_(self, batch_size): if self.discrete_goal_space is not None: return [ self.discrete_goal_space.sample() for _ in range(batch_size) ] else: return [self.goal_space.sample() for _ in range(batch_size)] def set_goal_(self, goal): if self.discrete_goal_space is not None: self.active_discrete_goal = goal self.goal = self.discrete_goals[goal] self._state_goal_idx = np.zeros(len(self.discrete_goals)) self._state_goal_idx[goal] = 1. else: self.goal = goal def _set_obj_xyz(self, pos): qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[9:12] = pos.copy() qvel[9:15] = 0 self.set_state(qpos, qvel) def get_site_pos(self, siteName): _id = self.model.site_names.index(siteName) return self.data.site_xpos[_id].copy() def reset(self): self.curr_path_length = 0 return super().reset()
def action(self, state: Box, action_space: Discrete) -> int:
    if self._exploration_policy.should_explore():
        return action_space.sample()
    else:
        predict = self._model.predict(np.array([state]))
        return np.argmax(predict).item()
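# A minimal sketch of an exploration policy that would satisfy the
# `should_explore()` interface used by the `action` method above; the class
# name and the decay schedule are assumptions, not part of the original code.
class EpsilonGreedyExploration:
    def __init__(self, epsilon=1.0, min_epsilon=0.05, decay=0.995):
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.decay = decay

    def should_explore(self) -> bool:
        explore = np.random.random() < self.epsilon
        # Anneal epsilon towards its floor after every query.
        self.epsilon = max(self.min_epsilon, self.epsilon * self.decay)
        return explore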
    # wrap the chosen default model with our new model API class
    # (DuelingQModel). This way, both `forward` and `get_q_values`
    # are available in the returned class.
    model_interface=ContActionQModel
    if args.framework != "torch" else TorchContActionQModel,
    name="cont_action_q_model",
)
# __sphinx_doc_model_construct_end__

batch_size = 10
input_ = np.array([obs_space.sample() for _ in range(batch_size)])
# Note that for PyTorch, you will have to provide torch tensors here.
if args.framework == "torch":
    input_ = torch.from_numpy(input_)

input_dict = {
    "obs": input_,
    "is_training": False,
}
out, state_outs = my_cont_action_q_model(input_dict=input_dict)
assert out.shape == (10, 256)

# Pass `out` and an action into `my_cont_action_q_model`.
action = np.array([action_space.sample() for _ in range(batch_size)])
if args.framework == "torch":
    action = torch.from_numpy(action)
q_value = my_cont_action_q_model.get_single_q_value(out, action)
assert q_value.shape == (10, 1)
class Expando(Env): """Gym environment wrapping the expando game. For details on the game, check the ExpandoGame class. Action-space: Multidiscrete: (move_direction, action_type) with move_direction in {0, ..., 2 * n_axis}, where 0 - n_axis represent movement along an axis in the positive direction and n_axis - 2 * n_axis in negative direction. And action_type is in {piece_type_0, ..., piece_type_n}, ie.e. there is a placement action for each type of piece. If `multi_discrete_actions` is set to False, the discrete action space over all items in the cartesian product of move_direction and action_type will be used to get a single discrete action space over {0,..., n_axis * piece_type_n}. The order of action pairs then is the same as returned by `itertools.product()`. Observation-space description: A Box space where each observation has dimensions (axis_0 x axis_1 ... x axis_n x n_one_hot x n_scores) where axis_k is the length of the k-th axis of the game board grid, n_one_hot = 1 + n_players * (n_piece_types - 1) the dimension of the piece's one-hot encodings and n_scores = 3 is the number of additional normalized features regarding the player: is_cursor_position, room, population. Note that n_one_hot accounts for the empty piece_type which doesn't belong to a player. If `flat_observations` is set to True, the box observations are going to be (axis_0 * axis_1 ... * axis_n * n_one_hot + n_scores) dimensional vectors, where n_scores = 3 + n_axis, since the cursor's position is on longer represented as bit, but as normalized (x, y, ...) coordinates. """ def __init__(self, grid_size: tuple, n_players: int = 2, max_turns=100, final_reward=100, piece_types=None, policies_other=None, observe_all=False, multi_discrete_actions=False, flat_observations=False, render=False, cell_size=50, padding=5, ui_font_size=14, seed=None): """ :param grid_size: tuple specifying the dimensions of the game's board. :param n_players: number of players participating in the game. :param max_turns: maximum number of turns per episode. :param final_reward: amount of final reward given to the winner and taken from the losers. :param piece_types: list of dict configs containing describing possible pieces. :param policies_other: list of policies to use for opponents players. :param observe_all: whether to return observations on `step()` for all players in the info dict or not. :param multi_discrete_actions: whether to use a multi-discrete action space. :param flat_observations: whether to flatten the observations or return as tensor. :param render: enables rendering when calling `render()`. :param cell_size: width/height of a cell when rendering. :param padding: padding between cells when rendering. :param ui_font_size: size of the ui font when rendering. :param seed: random seed. """ grid_size = tuple(grid_size) if policies_other is not None: assert n_players - 1 == len( policies_other), 'please provide a policy for each opponent.' self.n_players = n_players self.policies_other = policies_other self.observe_all = observe_all if piece_types is None: self.piece_types = self._get_default_piece_types() else: self.piece_types = piece_types n_piece_types = len(self.piece_types) # actions: (cursor move direction, piece_type) # where (cursor move direction) encodes +1 or -1 movement along an axis and 0 for no movement. 
n_move_directions = 1 + 2 * len(grid_size) if multi_discrete_actions: self.action_space = MultiDiscrete( [n_move_directions, n_piece_types]) else: self.action_space = Discrete(n_move_directions * n_piece_types) # observation space: # (d_0 * ... * d_n * piece_type * player # + cursor_d_0 + ... + cursor_d_n + population + room) k_cursor_features = len(grid_size) if flat_observations else 1 obs_dims = grid_size + (1 + (n_piece_types - 1) * n_players, ) self.observation_space = OneHotBox(OneHot(obs_dims), Box(0.0, 1.0, shape=(2 + k_cursor_features, )), flatten=flat_observations) self.game = ExpandoGame(grid_size, n_players, max_turns, final_reward=final_reward, piece_types=self.piece_types, seed=seed) self.observation_format = 'flat' if flat_observations else 'grid' self.do_render = render if self.do_render: self.renderer = GameRenderer(self.game, cell_size, padding, ui_font_size) self.seed(seed) def step(self, action, other_actions=None): """Perform each player's turn. :param action: action to take as player 0 :param other_actions: optional list of actions to take for the other players. Will be sampled from actions_space if not provided. :return: obs_0, reward_0, done, info """ if self.policies_other is not None: assert other_actions is None, 'other actions are already defined by the policies passed at initialization' # other player actions passed as argument if other_actions is not None: assert len( other_actions ) + 1 == self.n_players, 'please provide an action for each player' rewards_other = [ self.game.take_turn(action, i) for i, action in enumerate(other_actions, start=1) ] # other player actions defined by policies passed to constructor elif self.policies_other is not None: other_obs = [ self.game.get_observation(i, self.observation_format) for i in range(1, self.n_players) ] actions_other = [ policy.predict(obs)[0][0] for obs, policy in zip(other_obs, self.policies_other) ] rewards_other = [ self.game.take_turn(a, i) for i, a in enumerate(actions_other, start=1) ] # no other player actions provided: sample else: rewards_other = [ self.game.take_turn(self.action_space.sample(), i) for i in range(1, self.n_players) ] info = {} if self.observe_all: other_obs_new = [ self.game.get_observation(i, self.observation_format) for i in range(1, self.n_players) ] info = {'rewards_other': rewards_other, 'obs_other': other_obs_new} reward_0 = self.game.take_turn(action, player_id=0) obs_0 = self.game.get_observation(player_id=0, formatting=self.observation_format) done = self.game.is_done if done: self.game.reset() return obs_0, reward_0, done, info def seed(self, seed=None): """Set seeds of all random number generators. Note that pseudo random actions are performed at initialization, so in order to seed these actions as well you need to pass a seed to the constructor. :param seed: seed to set """ self.observation_space.seed(seed) self.action_space.seed(seed) self.game.seed(seed) def reset(self, player_id=0): """Reset the environment. :param player_id: id of the player to get the first observation from. :return: observation of player with player_id or a list of all observations if `observe_all` was set. """ self.game.reset() if self.observe_all: return [ self.game.get_observation(i, self.observation_format) for i in range(self.n_players) ] return self.game.get_observation(player_id, self.observation_format) def render(self, mode='human'): """Render a pyglet visualization. Only works with 2D grids. """ assert len( self.game.grid_size ) < 3, 'Only 2D grids are supported for rendering at the moment.' 
if self.do_render: self.renderer.step() @staticmethod def from_config(file_path): """Load environment using a yaml configuration file or a composable hydra config :param file_path: path to the config file :return: A configured Expando environment """ file_path = to_absolute_path(file_path) conf_dir, file_name = os.path.split(file_path) with initialize_config_dir(conf_dir): cfg = compose(config_name=file_name) env = Expando(**cfg) return env @staticmethod def _get_default_piece_types(): """Load the default piece types from default_config/ :return: DictConfig containing piece_types """ this_file_dir = os.path.split(relpath(__file__))[0] path = os.path.join(this_file_dir, 'default_config/piece_types.yaml') return OmegaConf.load(path).piece_types
class SimpleFetchMdp(GoalEnv): def __init__(self, x_dim=5, y_dim=5, **kwargs): self.x_dim = x_dim self.y_dim = y_dim self.num_states = x_dim * y_dim # Right, Up, Left, Down, Grab self.action_space = Discrete(5) self.observation_space = Dict( dict( desired_goal=Discrete(self.num_states), # Goal Position achieved_goal=Discrete(self.num_states), # block position observation=MultiDiscrete([self.num_states, 2]) #arm position, object in air )) self._location_space = Discrete(self.num_states) self._goal_location = self._location_space.sample() self._block_location = self._location_space.sample() self._arm_location = self._location_space.sample() self._picked_up_block = False self.action_handlers = [ self._move_function(lambda s: s - 1, lambda s: s % self.x_dim == 0), # right self._move_function(lambda s: s - self.x_dim, lambda s: s < self.x_dim), # up self._move_function(lambda s: s + 1, lambda s: (s + 1) % self.x_dim == 0), # left self._move_function( lambda s: s + self.x_dim, lambda s: s + self.x_dim >= self.x_dim * self.y_dim), # down self._grab ] def reset(self): # Pick a random goal and block location self._goal_location = self._location_space.sample() self._block_location = self._location_space.sample() while self._block_location == self._goal_location: self._block_location = self._location_space.sample() self._arm_location = self._location_space.sample() self._picked_up_block = False return self._get_obs() def render(self, mode='human'): pass def close(self): pass def seed(self, seed="None"): pass def step(self, action): self.action_handlers[action]() obs = self._get_obs() reward = self.compute_reward() done = reward == 1. info = [] return obs, reward, done, info # Shortcut for setting the state and getting the output of the action def step_for(self, state, action, obs_format='dict'): self._goal_location = state[0] self._arm_location = state[2] if state[1] == -1: self._block_location = self._arm_location self._picked_up_block = True else: self._block_location = state[1] self._picked_up_block = False result = self.step(action) if obs_format == 'dict': return result return ([ result[0]['desired_goal'], result[0]['achieved_goal'], result[0]['observation'] ], ) + result[1:] def compute_reward(self): if self._arm_location == self._goal_location and self._picked_up_block: return 1. return 0. def _get_obs(self): return dict(desired_goal=self._goal_location, achieved_goal=-1 if self._picked_up_block else self._block_location, observation=self._arm_location) def _grab(self): if self._arm_location == self._block_location: self._picked_up_block = True def _move_function(self, displace, no_move_if): def ret(): if no_move_if(self._arm_location): return self._arm_location = displace(self._arm_location) if self._picked_up_block: self._block_location = displace(self._block_location) return ret
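# A hedged sketch of interacting with SimpleFetchMdp: observations are dicts
# whose `achieved_goal` becomes -1 once the block is held, and the episode ends
# when the arm reaches the goal while holding the block. The loop bound is illustrative.
env = SimpleFetchMdp(x_dim=5, y_dim=5)
obs = env.reset()
for _ in range(200):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    if done:
        break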
class TigerEnv(gym.Env): metadata = {"render.modes": ["human", "ansi"]} def __init__(self, seed=0, correct_prob=.85): self.correct_prob = correct_prob self.action_space = Discrete(len(Action)) self.state_space = Discrete(len(State)) self.observation_space = Discrete(len(Obs)) self._discount = .95 self._reward_range = (-float(100), float(10)) self._query = 0 self.seed(seed) def reset(self): self.done = False self.t = 0 self._query = 0 self.state = self.state_space.sample() self.last_action = Action.LISTEN.value return Obs.NULL.value def seed(self, seed=1234): np.random.seed(seed) return [seed] def step(self, action): assert self.action_space.contains(action) assert self.done is False self.t += 1 self._query += 1 self.last_action = action rw = TigerEnv._compute_rw(self.state, action) if TigerEnv._is_terminal(self.state, action): self.done = True return self.state, rw, self.done, {'state': self.state} self._sample_state(action) ob = TigerEnv._sample_ob(action, self.state) self.done = False return ob, rw, self.done, {"state": self.state} def render(self, mode='human', close=False): if close: return if mode == "human": if not hasattr(self, "gui"): self.gui = TigerGui() msg = "A: " + action_to_str( self.last_action) + " S: " + state_to_str(self.state) self.gui.render(state=(self.last_action, self.state), msg=msg) elif mode == "ansi": print("Current step: {}, tiger is in state: {}, action took: {}". format(self.t, self.state, self.last_action[0])) else: raise NotImplementedError() def close(self): self._render(close=True) def _set_state(self, state): self.state = state self.done = False def _generate_legal(self): return list(range(self.action_space.n)) def _generate_preferred(self, history): return self._generate_legal() def _sample_state(self, action): if action == Action.RIGHT.value or action == Action.LEFT.value: self.state = self.state_space.sample() def _get_init_state(self): # fix initial belief to be exact return self.state_space.sample() @staticmethod def _compute_prob(action, next_state, ob, correct_prob=.85): p_ob = 0.0 if action == Action.LISTEN.value and ob != Obs.NULL.value: if (next_state == State.LEFT.value and ob == Obs.LEFT.value) or ( next_state == State.RIGHT.value and ob == Obs.RIGHT.value): p_ob = correct_prob else: p_ob = 1 - correct_prob elif action != Action.LISTEN.value and ob == Obs.NULL.value: p_ob = 1. assert p_ob >= 0.0 and p_ob <= 1.0 return p_ob @staticmethod def _sample_ob(action, next_state, correct_prob=.85): ob = Obs.NULL.value p = np.random.uniform() if action == Action.LISTEN.value: if next_state == State.LEFT.value: ob = Obs.RIGHT.value if p > correct_prob else Obs.LEFT.value else: ob = Obs.LEFT.value if p > correct_prob else Obs.RIGHT.value return ob @staticmethod def _local_move(state, last_action, last_ob): raise NotImplementedError() @staticmethod def _is_terminal(state, action): is_terminal = False if action != Action.LISTEN.value: is_terminal = ( (action == Action.LEFT.value and state == State.LEFT.value) or (action == Action.RIGHT.value and state == State.RIGHT.value)) return is_terminal @staticmethod def _compute_rw(state, action): if action == Action.LISTEN.value: reward = -1 elif not TigerEnv._is_terminal(state, action): reward = 10 else: reward = -100 return reward
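# A minimal rollout sketch for TigerEnv (hypothetical usage, not part of the
# snippet): listen a few times, then open the door opposite to the side the
# tiger was heard on most often.
def listen_then_open(env, n_listens=2):
    env.reset()
    episode_return, heard = 0.0, []
    for _ in range(n_listens):
        ob, reward, done, _ = env.step(Action.LISTEN.value)
        heard.append(ob)
        episode_return += reward
    heard_left = heard.count(Obs.LEFT.value) > heard.count(Obs.RIGHT.value)
    open_action = Action.RIGHT.value if heard_left else Action.LEFT.value
    _, reward, done, _ = env.step(open_action)
    return episode_return + reward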
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta): def __init__( self, model_name, frame_skip=5, hand_low=(-0.2, 0.55, 0.05), hand_high=(0.2, 0.75, 0.3), mocap_low=None, mocap_high=None, action_scale=1. / 100, action_rot_scale=1., ): super().__init__(model_name, frame_skip=frame_skip) self.random_init = True self.action_scale = action_scale self.action_rot_scale = action_rot_scale self.hand_low = np.array(hand_low) self.hand_high = np.array(hand_high) if mocap_low is None: mocap_low = hand_low if mocap_high is None: mocap_high = hand_high self.mocap_low = np.hstack(mocap_low) self.mocap_high = np.hstack(mocap_high) self.curr_path_length = 0 self._freeze_rand_vec = True self._last_rand_vec = None # We use continuous goal space by default and # can discretize the goal space by calling # the `discretize_goal_space` method. self.discrete_goal_space = None self.discrete_goals = [] self.active_discrete_goal = None self.action_space = Box( np.array([-1, -1, -1, -1]), np.array([+1, +1, +1, +1]), ) self._pos_obj_max_len = 6 self._pos_obj_possible_lens = (3, 6) self._set_task_called = False self._partially_observable = True self._state_goal = None # OVERRIDE ME def _set_task_inner(self): # Doesn't absorb "extra" kwargs, to ensure nothing's missed. pass def set_task(self, task): self._set_task_called = True data = pickle.loads(task.data) assert isinstance(self, data['env_cls']) del data['env_cls'] self._last_rand_vec = data['rand_vec'] self._freeze_rand_vec = True self._last_rand_vec = data['rand_vec'] del data['rand_vec'] self._partially_observable = data['partially_observable'] del data['partially_observable'] self._set_task_inner(**data) def set_xyz_action(self, action): action = np.clip(action, -1, 1) pos_delta = action * self.action_scale new_mocap_pos = self.data.mocap_pos + pos_delta[None] new_mocap_pos[0, :] = np.clip( new_mocap_pos[0, :], self.mocap_low, self.mocap_high, ) self.data.set_mocap_pos('mocap', new_mocap_pos) self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def discretize_goal_space(self, goals): assert False assert len(goals) >= 1 self.discrete_goals = goals # update the goal_space to a Discrete space self.discrete_goal_space = Discrete(len(self.discrete_goals)) # Belows are methods for using the new wrappers. # `sample_goals` is implmented across the sawyer_xyz # as sampling from the task lists. This will be done # with the new `discrete_goals`. After all the algorithms # conform to this API (i.e. using the new wrapper), we can # just remove the underscore in all method signature. def sample_goals_(self, batch_size): assert False if self.discrete_goal_space is not None: return [ self.discrete_goal_space.sample() for _ in range(batch_size) ] else: return [self.goal_space.sample() for _ in range(batch_size)] def set_goal_(self, goal): assert False if self.discrete_goal_space is not None: self.active_discrete_goal = goal self.goal = self.discrete_goals[goal] self._state_goal_idx = np.zeros(len(self.discrete_goals)) self._state_goal_idx[goal] = 1. 
else: self.goal = goal def _set_obj_xyz(self, pos): qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[9:12] = pos.copy() qvel[9:15] = 0 self.set_state(qpos, qvel) def get_site_pos(self, siteName): _id = self.model.site_names.index(siteName) return self.data.site_xpos[_id].copy() def _get_pos_objects(self): """Retrieves object position(s) from mujoco properties or instance vars Returns: np.ndarray: Flat array (usually 3 elements) representing the object(s)' position(s) """ # Throw error rather than making this an @abc.abstractmethod so that # V1 environments don't have to implement it raise NotImplementedError def _get_pos_goal(self): """Retrieves goal position from mujoco properties or instance vars Returns: np.ndarray: Flat array (3 elements) representing the goal position """ assert isinstance(self._state_goal, np.ndarray) assert self._state_goal.ndim == 1 return self._state_goal def _get_obs(self): """Combines positions of the end effector, object(s) and goal into a single flat observation Returns: np.ndarray: The flat observation array (12 elements) """ pos_hand = self.get_endeff_pos() pos_obj_padded = np.zeros(self._pos_obj_max_len) pos_obj = self._get_pos_objects() assert len(pos_obj) in self._pos_obj_possible_lens pos_obj_padded[:len(pos_obj)] = pos_obj pos_goal = self._get_pos_goal() if self._partially_observable: pos_goal = np.zeros_like(pos_goal) return np.hstack((pos_hand, pos_obj_padded, pos_goal)) def _get_obs_dict(self): obs = self._get_obs() return dict( state_observation=obs, state_desired_goal=self._get_pos_goal(), state_achieved_goal=obs[3:-3], ) def reset(self): self.curr_path_length = 0 return super().reset() def _get_state_rand_vec(self): if self._freeze_rand_vec: assert self._last_rand_vec is not None return self._last_rand_vec else: rand_vec = np.random.uniform(self.obj_and_goal_space.low, self.obj_and_goal_space.high, size=self.obj_and_goal_space.low.size) self._last_rand_vec = rand_vec return rand_vec def sample_tasks(self, num_tasks): directions = 2 * self.np_random.binomial(1, p=0.5, size=(num_tasks, )) - 1 tasks = [{'direction': direction} for direction in directions] return tasks def reset_task(self, task): self._task = task self._goal_dir = task['direction']
def action(self, state: Box, action_space: Discrete) -> int:
    return action_space.sample()
def test_spaces(self): experiment_name = "test_spaces" module_name = "module" logger = ModuleLogger( output_path=Path(self.temp_dir.name), experiment_name=experiment_name, module=module_name, step_write_frequency=None, episode_write_frequency=None, ) seed = 3 # Discrete space = Discrete(n=3) space.seed(seed) logger.log_space("Discrete", space.sample()) # MultiDiscrete space = MultiDiscrete(np.array([3, 2])) space.seed(seed) logger.log_space("MultiDiscrete", space.sample()) # Dict space = Dict({ "predictiveChangeVarDiscountedAverage": spaces.Box(low=-np.inf, high=np.inf, shape=(1, )), "predictiveChangeVarUncertainty": spaces.Box(low=0, high=np.inf, shape=(1, )), "lossVarDiscountedAverage": spaces.Box(low=-np.inf, high=np.inf, shape=(1, )), "lossVarUncertainty": spaces.Box(low=0, high=np.inf, shape=(1, )), "currentLR": spaces.Box(low=0, high=1, shape=(1, )), "trainingLoss": spaces.Box(low=0, high=np.inf, shape=(1, )), "validationLoss": spaces.Box(low=0, high=np.inf, shape=(1, )), }) space.seed(seed) logger.log_space("Dict", space.sample()) space = Box(np.array([0, 0]), np.array([2, 2])) space.seed(seed) logger.log_space("Box", space.sample()) logger.close() with open(logger.get_logfile(), "r") as log_file: logs = list(map(json.loads, log_file)) wide = log2dataframe(logs, wide=True) long = log2dataframe(logs, drop_columns=None) self.assertEqual(len(wide), 1) first_row = wide.iloc[0] # Discrete self.assertTrue(not np.isnan(first_row.Discrete)) # MultiDiscrete self.assertTrue(not np.isnan(first_row.MultiDiscrete_0)) self.assertTrue(not np.isnan(first_row.MultiDiscrete_1)) simultaneous_logged = long[(long.name == "MultiDiscrete_0") | (long.name == "MultiDiscrete_1")] self.assertEqual(len(simultaneous_logged.time.unique()), 1) # Dict expected_columns = [ "Dict_currentLR_0", "Dict_lossVarDiscountedAverage_0", "Dict_lossVarUncertainty_0", "Dict_predictiveChangeVarDiscountedAverage_0", "Dict_predictiveChangeVarUncertainty_0", "Dict_trainingLoss_0", ] for expected_column in expected_columns: self.assertTrue(not np.isnan(first_row[expected_column])) simultaneous_logged = long[long.name.isin(expected_columns)] self.assertEqual(len(simultaneous_logged.time.unique()), 1) # Box self.assertTrue(not np.isnan(first_row.Box_0)) self.assertTrue(not np.isnan(first_row.Box_1)) simultaneous_logged = long[(long.name == "Box_0") | (long.name == "Box_1")] self.assertEqual(len(simultaneous_logged.time.unique()), 1)
class PocEnv(Env): def __init__(self, maze, obs_array=False): self.board = select_maze(maze) self.grid = PocGrid(board=self.board["_maze"]) self._get_init_state() self.action_space = Discrete(4) self.observation_space = Discrete(1 << 10) # 1024 # self.observation_space = Discrete(14) self._reward_range = 100 self._discount = .95 self.done = False self.gui = None if obs_array: self._set_flags = set_flags_array self._zero = lambda: [0] * 10 else: self._set_flags = set_flags self._zero = lambda: 0 def seed(self, seed=None): np.random.seed(seed) def is_power(self, idx): return self.board['_maze'][idx] == 3 def is_passable(self, idx): return self.board['_maze'][idx] != 0 def _is_valid(self): assert self.grid.is_inside(self.state.agent_pos) assert self.is_passable(self.state.agent_pos) for ghost in self.state.ghosts: assert self.grid.is_inside(ghost.pos) assert self.is_passable(ghost.pos) def _set_state(self, state): self.done = False self.state = state def _generate_legal(self): actions = [] for action in self.action_space.n: if self.grid.is_inside(self.state.agent_pos + Moves.get_coord(action.value)): actions.append(action.value) return actions def step(self, action): err_msg = "%r (%s) invalid" % (action, type(action)) assert self.action_space.contains(action), err_msg assert self.done is False self.state.action = action reward = -1 next_pos = self._next_pos(self.state.agent_pos, action) if next_pos.is_valid(): self.state.agent_pos = next_pos else: reward += -25 if self.state.power_step > 0: self.state.power_step -= 1 hit_ghost = -1 for g, ghost in enumerate(self.state.ghosts): if ghost.pos == self.state.agent_pos: hit_ghost = g else: # move ghost self._move_ghost(g, ghost_range=self.board["_ghost_range"]) if ghost.pos == self.state.agent_pos: hit_ghost = g if hit_ghost >= 0: if self.state.power_step > 0: reward += 25 self.state.ghosts[hit_ghost].reset() else: reward += -100 self.done = True # don't eat power up when hit by a ghost already elif self.is_power(self.state.agent_pos): self.state.power_step = config["_power_steps"] reward += 10 # same for food elif self.state.food_pos[self.grid.get_index(self.state.agent_pos)]: self.state.food_pos[self.grid.get_index(self.state.agent_pos)] = 0 if sum(self.state.food_pos) == 0: reward += 1000 self.done = True obs = self._make_ob() return obs, reward, self.done, {"state": self.state} def _make_ob(self): obs = self._zero() for d in range(self.action_space.n): if self._see_ghost(d) >= 0: obs = self._set_flags(obs, d) next_pos = self._next_pos(self.state.agent_pos, direction=d) if next_pos.is_valid() and self.is_passable(next_pos): obs = self._set_flags(obs, d + self.action_space.n) if self._smell_food(): obs = self._set_flags(obs, 8) if self._hear_ghost(self.state): obs = self._set_flags(obs, 9) return obs def _encode_state(self, state): poc_idx = self.grid.get_index(state.agent_pos) ghosts = [(self.grid.get_index(ghost.pos), ghost.direction) for ghost in state.ghosts] return np.concatenate([[poc_idx], *ghosts, state.food_pos, [state.power_step]]) def _decode_state(self, state): poc_state = PocState(Coord(*self.grid.get_coord(state[0]))) ghosts = np.split(state[1:self.board["_num_ghosts"] * 3], 1) for g in ghosts: poc_state.ghosts.append( Ghost(pos=self.grid.get_coord(g[0]), direction=g[1])) poc_state.power_step = state[-1] poc_state.food_pos = state[self.board["_num_ghosts"] * 3:-1].tolist() return poc_state def _see_ghost(self, action): eye_pos = self.state.agent_pos + Moves.get_coord(action) while True: for g, ghost in enumerate(self.state.ghosts): 
if ghost.pos == eye_pos: return g eye_pos += Moves.get_coord(action) if not (self.grid.is_inside(eye_pos) and self.is_passable(eye_pos)): break return -1 def _smell_food(self, smell_range=1): for x in range(-smell_range, smell_range + 1): for y in range(-smell_range, smell_range + 1): smell_pos = Coord(x, y) idx = self.grid.get_index(self.state.agent_pos + smell_pos) if self.grid.is_inside(self.state.agent_pos + smell_pos) and\ self.state.food_pos[idx]: return True return False @staticmethod def _hear_ghost(poc_state, hear_range=2): for ghost in poc_state.ghosts: if Grid.manhattan_distance(ghost.pos, poc_state.agent_pos) <= hear_range: return True return False def render(self, mode='human', close=False): if close: return if mode == 'human': if self.gui is None: self.gui = PocGui(board_size=self.grid.get_size, maze=self.board["_maze"], state=self.state) else: self.gui.render(state=self.state) def reset(self): self.done = False self._get_init_state() return self._make_ob() def close(self): pass def _get_init_state(self): self.state = PocState() self.state.agent_pos = Coord(*self.board["_poc_home"]) ghost_home = Coord(*self.board["_ghost_home"]) for g in range(self.board["_num_ghosts"]): pos = Coord(ghost_home.x + g % 2, ghost_home.y + g // 2) self.state.ghosts.append(Ghost(pos, direction=-1)) self.state.food_pos = np.random.binomial(1, config["_food_prob"], size=self.grid.n_tiles) # only make free space food idx = (self.board["_maze"] > 0) &\ (self.state.food_pos.reshape(self.board["_maze"].shape) > 0) self.board["_maze"][idx] = 4 self.state.power_step = 0 return self.state def _next_pos(self, pos, direction): direction = Moves.get_coord(direction) if pos.x == 0 and pos.y == self.board['_passage_y'] and\ direction == Moves.EAST: next_pos = Coord(self.grid.x_size - 1, pos.y) elif pos.x == self.grid.x_size - 1 and\ pos.y == self.board['_passage_y'] and direction == Moves.WEST: next_pos = Coord(0, pos.y) else: next_pos = pos + direction if self.grid.is_inside(next_pos) and self.is_passable(next_pos): return next_pos else: return Coord(-1, -1) def _move_ghost(self, g, ghost_range): if Grid.manhattan_distance(self.state.agent_pos, self.state.ghosts[g].pos) < ghost_range: if self.state.power_step > 0: self._move_defensive(g) else: self._move_aggressive(g) else: self._move_random(g) def _move_aggressive(self, g): if not np.random.binomial(1, p=config["_chase_prob"]): return self._move_random(g) best_dist = self.grid.x_size + self.grid.y_size best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist <= best_dist and new_pos.is_valid() and\ can_move(self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_defensive(self, g, defensive_prob=.5): if np.random.binomial(1, defensive_prob) and\ self.state.ghosts[g].direction >= 0: self.state.ghosts[g].direction = -1 return best_dist = 0 best_pos = self.state.ghosts[g].pos best_dir = -1 for d in range(self.action_space.n): dist = Grid.directional_distance(self.state.agent_pos, self.state.ghosts[g].pos, d) new_pos = self._next_pos(self.state.ghosts[g].pos, d) if dist >= best_dist and new_pos.is_valid() and\ can_move(self.state.ghosts[g], d): best_pos = new_pos best_dist = dist best_dir = d self.state.ghosts[g].update(best_pos, best_dir) def _move_random(self, g): # there are !!! 
dead ends # only switch to opposite direction when it failed 10 times (hack) ghost_pos = self.state.ghosts[g].pos i = 0 while True: d = self.action_space.sample() next_pos = self._next_pos(ghost_pos, d) # normal map has dead ends: if next_pos.is_valid() and (can_move(self.state.ghosts[g], d) or i > 10): break i += 1 self.state.ghosts[g].update(next_pos, d)
class UR3DualXYZEnv(UR3MocapBase, metaclass=abc.ABCMeta): def __init__(self, *args, hand_low=(-0.2, 0.55, 0.05), hand_high=(0.2, 0.75, 0.3), second_hand_low=(-0.2, 0.55, 0.05), second_hand_high=(0.2, 0.75, 0.3), mocap_low=None, mocap_high=None, second_mocap_low=None, second_mocap_high=None, action_scale=2. / 100, action_rot_scale=1., **kwargs): super().__init__(*args, **kwargs) self.action_scale = action_scale self.action_rot_scale = action_rot_scale self.hand_low = np.array(hand_low) self.hand_high = np.array(hand_high) self.second_hand_low = np.array(second_hand_low) self.second_hand_high = np.array(second_hand_high) if mocap_low is None: mocap_low = hand_low if mocap_high is None: mocap_high = hand_high if second_mocap_low is None: second_mocap_low = second_hand_low if second_mocap_high is None: second_mocap_high = second_hand_high self.mocap_low = np.hstack(mocap_low) self.mocap_high = np.hstack(mocap_high) self.second_mocap_low = np.hstack(second_mocap_low) self.second_mocap_high = np.hstack(second_mocap_high) # We use continuous goal space by default and # can discretize the goal space by calling # the `discretize_goal_space` method. self.discrete_goal_space = None self.discrete_goals = [] self.active_discrete_goal = None def set_xyz_action(self, action): action = np.clip(action, -1, 1) pos_delta = action * self.action_scale new_mocap_pos = self.data.mocap_pos + pos_delta[None] new_mocap_pos[0, :] = np.clip( new_mocap_pos[0, :], self.mocap_low, self.mocap_high, ) self.data.set_mocap_pos('mocap', new_mocap_pos) if self.rotMode == 'vertical_fixed': quat = quat_mul(quat_create(np.array([1., 0, 0]), np.pi), quat_create(np.array([0, 0, 1.]), np.pi / 2)) #ref 기준 x축 180, z축 90순 elif self.rotMode == 'horizontal_fixed': quat = quat_mul(quat_create(np.array([0, 0, 1.]), np.pi), quat_create(np.array([0, 1., 0]), np.pi / 2)) #ref 기준 z축 180, y축 90순 self.data.set_mocap_quat('mocap', quat) #w v 순인듯 # self.data.set_mocap_quat('mocap', np.array([1, 0, 0, 0])) #w v 순인듯 def set_xyz_action_rot(self, action): action[:3] = np.clip(action[:3], -1, 1) pos_delta = action[:3] * self.action_scale new_mocap_pos = self.data.mocap_pos + pos_delta[None] new_mocap_pos[0, :] = np.clip( new_mocap_pos[0, :], self.mocap_low, self.mocap_high, ) rot_axis = action[4:] / np.linalg.norm(action[4:]) action[3] = action[3] * self.action_rot_scale self.data.set_mocap_pos('mocap', new_mocap_pos) # replace this with learned rotation quat = quat_mul( quat_create(np.array([0, 1., 0]), np.pi), quat_create(np.array(rot_axis).astype(np.float64), action[3])) self.data.set_mocap_quat('mocap', quat) # self.data.set_mocap_quat('mocap', np.array([np.cos(action[3]/2), np.sin(action[3]/2)*rot_axis[0], np.sin(action[3]/2)*rot_axis[1], np.sin(action[3]/2)*rot_axis[2]])) # self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0])) def set_xyz_action_rotz(self, action): action[:3] = np.clip(action[:3], -1, 1) pos_delta = action[:3] * self.action_scale new_mocap_pos = self.data.mocap_pos + pos_delta[None] new_mocap_pos[0, :] = np.clip( new_mocap_pos[0, :], self.mocap_low, self.mocap_high, ) self.data.set_mocap_pos('mocap', new_mocap_pos) zangle_delta = action[3] * self.action_rot_scale new_mocap_zangle = ur3_quat_to_zangle( self.data.mocap_quat[0]) + zangle_delta # new_mocap_zangle = action[3] new_mocap_zangle = np.clip( new_mocap_zangle, -3.0, 3.0, ) if new_mocap_zangle < 0: new_mocap_zangle += 2 * np.pi self.data.set_mocap_quat('mocap', ur3_zangle_to_quat(new_mocap_zangle)) def set_xy_action(self, xy_action, fixed_z): delta_z = fixed_z - 
self.data.mocap_pos[0, 2] xyz_action = np.hstack((xy_action, delta_z)) self.set_xyz_action(xyz_action) def discretize_goal_space(self, goals=None): if goals is None: self.discrete_goals = [self.default_goal] else: assert len(goals) >= 1 self.discrete_goals = goals # update the goal_space to a Discrete space self.discrete_goal_space = Discrete(len(self.discrete_goals)) # Belows are methods for using the new wrappers. # `sample_goals` is implmented across the sawyer_xyz # as sampling from the task lists. This will be done # with the new `discrete_goals`. After all the algorithms # conform to this API (i.e. using the new wrapper), we can # just remove the underscore in all method signature. def sample_goals_(self, batch_size): if self.discrete_goal_space is not None: return [ self.discrete_goal_space.sample() for _ in range(batch_size) ] else: return [self.goal_space.sample() for _ in range(batch_size)] def set_goal_(self, goal): if self.discrete_goal_space is not None: self.active_discrete_goal = goal self.goal = self.discrete_goals[goal] self._state_goal_idx = np.zeros(len(self.discrete_goals)) self._state_goal_idx[goal] = 1. else: self.goal = goal def set_init_config(self, config): assert isinstance(config, dict) for key, val in config.items(): self.init_config[key] = val ''' Functions that are copied and pasted everywhere and seems to be not used. ''' def sample_goals(self, batch_size): '''Note: should be replaced by sample_goals_ if not used''' # Required by HER-TD3 goals = self.sample_goals_(batch_size) if self.discrete_goal_space is not None: goals = [self.discrete_goal_space[g].copy() for g in goals] return { 'state_desired_goal': goals, } def sample_task(self): '''Note: this can be replaced by sample_goal_(batch_size=1)''' goal = self.sample_goals_(1) if self.discrete_goal_space is not None: return self.discrete_goals[goal] else: return goal def _set_obj_xyz_quat(self, pos, angle): quat = quat_create(np.array([0, 0, .1]), angle) qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[9:12] = pos.copy() qpos[12:16] = quat.copy() qvel[9:15] = 0 self.set_state(qpos, qvel) def _set_obj_xyz(self, pos): qpos = self.data.qpos.flat.copy() qvel = self.data.qvel.flat.copy() qpos[9:12] = pos.copy() qvel[9:15] = 0 self.set_state(qpos, qvel)
class FDEnvSelHeur(Env):
    def __init__(self,
                 num_heuristics: int,
                 host: str = '',
                 port: int = 12345,
                 num_steps=None,
                 state_type: Union[int, StateType] = StateType.RAW,
                 seed: int = 12345,
                 max_rand_steps: int = 0,
                 config_dir: str = '.',
                 port_file_id=None,
                 use_general_state_info: bool = True,
                 time_step_limit: int = -1):
        """
        Initialize environment
        """
        self._heuristic_state_features = [
            'Average Value',
            # 'Dead Ends Reliable',
            'Max Value',
            'Min Value',
            'Open List Entries',
            'Varianz'
        ]
        self.action_space = Discrete(num_heuristics)

        self._general_state_features = [
            # 'evaluated_states',
            'evaluations',
            'expanded_states',
            # 'generated_ops',
            # 'generated_states',
            'num_variables',
            # 'registered_states',
            'reopened_states',
            # "cg_num_eff_to_eff", "cg_num_eff_to_pre", "cg_num_pre_to_eff"
        ]

        total_state_features = (num_heuristics *
                                len(self._heuristic_state_features))
        self._use_gsi = use_general_state_info
        if use_general_state_info:
            total_state_features += len(self._general_state_features)
        self.observation_space = Box(
            low=np.array([-np.inf for _ in range(total_state_features)]),
            high=np.array([np.inf for _ in range(total_state_features)]),
            dtype=np.float32)

        self.__skip_transform = [False for _ in range(total_state_features)]
        if use_general_state_info:
            self.__skip_transform[4] = True  # skip num_variables transform
            self.__skip_transform[7] = True
            self.__skip_transform[8] = True
            self.__skip_transform[9] = True

        self.__num_heuristics = num_heuristics
        self.host = host
        self.port = port

        self.socket = None
        self.conn = None

        self._prev_state = None
        self.num_steps = num_steps
        self.time_step_limit = time_step_limit

        self.__state_type = StateType(state_type)
        self.__norm_vals = []
        self._config_dir = config_dir
        self._port_file_id = port_file_id

        self._transformation_func = None
        # create the state transformation function with inputs
        # (current value, previous value, normalization value, skip flag)
        if self.__state_type == StateType.DIFF:
            self._transformation_func = lambda x, y, z, skip: x - y if not skip else x
        elif self.__state_type == StateType.ABSDIFF:
            self._transformation_func = lambda x, y, z, skip: abs(
                x - y) if not skip else x
        elif self.__state_type == StateType.NORMAL:
            self._transformation_func = lambda x, y, z, skip: FDEnvSelHeur._save_div(
                x, z) if not skip else x
        elif self.__state_type == StateType.NORMDIFF:
            self._transformation_func = lambda x, y, z, skip: \
                FDEnvSelHeur._save_div(x, z) - FDEnvSelHeur._save_div(y, z) if not skip else x
        elif self.__state_type == StateType.NORMABSDIFF:
            self._transformation_func = lambda x, y, z, skip: \
                abs(FDEnvSelHeur._save_div(x, z) - FDEnvSelHeur._save_div(y, z)) if not skip else x

        self.rng = np.random.RandomState(seed=seed)
        self.max_rand_steps = max_rand_steps

        self.__step = 0
        self.__start_time = None
        # Starts as True: before a normal reset, an episode is expected to have finished.
        self.done = True

    @staticmethod
    def _save_div(a, b):
        return np.divide(a, b, out=np.zeros_like(a), where=b != 0)

    def send_msg(self, msg: bytes):
        """
        Send message and prepend the message size

        Based on comment from SO see [1]
        [1] https://stackoverflow.com/a/17668009

        :param msg: The message as bytes
        """
        # Prefix each message with its length, encoded as a zero-padded
        # 4-character decimal string
        msg = str.encode("{:>04d}".format(len(msg))) + msg
        self.conn.sendall(msg)

    def recv_msg(self):
        """
        Receive a whole message. The message has to be prepended with its total size
        Based on comment from SO see [1]
        [1] https://stackoverflow.com/a/17668009
        """
        # Read message length and unpack it into an integer
        raw_msglen = self.recvall(4)
        if not raw_msglen:
            return None
        msglen = int(raw_msglen.decode())
        # Read the message data
        return self.recvall(msglen)

    def recvall(self, n: int):
        """
        Given that we know the size we want to receive, we can receive exactly that many bytes.

        Based on comment from SO see [1]
        [1] https://stackoverflow.com/a/17668009

        :param n: Number of bytes to expect in the data
        """
        # Helper function to recv n bytes or return None if EOF is hit
        data = b''
        while len(data) < n:
            packet = self.conn.recv(n - len(data))
            if not packet:
                return None
            data += packet
        return data

    def _process_data(self):
        """
        Split the received JSON-like message into state, reward and done flag
        :return:
        """
        msg = self.recv_msg().decode()
        # print("----------------------------")
        # print(msg)
        # print("=>")
        msg = msg.replace('-inf', '0')
        msg = msg.replace('inf', '0')
        # print(msg)
        data = eval(msg)
        r = data['reward']
        done = data['done']
        del data['reward']
        del data['done']

        state = []

        if self._use_gsi:
            for feature in self._general_state_features:
                state.append(data[feature])
        for heuristic_id in range(
                self.__num_heuristics):  # process heuristic data
            for feature in self._heuristic_state_features:
                state.append(data["%d" % heuristic_id][feature])

        if self._prev_state is None:
            self.__norm_vals = deepcopy(state)
            self._prev_state = deepcopy(state)
        if self.__state_type != StateType.RAW:  # Transform state to DIFF state or normalize
            tmp_state = state
            state = list(
                map(self._transformation_func, state, self._prev_state,
                    self.__norm_vals, self.__skip_transform))
            self._prev_state = tmp_state
        return np.array(state), r, done

    def step(self, action: typing.Union[int, typing.List[int]]):
        """
        Play RL-Action
        :param action:
        :return:
        """
        self.__step += 1
        if not np.issubdtype(
                type(action),
                np.integer):  # check for core int and any numpy-int
            try:
                action = action[0]
            except IndexError as e:
                print(type(action))
                raise e
        if self.num_steps:
            msg = ','.join([str(action), str(self.num_steps)])
        else:
            msg = str(action)
        self.send_msg(str.encode(msg))
        s, r, d = self._process_data()
        info = {}
        if d:
            self.done = True
            self.kill_connection()
        elif self.time_step_limit > 0 and self.__step > self.time_step_limit:
            # Only enforce the limit when one is set (the default of -1 means
            # no limit) and the episode is not already done.
            info['needs_reset'] = True
            self.send_msg(str.encode("END"))
            self.kill_connection()
            self.done = True

        return s, r, d, info

    def reset(self):
        """
        Initialize FD
        :return:
        """
        self._prev_state = None
        self.__step = 0
        self.__start_time = time.time()
        if not self.done:  # This means we interrupt FD before a plan was found
            # Inform FD about imminent shutdown of the connection
            self.send_msg(str.encode("END"))
        self.done = False
        if self.conn:
            self.conn.shutdown(2)
            self.conn.close()
            self.conn = None

        if not self.socket:
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self.socket.bind((self.host, self.port))

        # write down the port such that FD can potentially read where to connect to
        if self._port_file_id:
            fp = joinpath(self._config_dir,
                          'port_{:d}.txt'.format(self._port_file_id))
        else:
            fp = joinpath(self._config_dir, 'port.txt')
        with open(fp, 'w') as portfh:
            portfh.write(str(self.port))
        print(fp)

        self.socket.listen()
        self.conn, address = self.socket.accept()
        s, _, _ = self._process_data()
        if self.max_rand_steps > 1:
            for _ in range(self.rng.randint(1, self.max_rand_steps + 1)):
                s, _, _, _ = self.step(self.action_space.sample())
        else:
            s, _, _, _ = self.step(0)  # hard coded to zero as initial step

        # remove the port file such that there is no chance of loading the old port
        remove(fp)
        return s

    def kill_connection(self):
        """Kill the connection"""
        if self.conn:
            self.conn.shutdown(2)
            self.conn.close()
            self.conn = None
        if self.socket:
            self.socket.shutdown(2)
            self.socket.close()
            self.socket = None

    def close(self):
        """
        Needs to "kill" the environment
        :return:
        """
        self.kill_connection()

    def render(self, mode: str = 'human') -> None:
        """
        Required by gym.Env but not implemented
        :param mode:
        :return: None
        """
        pass

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
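
# A minimal, self-contained sketch of the length-prefixed framing used by
# `send_msg` / `recv_msg` / `recvall` above: every payload is prefixed with its
# size encoded as a 4-character decimal string. The `send_framed` / `recv_exact` /
# `recv_framed` names are hypothetical and the demo runs over a local socketpair;
# it only illustrates the wire format, not the FD environment itself.
import socket


def send_framed(conn, payload: bytes):
    """Send the payload prefixed with its length as a zero-padded 4-digit string."""
    conn.sendall("{:>04d}".format(len(payload)).encode() + payload)


def recv_exact(conn, n: int) -> bytes:
    """Receive exactly n bytes, or raise if the peer closes early."""
    data = b''
    while len(data) < n:
        packet = conn.recv(n - len(data))
        if not packet:
            raise ConnectionError("connection closed before full message arrived")
        data += packet
    return data


def recv_framed(conn) -> bytes:
    """Read the 4-digit length header, then the payload."""
    msglen = int(recv_exact(conn, 4).decode())
    return recv_exact(conn, msglen)


if __name__ == '__main__':
    a, b = socket.socketpair()
    send_framed(a, b'{"reward": -1, "done": false}')
    print(recv_framed(b))  # b'{"reward": -1, "done": false}'
    a.close()
    b.close()
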