class IdentityEnv(Env):
    def __init__(self, dim, ep_length=100):
        self.action_space = Discrete(dim)
        self.ep_length = ep_length
        self.current_step = 0
        self.reset()

    def reset(self):
        self.current_step = 0
        self._choose_next_state()
        self.observation_space = self.action_space
        return self.state

    def step(self, actions):
        rew = self._get_reward(actions)
        self._choose_next_state()
        self.current_step += 1
        # terminate after ep_length steps (previously the episode never ended
        # and the ep_length argument was silently ignored)
        done = self.current_step >= self.ep_length
        return self.state, rew, done, {}

    def _choose_next_state(self):
        self.state = self.action_space.sample()

    def _get_reward(self, actions):
        return 1 if self.state == actions else 0
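# A minimal smoke-test sketch for IdentityEnv above (hypothetical driver code;
# it only assumes the class as defined). The optimal policy simply echoes the
# previous observation back as the action.
env = IdentityEnv(dim=4)
obs = env.reset()
done, total = False, 0
while not done:
    obs, rew, done, _ = env.step(obs)  # echo the observation to earn reward 1
    total += rew
print(total)  # equals ep_length (100) under the echo policy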
class AdaptiveLearningEnv(gym.Env):
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, filename='activities.pkl'):
        self.filename = filename
        self.assets_dir = os.path.dirname(os.path.abspath(__file__))
        self.reward_range = (0, 1)
        self.viewer = None
        self.circle_indexs = []
        self.ob = None
        self._configure()
        self._seed()
        self._reset()

    def _configure(self):
        self._load_activities()
        self.action_space = Discrete(len(self.activities))
        self.observation_space = Box(0, 1, len(self.knowledges))
        # `_step` calls self.simulator.progress(), so the simulator must exist
        self.simulator = StudentSimulator()

    def _load_activities(self):
        data_file = os.path.join(self.assets_dir, 'assets/%s' % self.filename)
        with open(data_file, 'rb') as pkl_file:
            self.knowledges = pickle.load(pkl_file)
            self.activities = pickle.load(pkl_file)

    def _step(self, action):
        assert self.action_space.contains(action)
        a = Activity(self.activities[action], self.knowledges)
        ob, reward, done = self.simulator.progress(self.ob, a)
        self.ob = ob
        return ob, reward, done, {}

    def _reset(self):
        # low == high == 0.1, so every knowledge starts at mastery 0.1
        self.ob = Box(0.1, 0.1, len(self.knowledges)).sample()
        return self.ob

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400
        radius = 10
        init_alpha = 0.1
        margin = radius * 2.5
        max_per_line = (screen_width - margin * 2) / margin
        colors = np.array([[78, 191, 126],
                           [254, 178, 45],
                           [175, 101, 194]]) / 255.

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            for (i, x) in enumerate(sorted(
                    self.knowledges,
                    key=lambda tup: (tup.level(), tup.group),
                    reverse=True)):
                h, w = divmod(i, max_per_line)
                self.circle_indexs.append(x._id)
                w = screen_width - 20 - w * (margin + 10)
                h = screen_height - 20 - h * (margin + 10)
                t = self.viewer.draw_circle(radius)
                t.add_attr(rendering.Transform((w, h)))
                r, g, b = colors[x.level() - 1]
                t.set_color(r, g, b, init_alpha)
                self.viewer.add_geom(t)

        for i, x in enumerate(self.ob):
            if len(self.circle_indexs) != 0:
                t = self.viewer.geoms[self.circle_indexs.index(i)]
                k = self.knowledges[i]
                r, g, b = colors[k.level() - 1]
                t.set_color(r, g, b, x)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def action(self, state: Box, action_space: Discrete) -> int:
    if self._exploration_policy.should_explore():
        return action_space.sample()
    else:
        predict = self._model.predict(np.array([state]))
        return np.argmax(predict).item()
def getActionSpace(self, agentIDs):
    actSpace = {}
    for agent in agentIDs:
        actSpace[agent] = Discrete(len(ACTION_MAP))
    # actSpace['state'] = Discrete(len(ACTION_MAP))
    return actSpace
def __init__(self, initial_stacks=100, small_blind=1, big_blind=2,
             render=False, funds_plot=True, max_raising_rounds=2,
             use_cpp_montecarlo=False):
    """
    The table needs to be initialized once at the beginning

    Args:
        initial_stacks (real): initial stacks per player
        small_blind (real)
        big_blind (real)
        render (bool): render table after each move in graphical format
        funds_plot (bool): show plot of funds history at end of each episode
        max_raising_rounds (int): max raises per round per player
        use_cpp_montecarlo (bool): use the compiled C++ Monte Carlo equity
            calculator instead of the Python one
    """
    if use_cpp_montecarlo:
        import cppimport
        calculator = cppimport.imp("tools.montecarlo_cpp.pymontecarlo")
        get_equity = calculator.montecarlo
    else:
        from tools.montecarlo_python import get_equity
    self.get_equity = get_equity
    self.use_cpp_montecarlo = use_cpp_montecarlo
    self.num_of_players = 0
    self.small_blind = small_blind
    self.big_blind = big_blind
    self.render_switch = render
    self.players = []
    self.table_cards = None
    self.dealer_pos = None
    self.player_status = []  # one hot encoded
    self.current_player = None
    self.player_cycle = None  # cycle iterator
    self.stage = None
    self.last_player_pot = None
    self.viewer = None
    self.player_max_win = None  # used for side pots
    self.second_round = False
    self.last_caller = None
    self.last_raiser = None
    self.raisers = []
    self.callers = []
    self.played_in_round = None
    self.min_call = None
    self.community_data = None
    self.player_data = None
    self.stage_data = None
    self.deck = None
    self.action = None
    self.winner_ix = None
    self.initial_stacks = initial_stacks
    self.acting_agent = None
    self.funds_plot = funds_plot
    self.max_round_raising = max_raising_rounds

    # pots
    self.community_pot = 0
    self.current_round_pot = 0  # the current betting round starts empty
    self.player_pots = None  # individual player pots

    self.observation = None
    self.reward = None
    self.info = None
    self.done = False
    self.funds_history = None
    self.array_everything = None
    self.legal_moves = None
    self.illegal_move_reward = -1_000_000
    self.action_space = Discrete(len(Action) - 2)
    self.first_action_for_hand = None
from collections import OrderedDict

import numpy as np
import pytest

from gym.spaces import Box, Dict, Discrete, MultiBinary, MultiDiscrete, \
    Tuple, utils

spaces = [
    Discrete(3),
    Box(low=0.0, high=np.inf, shape=(2, 2)),
    Box(low=0.0, high=np.inf, shape=(2, 2), dtype=np.float16),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([
        Discrete(5),
        Box(low=np.array([0, 0]), high=np.array([1, 5]), dtype=np.float32),
    ]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    MultiDiscrete([2, 2, 10]),
    MultiBinary(10),
    Dict({
        "position": Discrete(5),
        "velocity": Box(low=np.array([0, 0]), high=np.array([1, 5]),
                        dtype=np.float32),
    }),
]

flatdims = [3, 4, 4, 15, 7, 9, 14, 10, 7]


@pytest.mark.parametrize(["space", "flatdim"], zip(spaces, flatdims))
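# A quick sanity check of the expected flat dimensions above (a sketch
# assuming the classic gym API, where Discrete(n) flattens to an n-dim
# one-hot vector and MultiDiscrete flattens each component the same way):
assert utils.flatdim(Discrete(3)) == 3  # one-hot of size 3
assert utils.flatdim(Tuple([Discrete(5), Discrete(10)])) == 5 + 10
assert utils.flatdim(MultiDiscrete([2, 2, 10])) == 2 + 2 + 10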
class RockEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, board_size=7, num_rocks=8, use_heuristic=False,
                 observation='o', stay_inside=False):
        """
        :param board_size: int board is a square of board_size x board_size
        :param num_rocks: int number of rocks on board
        :param use_heuristic: bool usage unclear
        :param observation: str must be one of
            'o': observed value only
            'oa': the observed value + the action taken
            'po': position of the agent + the observed value
            'poa': the above + the action taken
        """
        assert board_size in list(config.keys()) and \
            num_rocks == len(config[board_size]["rock_pos"])

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = \
            [Coord(*rock) for rock in config[board_size]['rock_pos']]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0
        if stay_inside:
            self._out_of_bounds_penalty = 0
        else:
            self._out_of_bounds_penalty = self._penalization

        self.state = None
        self.last_action = None
        self.done = False
        self.gui = None

        assert observation in ['o', 'oa', 'po', 'poa']
        if observation == 'o':
            self._make_obs = lambda obs, a: obs
            self.observation_space = Discrete(len(Obs))
        elif observation == 'oa':
            self._make_obs = self._oa
            self.observation_space = \
                Box(low=0,
                    high=np.append(max(Obs), np.ones(self.action_space.n)),
                    dtype=int)
        elif observation == 'po':
            self._make_obs = self._po
            self.observation_space = \
                Box(low=0,
                    high=np.append(np.ones(self.grid.n_tiles), max(Obs)),
                    dtype=int)
        elif observation == 'poa':
            self._make_obs = self._poa
            self.observation_space = \
                Box(low=0,
                    high=np.concatenate((np.ones(self.grid.n_tiles),
                                         [max(Obs)],
                                         np.ones(self.action_space.n))),
                    dtype=int)

    def seed(self, seed=None):
        np.random.seed(seed)

    def step(self, action: int):
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg
        assert self.done is False

        self.last_action = action
        self._query += 1

        reward = 0
        ob = Obs.NULL

        if action < Action.SAMPLE:
            if action == Action.EAST:
                if self.state.agent_pos.x + 1 < self.grid.x_size:
                    self.state.agent_pos += Moves.EAST.value
                else:
                    # moving east off the board ends the episode with a bonus
                    reward = 10
                    self.done = True
                    ob = self._make_obs(ob, action)
                    return ob, reward, self.done, {
                        "state": self._encode_state(self.state)
                    }
            elif action == Action.NORTH:
                if self.state.agent_pos.y + 1 < self.grid.y_size:
                    self.state.agent_pos += Moves.NORTH.value
                else:
                    reward = self._out_of_bounds_penalty
            elif action == Action.SOUTH:
                if self.state.agent_pos.y - 1 >= 0:
                    self.state.agent_pos += Moves.SOUTH.value
                else:
                    reward = self._out_of_bounds_penalty
            elif action == Action.WEST:
                if self.state.agent_pos.x - 1 >= 0:
                    self.state.agent_pos += Moves.WEST.value
                else:
                    reward = self._out_of_bounds_penalty
            else:
                raise NotImplementedError()

        if action == Action.SAMPLE:
            rock = self.grid[self.state.agent_pos]
            if rock >= 0 and not self.state.rocks[rock].status == 0:  # collected
                if self.state.rocks[rock].status == 1:
                    reward = 10
                else:
                    reward = -10
                self.state.rocks[rock].status = 0
            else:
                reward = self._penalization

        if action > Action.SAMPLE:
            rock = action - Action.SAMPLE - 1
            assert rock < self.num_rocks
            ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock])
            self.state.rocks[rock].measured += 1

            eff = self._efficiency(self.state.agent_pos,
                                   self.state.rocks[rock].pos)

            if ob == Obs.GOOD:
                self.state.rocks[rock].count += 1
                self.state.rocks[rock].lkv *= eff
                self.state.rocks[rock].lkw *= (1 - eff)
            else:
                self.state.rocks[rock].count -= 1
                self.state.rocks[rock].lkw *= eff
                self.state.rocks[rock].lkv *= (1 - eff)

            denominator = (.5 * self.state.rocks[rock].lkv) + \
                (.5 * self.state.rocks[rock].lkw) + 1e-10
            self.state.rocks[rock].prob_valuable = \
                (.5 * self.state.rocks[rock].lkv) / denominator

        self.done = self._penalization == reward
        ob = self._make_obs(ob, action)
        return ob, reward, self.done, {"state": self._encode_state(self.state)}

    def _decode_state(self, state, as_array=False):
        agent_pos = Coord(*state['agent_pos'])
        rock_state = RockState(agent_pos)
        for r in state['rocks']:
            rock = Rock(pos=0)
            rock.__dict__.update(r)
            rock_state.rocks.append(rock)

        if as_array:
            rocks = []
            for rock in rock_state.rocks:
                rocks.append(rock.status)
            return np.concatenate([[self.grid.get_index(agent_pos)], rocks])

        return rock_state

    @staticmethod
    def _encode_state(state):
        # use dictionary for state encoding
        return _encode_dict(state)

    # rocks can take 3 values: -1, 1, 0 if collected
    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            msg = None
            if self.gui is None:
                start_pos = self.grid.get_index(self.state.agent_pos)
                obj_pos = [(self.grid.get_index(rock.pos), rock.status)
                           for rock in self.state.rocks]
                self.gui = RockGui((self.grid.x_size, self.grid.y_size),
                                   start_pos=start_pos, obj=obj_pos)

            if self.last_action > Action.SAMPLE:
                rock = self.last_action - Action.SAMPLE - 1
                msg = "Rock S: {} P:{}".format(self.state.rocks[rock].status,
                                               self.state.rocks[rock].pos)
            agent_pos = self.grid.get_index(self.state.agent_pos)
            self.gui.render(agent_pos, msg)

    def reset(self):
        self.done = False
        self._query = 0
        self.last_action = Action.SAMPLE
        self.state = self._get_init_state(should_encode=False)
        return self._make_obs(Obs.NULL, self.last_action)

    def _set_state(self, state):
        self.done = False
        self.state = self._decode_state(state)

    def close(self):
        self.render(close=True)

    def _compute_prob(self, action, next_state, ob):
        next_state = self._decode_state(next_state)
        if action <= Action.SAMPLE:
            return int(ob == Obs.NULL)

        eff = self._efficiency(
            next_state.agent_pos,
            next_state.rocks[action - Action.SAMPLE - 1].pos)

        if ob == Obs.GOOD and \
                next_state.rocks[action - Action.SAMPLE - 1].status == 1:
            return eff
        elif ob == Obs.BAD and \
                next_state.rocks[action - Action.SAMPLE - 1].status == -1:
            return eff
        else:
            return 1 - eff

    def _get_init_state(self, should_encode=True):
        rock_state = RockState(self._agent_pos)
        for idx in range(self.num_rocks):
            rock_state.rocks.append(Rock(self._rock_pos[idx]))
        return self._encode_state(rock_state) if should_encode else rock_state

    def _generate_legal(self):
        legal = [Action.EAST]  # can always go east
        if self.state.agent_pos.y + 1 < self.grid.y_size:
            legal.append(Action.NORTH)
        if self.state.agent_pos.y - 1 >= 0:
            legal.append(Action.SOUTH)
        if self.state.agent_pos.x - 1 >= 0:
            legal.append(Action.WEST)

        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0:
            legal.append(Action.SAMPLE)

        for rock in self.state.rocks:
            assert self.grid[rock.pos] != -1
            if rock.status != 0:
                legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE)
        return legal

    def _generate_preferred(self, history):
        if not self._use_heuristic:
            return self._generate_legal()

        actions = []

        # sample rocks with high likelihood of being good
        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0 and history.size:
            total = 0
            for t in range(history.size):
                if history[t].action == rock + 1 + Action.SAMPLE:
                    if history[t].ob == Obs.GOOD:
                        total += 1
                    elif history[t].ob == Obs.BAD:
                        total -= 1
            if total > 0:
                actions.append(Action.SAMPLE)
                return actions

        # process the rocks
        all_bad = True
        direction = {
            "north": False,
            "south": False,
            "west": False,
            "east": False
        }
        for idx in range(self.num_rocks):
            rock = self.state.rocks[idx]
            if rock.status != 0:
                total = 0
                for t in range(history.size):
                    if history[t].action == idx + 1 + Action.SAMPLE:
                        if history[t].ob == Obs.GOOD:
                            total += 1
                        elif history[t].ob == Obs.BAD:
                            total -= 1
                if total >= 0:
                    all_bad = False

                    if rock.pos.y > self.state.agent_pos.y:
                        direction['north'] = True
                    elif rock.pos.y < self.state.agent_pos.y:
                        direction['south'] = True
                    elif rock.pos.x < self.state.agent_pos.x:
                        direction['west'] = True
                    elif rock.pos.x > self.state.agent_pos.x:
                        direction['east'] = True

        if all_bad:
            actions.append(Action.EAST)
            return actions

        # generate a random legal move, with the constraints:
        # - do not measure a collected rock
        # - do not measure a rock too often
        # - do not measure clearly bad rocks
        # - don't move in a direction that puts you closer to bad rocks
        # - never sample a rock (covered by the checks above)
        if self.state.agent_pos.y + 1 < self.grid.y_size and \
                direction['north']:
            actions.append(Action.NORTH)
        if direction['east']:
            actions.append(Action.EAST)
        if self.state.agent_pos.y - 1 >= 0 and direction['south']:
            actions.append(Action.SOUTH)
        if self.state.agent_pos.x - 1 >= 0 and direction['west']:
            actions.append(Action.WEST)

        for idx, rock in enumerate(self.state.rocks):
            if not rock.status == 0 and rock.measured < 5 and \
                    abs(rock.count) < 2 and 0 < rock.prob_valuable < 1:
                actions.append(idx + 1 + Action.SAMPLE)

        if len(actions) == 0:
            return self._generate_legal()
        return actions

    def __dict2np__(self, state):
        idx = self.grid.get_index(Coord(*state['agent_pos']))
        rocks = []
        for rock in state['rocks']:
            rocks.append(rock['status'])
        return np.concatenate([[idx], rocks])

    @staticmethod
    def _efficiency(agent_pos, rock_pos, hed=20):
        # TODO check me
        d = Grid.euclidean_distance(agent_pos, rock_pos)
        eff = (1 + pow(2, -d / hed)) * .5
        return eff

    @staticmethod
    def _select_target(rock_state, x_size):
        best_dist = x_size * 2
        best_rock = -1  # Coord(-1, -1)
        for idx, rock in enumerate(rock_state.rocks):
            if rock.status != 0 and rock.count >= 0:
                d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos)
                if d < best_dist:
                    best_dist = d
                    best_rock = idx  # rock.pos
        return best_rock

    @staticmethod
    def _sample_ob(agent_pos, rock, hed=20):
        eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed)
        if np.random.binomial(1, eff):
            return Obs.GOOD if rock.status == 1 else Obs.BAD
        else:
            return Obs.BAD if rock.status == 1 else Obs.GOOD

    def _po(self, o, _):
        obs = np.zeros(self.observation_space.shape[0])
        obs[self.grid.x_size * self.state.agent_pos.y +
            self.state.agent_pos.x] = 1.
        obs[self.grid.n_tiles] = o
        return obs

    def _poa(self, o, a):
        obs = self._po(o, a)
        obs[self.grid.n_tiles + a] = 1.
        return obs

    def _oa(self, o, a):
        obs = np.zeros(self.observation_space.shape[0])
        obs[0] = o
        obs[1 + a] = 1.
        return obs
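# A hedged rollout sketch for the RockSample env above (assumes the
# module-level config/Action/Obs definitions this class was written against):
import random

env = RockEnv(board_size=7, num_rocks=8, observation='o')
ob = env.reset()
done = False
while not done:
    # sample uniformly among the currently legal moves
    ob, reward, done, info = env.step(random.choice(env._generate_legal()))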
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')
    max_num_actions = 10
    # dummy destination '.' to be paired with 'OVER', 'YES', 'NO'
    parameters = [".", "yes", "no", "starbucks", "peets"]
    # parameters = [".", "yes", "no", "starbucks", "peets", "ralphs",
    #               "traderjoes", "wholefoods", "walmart", "cvs", "toysrus",
    #               "applestore", "bestbuy"]

    # Action spaces
    passenger_actions = ["SAY", "OVER"]
    passenger_action_space = Tuple(
        [Discrete(len(passenger_actions)), Discrete(len(parameters))])

    driver_actions = ["CONFIRM", "DRIVE"]
    driver_action_space = Tuple(
        [Discrete(len(driver_actions)), Discrete(len(parameters))])

    # Observation spaces
    passenger_observation_space = Dict({
        'dialog_history': Repeated(
            Discrete(len(agents)),
            Tuple([Discrete(len(passenger_actions)),
                   Discrete(len(parameters))]),
            max_len=max_num_actions),
        'destination': Discrete(len(parameters))
    })

    driver_observation_space = Dict({
        'dialog_history': Repeated(
            Discrete(len(agents)),
            Tuple([Discrete(len(passenger_actions)),
                   Discrete(len(parameters))]),
            max_len=max_num_actions)
    })

    perfect_dialogs = [
        # ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
        #                ('DRIVE', 'starbucks')]),
        # ("peets", [('SAY', 'peets'), ('OVER', '.'), ('DRIVE', 'peets')]),
        ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
                       ('CONFIRM', 'starbucks'), ('DRIVE', 'starbucks')]),
        ("peets", [('SAY', 'peets'), ('OVER', '.'),
                   ('CONFIRM', 'peets'), ('DRIVE', 'peets')]),
        # ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
        #                ('CONFIRM', 'starbucks'), ('OVER', '.'),
        #                ('YES', '.'), ('OVER', '.'), ('DRIVE', 'starbucks')]),
        # ("peets", [('SAY', 'peets'), ('OVER', '.'),
        #            ('CONFIRM', 'peets'), ('OVER', '.'),
        #            ('YES', '.'), ('OVER', '.'), ('DRIVE', 'peets')]),
    ]

    def __init__(self, config):
        self.is_supervised = False
        destination_id = random.randint(3, len(NanoworldEnv.parameters) - 1)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.num_episodes = 0
        self.supervised_episodes = 10000
        self.rewards = dict()
        self.print_episodes = 10000

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))
        if self.num_episodes >= self.print_episodes:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1

        # select the destination
        if self.is_supervised and self.num_episodes < self.supervised_episodes:
            a_list = [3, 4]
            distribution = [.5, .5]
            destination_id = random.choices(a_list, distribution)[0]
        else:
            destination_id = random.randint(
                3, len(NanoworldEnv.parameters) - 1)

        if self.num_episodes >= self.print_episodes:
            logger.warning('set destination: ' +
                           NanoworldEnv.parameters[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.obs = {'passenger': self.state.make_passenger_observation()}
        return self.obs

    def driver_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.driver_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.passenger_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_driver_reward(self):
        driver_reward = 0
        _, verbal_history, _, driven_destination = \
            self.state.get_global_state()
        if self.state.is_done():  # compute only at the very end
            if self.state.dialog_complete:
                if driven_destination:  # completed by a final drive action
                    if len(verbal_history) == 0:
                        # driver drives before the user says anything
                        driver_reward += -1
                    else:
                        last_uttered_destination = \
                            verbal_history[-1].split(" ")[1]
                        if driven_destination == last_uttered_destination:
                            driver_reward += 1
                        else:
                            driver_reward += -1
            else:  # timeout
                driver_reward += -10
        else:  # dialog not yet over
            driver_reward += 0
        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            driver_reward += self.compositional_supervision_reward()
        return driver_reward

    def compute_passenger_reward(self):
        desired_destination, verbal_history, _, driven_destination = \
            self.state.get_global_state()
        passenger_reward = 0
        if self.state.is_done():  # compute only at the very end
            if self.state.dialog_complete:
                # completed by a final drive action
                if desired_destination == driven_destination:
                    passenger_reward += 1
                else:
                    passenger_reward += -1
            else:  # timeout
                passenger_reward += -10
        else:  # dialog not yet over
            passenger_reward += 0
        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            passenger_reward += self.compositional_supervision_reward()
        return passenger_reward

    def compute_supervision_reward(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(dialog_so_far):
                # and len(all_actions) > 2:
                return 1
        return 0

    def compositional_supervision_reward(self):
        desired_dest, _, all_actions_sofar, _ = self.state.get_global_state()
        all_actions_sofar = " ".join(
            [action.split(" ")[0] for action in all_actions_sofar])
        perfect_dialog = " ".join(
            [x[0] for x in NanoworldEnv.perfect_dialogs[0][1]])
        if perfect_dialog.startswith(all_actions_sofar):
            return 1
        else:
            return 0

    # any kind of exploration is punished, so a negative reward in
    # supervision is bad:
    def compute_supervision_reward_negative(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(dialog_so_far):
                # and len(all_actions) > 2:
                return 1
        return -1

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        driver_obs = None
        passenger_obs = None
        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])

        if self.state.turn == 0:
            passenger_obs = self.state.make_passenger_observation()
            driver_obs = None
        elif self.state.turn == 1:
            driver_obs = self.state.make_driver_observation()
            passenger_obs = None

        self.obs = {}
        self.rewards = {}
        if passenger_obs:
            self.obs['passenger'] = passenger_obs
            self.rewards['passenger'] = self.compute_passenger_reward()
        if driver_obs:
            self.obs['driver'] = driver_obs
            self.rewards['driver'] = self.compute_driver_reward()

        self.dones = {'__all__': self.state.is_done()}
        if self.state.is_done():
            self.obs['passenger'] = self.state.make_passenger_observation()
            self.rewards['passenger'] = self.compute_passenger_reward()
            self.obs['driver'] = self.state.make_driver_observation()
            self.rewards['driver'] = self.compute_driver_reward()

        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos
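# For reference, a small self-contained illustration of the composite Tuple
# action space the agents above use: each action is an
# (action-id, parameter-id) pair.
from gym.spaces import Discrete, Tuple

passenger_space = Tuple([Discrete(2), Discrete(5)])  # (SAY/OVER, parameter)
a1, a2 = passenger_space.sample()  # e.g. (0, 3) decodes to ('SAY', 'starbucks')
assert passenger_space.contains((a1, a2))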
import gym
from gym.spaces import Box, Discrete, Tuple, Dict
from gym.envs.registration import EnvSpec
import numpy as np
import sys

import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.test.test_multi_agent_env import MultiCartpole, \
    MultiMountainCar
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "tuple": Tuple(
        [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
print(f"Running with following CLI args: {args}") return args if __name__ == "__main__": args = get_cli_args() ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode) # main part: configure the ActionMaskEnv and ActionMaskModel config = { # random env with 100 discrete actions and 5x [-1,1] observations # some actions are declared invalid and lead to errors "env": ActionMaskEnv, "env_config": { "action_space": Discrete(100), "observation_space": Box(-1.0, 1.0, (5, )), }, # the ActionMaskModel retrieves the invalid actions and avoids them "model": { "custom_model": ActionMaskModel if args.framework != "torch" else TorchActionMaskModel, # disable action masking according to CLI "custom_model_config": { "no_masking": args.no_masking }, }, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "framework": args.framework, # Run with tracing enabled for tfe/tf2?
def action_space(self):
    return Discrete(5)
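# For reference, Discrete(n) represents the integers {0, ..., n - 1}; a
# minimal self-contained sketch of its behaviour:
from gym.spaces import Discrete

space = Discrete(5)
assert space.n == 5
assert space.contains(3)
assert not space.contains(5)  # 5 is out of range
sample = space.sample()       # uniform draw from {0, ..., 4}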
class ConnectFourEnv(Env):
    r"""
    An adversarial environment for playing the `Connect-Four game
    <https://en.wikipedia.org/wiki/Connect_Four>`_.

    Attributes
    ----------
    action_space : gym.spaces.Discrete(7)
        The action space.

    observation_space : MultiDiscrete(nvec)
        The state observation space, representing the position of the current
        player's tokens (``s[1:,:,0]``) and the other player's tokens
        (``s[1:,:,1]``) as well as a mask over the space of actions,
        indicating which actions are available to the current player
        (``s[0,:,0]``) or the other player (``s[0,:,1]``).

        **Note:** The "current" player is relative to whose turn it is, which
        means that the entries ``s[:,:,0]`` and ``s[:,:,1]`` swap between
        turns.

    max_time_steps : int
        Maximum number of timesteps within each episode.

    available_actions : array of int
        Array of available actions. This list shrinks when columns saturate.

    win_reward : 1.0
        The reward associated with a win.

    loss_reward : -1.0
        The reward associated with a loss.

    draw_reward : 0.0
        The reward associated with a draw.

    """  # noqa: E501

    # class attributes
    num_rows = 6
    num_cols = 7
    num_players = 2
    win_reward = 1.0
    loss_reward = -win_reward
    draw_reward = 0.0
    action_space = Discrete(num_cols)
    observation_space = MultiDiscrete(
        nvec=np.full((num_rows + 1, num_cols, num_players), 2, dtype='uint8'))
    max_time_steps = int(num_rows * num_cols)
    filters = np.array([
        [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1]],
        [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [0, 0, 0, 0]],
        [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0]],
        [[0, 0, 0, 0], [1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0]],
        [[1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
        [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]],
        [[0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0, 1, 0, 0]],
        [[0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 1, 0]],
        [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1]],
    ], dtype='uint8')

    def __init__(self):
        self._init_state()

    def reset(self):
        r"""
        Reset the environment to the starting position.

        Returns
        -------
        s : 3d-array, shape: [num_rows + 1, num_cols, num_players]
            A state observation, representing the position of the current
            player's tokens (``s[1:,:,0]``) and the other player's tokens
            (``s[1:,:,1]``) as well as a mask over the space of actions,
            indicating which actions are available to the current player
            (``s[0,:,0]``) or the other player (``s[0,:,1]``).

            **Note:** The "current" player is relative to whose turn it is,
            which means that the entries ``s[:,:,0]`` and ``s[:,:,1]`` swap
            between turns.

        """
        self._init_state()
        return self.state

    def step(self, a):
        r"""
        Take one step in the MDP, following the single-player convention from
        gym.

        Parameters
        ----------
        a : int, options: {0, 1, 2, 3, 4, 5, 6}
            The action to be taken. The action is the zero-based count of the
            possible insertion slots, starting from the left of the board.

        Returns
        -------
        s_next : array, shape [6, 7, 2]
            A next-state observation; see :func:`reset` for a description of
            the layout.

        r : float
            Reward associated with the transition
            :math:`(s, a)\to s_\text{next}`.

            **Note:** Since the "current" player is relative to whose turn it
            is, you need to be careful about aligning the rewards with the
            correct state or state-action pair. In particular, this reward
            :math:`r` is the one associated with :math:`s` and :math:`a`,
            i.e. *not* aligned with :math:`s_\text{next}`.

        done : bool
            Whether the episode is done.

        info : dict or None
            A dict with some extra information (or None).

        """
        if self.done:
            raise EpisodeDoneError("please reset env to start new episode")
        if not self.action_space.contains(a):
            raise ValueError(f"invalid action: {repr(a)}")
        if a not in self.available_actions:
            raise UnavailableActionError("action is not available")

        # swap players
        self._players = np.roll(self._players, -1)

        # update state
        self._state[self._levels[a], a] = self._players[0]
        self._prev_action = a

        # run logic
        self.done, reward = self._done_reward(a)
        return self.state, reward, self.done, {'state_id': self.state_id}

    def render(self, *args, **kwargs):
        r"""
        Render the current state of the environment.
        """
        # lookup for symbols
        symbol = {
            1: u'\u25CF',   # player 1 token (agent)
            2: u'\u25CB',   # player 2 token (adversary)
            -1: u'\u25BD',  # indicator for player 1's last action
            -2: u'\u25BC',  # indicator for player 2's last action
        }

        # render board
        hrule = '+---' * self.num_cols + '+\n'
        board = "  "
        board += "   ".join(
            symbol.get(-(a == self._prev_action) * self._players[1], " ")
            for a in range(self.num_cols))
        board += "  \n"
        board += hrule
        for i in range(self.num_rows):
            board += "| "
            board += " | ".join(
                symbol.get(self._state[i, j], " ")
                for j in range(self.num_cols))
            board += " |\n"
            board += hrule
        board += "  0   1   2   3   4   5   6  \n"  # actions

        print(board)

    @property
    def state(self):
        stacked_layers = np.stack((
            (self._state == self._players[0]).astype('uint8'),
            (self._state == self._players[1]).astype('uint8'),
        ), axis=-1)  # shape: [num_rows, num_cols, num_players]
        available_actions_mask = np.zeros(
            (1, self.num_cols, self.num_players), dtype='uint8')
        available_actions_mask[0, self.available_actions, :] = 1
        return np.concatenate((available_actions_mask, stacked_layers),
                              axis=0)

    @property
    def state_id(self):
        p = str(self._players[0])
        d = '1' if self.done else '0'
        if self._prev_action is None:
            a = str(self.num_cols)
        else:
            a = str(self._prev_action)
        s = ''.join(self._state.ravel().astype('str'))  # base-3 string
        s = '{:017x}'.format(int(s, 3))                 # 17-char hex string
        return p + d + a + s                            # 20-char hex string

    def set_state(self, state_id):
        # decode state id
        p = int(state_id[0], 16)
        d = int(state_id[1], 16)
        a = int(state_id[2], 16)
        assert p in (1, 2)
        assert d in (0, 1)
        assert self.action_space.contains(a) or a == self.num_cols
        self._players[0] = p      # 1 or 2
        self._players[1] = 3 - p  # 2 or 1
        self.done = d == 1
        self._prev_action = None if a == self.num_cols else a
        s = np.base_repr(int(state_id[3:], 16), 3)
        z = np.zeros(self.num_rows * self.num_cols, dtype='uint8')
        z[-len(s):] = np.array(list(s), dtype='uint8')
        self._state = z.reshape((self.num_rows, self.num_cols))
        self._levels = np.full(self.num_cols, self.num_rows - 1,
                               dtype='uint8')
        for j in range(self.num_cols):
            for i in self._state[::-1, j]:
                if i == 0:
                    break
                self._levels[j] -= 1

    @property
    def available_actions(self):
        actions = np.argwhere(
            (self._levels >= 0) & (self._levels < self.num_rows)).ravel()
        assert actions.size <= self.num_cols
        return actions

    @property
    def available_actions_mask(self):
        mask = np.zeros(self.num_cols, dtype='bool')
        mask[self.available_actions] = True
        return mask

    def _init_state(self):
        self._prev_action = None
        self._players = np.array([1, 2], dtype='uint8')
        self._state = np.zeros((self.num_rows, self.num_cols), dtype='uint8')
        self._levels = np.full(self.num_cols, self.num_rows - 1,
                               dtype='uint8')
        self.done = False

    def _done_reward(self, a):
        r"""
        Check whether the last action `a` by the current player resulted in a
        win or draw for player 1 (the agent). This contains the main logic
        and implements the rules of the game.
        """
        assert self.action_space.contains(a)

        # update filling levels
        self._levels[a] -= 1

        s = self._state == self._players[0]
        for i0 in range(2, -1, -1):
            i1 = i0 + 4
            for j0 in range(4):
                j1 = j0 + 4
                if np.any(np.tensordot(self.filters, s[i0:i1, j0:j1]) == 4):
                    return True, 1.0

        # check for a draw
        if len(self.available_actions) == 0:
            return True, 0.0

        # this is what's returned throughout the episode
        return False, 0.0
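# A hedged self-play sketch for ConnectFourEnv above: at each step the
# "current" player picks uniformly among the non-saturated columns.
env = ConnectFourEnv()
s = env.reset()
done = False
while not done:
    a = int(np.random.choice(env.available_actions))
    s, r, done, info = env.step(a)
env.render()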
def __init__(self, env_name, config, bootstrap_env=None):
    """
    :param env_name: Name of the environment to create
    :param config: Configuration to use
    :param bootstrap_env: An already created environment to reuse instead of
        constructing a new one
    """
    self.tolerance = 0.5
    self.env_type = None
    self.env_name = env_name
    self.config = config

    if env_name == 'MAB':
        # Mario Brother environment
        raise NotImplementedError()

    elif env_name == 'combolock':
        # Deterministic Combination Lock
        self.env_type = GenerateEnvironmentWrapper.RL_ACID
        self.thread_safe = True
        assert config["obs_dim"] == 3 * config["horizon"] + 2, \
            "Set obs_dim to -1 in config for auto selection"

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = CombinationLock(horizon=config["horizon"])

        # Reach both states at a given time step with probability at least
        # 0.5 (minus some tolerance)
        self.homing_policy_validation_fn = lambda dist, step: (
            str((0, step)) in dist and str((1, step)) in dist and
            dist[str((0, step))] > 50 - self.tolerance and
            dist[str((1, step))] > 50 - self.tolerance)

    elif env_name == 'stochcombolock':
        # Stochastic Combination Lock
        self.env_type = GenerateEnvironmentWrapper.RL_ACID
        self.thread_safe = True

        if config["noise"] == "bernoulli":
            self.noise_type = Environment.BERNOULLI
            assert config["obs_dim"] == 4 * config["horizon"] + 3, \
                "Set obs_dim to -1 in config for auto selection"
        elif config["noise"] == "gaussian":
            self.noise_type = Environment.GAUSSIAN
            assert config["obs_dim"] == 3 * config["horizon"] + 3, \
                "Set obs_dim to -1 in config for auto selection"
        else:
            raise AssertionError("Unhandled noise type %r" % self.noise_type)

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = StochasticCombinationLock(
                horizon=config["horizon"], swap=0.5,
                noise_type=self.noise_type)

        # Reach the two states with probability at least 0.25 each and the
        # third state with probability at least 0.5
        self.homing_policy_validation_fn = lambda dist, step: (
            str((0, step)) in dist and str((1, step)) in dist and
            str((2, step)) in dist and
            dist[str((0, step))] + dist[str((1, step))] >
            50 - self.tolerance and
            dist[str((2, step))] > 50 - self.tolerance)

    elif env_name == 'diabcombolock':
        # Diabolical Stochastic Combination Lock
        self.env_type = GenerateEnvironmentWrapper.RL_ACID
        self.thread_safe = True
        self.trajectories = []
        self.trajectory_cntr = 0
        self.num_envs = 1

        if config["noise"] == "bernoulli":
            self.noise_type = Environment.BERNOULLI
            assert config["obs_dim"] == 2 * config["horizon"] + 4, \
                "Set obs_dim to -1 in config for auto selection"
        elif config["noise"] == "gaussian":
            self.noise_type = Environment.GAUSSIAN
            assert config["obs_dim"] == config["horizon"] + 4, \
                "Set obs_dim to -1 in config for auto selection"
        elif config["noise"] == "hadamhard":
            self.noise_type = Environment.HADAMHARD
            assert config["obs_dim"] == get_sylvester_hadamhard_matrix_dim(
                config["horizon"] + 4), \
                "Set obs_dim to -1 in config for auto selection"
        elif config["noise"] == "hadamhardg":
            self.noise_type = Environment.HADAMHARDG
            assert config["obs_dim"] == get_sylvester_hadamhard_matrix_dim(
                config["horizon"] + 4), \
                "Set obs_dim to -1 in config for auto selection"
        else:
            raise AssertionError("Unhandled noise type %r" % config["noise"])

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = DiabolicalCombinationLock(
                horizon=config["horizon"], swap=0.5, num_actions=10,
                anti_shaping_reward=0.1, noise_type=self.noise_type)

        self.action_space = Discrete(10)
        self.reward_range = (0.0, 1.0)
        self.state_space = MultiBinary((config["horizon"] + 1) * 3)
        self.observation_space = Box(
            low=0.0, high=1.0, shape=(config["obs_dim"],), dtype=np.float64)
        self.metadata = None
        setattr(self.observation_space, 'n', config["obs_dim"])

        # Reach the two states with probability at least 0.25 each and the
        # third state with probability at least 0.5
        self.homing_policy_validation_fn = lambda dist, step: (
            str((0, step)) in dist and str((1, step)) in dist and
            str((2, step)) in dist and
            dist[str((0, step))] + dist[str((1, step))] >
            50 - self.tolerance and
            dist[str((2, step))] > 50 - self.tolerance)

    elif env_name == 'maze':
        # Maze world
        self.env_type = GenerateEnvironmentWrapper.RL_ACID
        self.thread_safe = True

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = RandomGridWorld(M=3, swap=0.1, dim=2, noise=0.0)

        self.homing_policy_validation_fn = None

    elif env_name == 'montezuma':
        # Montezuma's Revenge
        self.env_type = GenerateEnvironmentWrapper.OpenAIGym
        self.thread_safe = True
        self.num_repeat_action = 4  # Repeat each action this many times.

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = gym.make('MontezumaRevengeDeterministic-v4')

        # Since we don't have access to the underlying state in this problem,
        # we cannot define a validation function
        self.homing_policy_validation_fn = None

    elif env_name == 'gridworld' or env_name == 'gridworld-feat':
        # Grid World
        self.env_type = GenerateEnvironmentWrapper.GRIDWORLD
        self.thread_safe = True

        if bootstrap_env is not None:
            self.env = bootstrap_env
        else:
            self.env = GridWorld(num_grid_row=4, num_grid_col=4,
                                 horizon=config["horizon"],
                                 obs_dim=config["obs_dim"])

        reachable_states = self.env.get_reachable_states()
        num_states = self.env.get_num_states()

        self.homing_policy_validation_fn = lambda dist, step: all(
            [str(state) in dist and
             dist[str(state)] >=
             1.0 / float(max(1, num_states)) - self.tolerance
             for state in reachable_states[step]])

    else:
        raise AssertionError(
            "Environment name %r not in RL Acid Environments" % env_name)
class FREnv(Env):
    """ Flamme Rouge Environment """

    TRACKS = tuple(track for track in ALL_TRACKS if len(track) == 78)

    game: Game
    _track: Optional[Track]
    track: Track
    opponents: Tuple[Team, ...] = (
        Peloton(colors="red"),
        Muscle(colors="green"),
        # Simple(colors='black'),
        Heuristic(colors="white"),
    )

    reward_range = (-1, len(opponents))
    action_space = Discrete(len(FRAction))
    observation_space = AvailableActions(
        nb_actions=action_space.n,
        space=Box(low=-1, high=77, shape=(524, )),
    )

    def __init__(
            self,
            team: Team,
            opponents: Optional[Tuple[Team, ...]] = None,
            track: Optional[Track] = None,
    ) -> None:
        super().__init__()
        self.team = team
        self.opponents = opponents or self.opponents
        self._track = track

    def _play_others(self) -> None:
        while True:
            teams = [
                team for team in self.game.active_teams if team != self.team
            ]
            if not teams:
                return
            team = choice(teams)
            team_action = team.select_action(self.game)
            assert team_action is not None
            self.game.take_action(team, team_action)

    def reset(self) -> np.ndarray:
        self.track = self._track if self._track is not None else choice(
            FREnv.TRACKS)
        teams = (self.team, ) + self.opponents
        self.game = Game(track=self.track, teams=teams)
        while self.game.phase is Phase.START:
            self.game.play_action()
        assert self.game.phase is Phase.RACE
        self._play_others()
        LOGGER.debug(self.game)
        return self.observation

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, dict]:
        assert not self.game.finished
        assert self.game.phase is Phase.RACE
        assert self.game.active_teams == (self.team, )

        try:
            act = _to_action(action, self.team)
            assert act is not None
            self.game.take_action(self.team, act)
        except Exception as exp:
            LOGGER.debug("encountered exception: %r", exp, exc_info=True)
            LOGGER.debug(
                "action: %d / %s / %s, available actions: %s",
                action,
                FR_ACTIONS[action],
                act,
                self.game.available_actions(self.team),
            )
            return self.observation, -1, True, {}

        if self.game.finished:
            winner = self.game.winner
            assert winner is not None
            assert winner.team is not None
            teams = self.game.sorted_teams
            assert teams[0] == winner.team
            position = teams.index(self.team) + 1
            reward = len(self.game.teams) - position
            return self.observation, reward, True, {}

        self._play_others()

        assert not self.game.finished
        assert self.game.phase is Phase.RACE
        assert self.game.active_teams == (self.team, )

        return self.observation, 0, False, {}

    def render(self, mode="human", close=False):
        print(self.game)

    def close(self):
        del self.game

    def seed(self, seed=None):
        pass

    def configure(self, *args, **kwargs):
        pass

    @property
    def observation(self):
        """ game observation """
        available = frozenset(
            filter(
                None,
                map(FRAction.from_action,
                    self.game.available_actions(self.team))))
        return {
            "actions": np.array([a in available for a in FR_ACTIONS],
                                dtype=bool),
            "values": FRData.from_game(self.game, self.team).to_array(),
        }
class StringGameEnvV1(Env):
    def __init__(self, max_steps=MAX_STEP):
        np.random.seed(123)
        torch.manual_seed(123)
        self.max_steps = max_steps
        self.reward_map = defaultdict(float)
        self.terminal_probs = defaultdict(float)
        self._init_reward_and_terminal_probs()
        self.recent_actions = deque([], maxlen=MAX_STEP)
        self.action_space = Discrete(ACTION_DIM)
        self.observation_space = Box(low=0, high=1, shape=(STATE_DIM, ))
        self.step_cnt = 0
        self.reset()

    def _init_reward_and_terminal_probs(self):
        self.reward_map["AAA"] = 5.0
        self.reward_map["BA"] = 4.0
        self.terminal_probs["A"] = 0.5
        self.terminal_probs["B"] = 0.1

    def seed(self, seed=None):
        np.random.seed(seed)
        torch.manual_seed(seed)

    @staticmethod
    def random_action():
        return np.random.randint(0, ACTION_DIM)

    def get_reward(self):
        """
        The function you can write to customize rewards. In this specific
        environment, the reward only depends on the action history.
        """
        recent_characters = [CHARACTERS[c] for c in list(self.recent_actions)]
        string = "".join(recent_characters)
        if not self.done:
            reward = 0
        else:
            reward = self.reward_map[string]
        return reward, string

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False
        self.step_cnt += 1
        self.recent_actions.append(action)
        if self.step_cnt >= self.max_steps:
            self.done = True
        else:
            self.done = self.sample_terminal(action)
        reward, info = self.get_reward()
        ob = self.get_observation()
        return ob, reward, self.done, {"reward_str": info}

    def sample_terminal(self, action):
        terminal_probability = self.terminal_probs[CHARACTERS[action]]
        if np.random.rand() < terminal_probability:
            return True
        return False

    def get_observation(self):
        """
        The function you can write to customize transitions. In this specific
        environment, the next state is exactly the latest action taken. The
        initial observation is all zeros.
        """
        ob = np.zeros(STATE_DIM)
        if len(self.recent_actions) > 0:
            ob[self.recent_actions[-1]] = 1
        return ob

    def reset(self):
        self.done = False
        self.recent_actions = deque([], maxlen=MAX_STEP)
        self.step_cnt = 0
        ob = self.get_observation()
        return ob

    def print_internal_state(self):
        action_str = "".join([CHARACTERS[c] for c in self.recent_actions])
        logger.debug(
            f"Step {self.step_cnt}, recent actions {action_str}, "
            f"terminal={self.done}"
        )

    @staticmethod
    def print_ob(ob):
        return str(ob)

    @staticmethod
    def print_action(action):
        return CHARACTERS[action]
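# A hedged rollout sketch for StringGameEnvV1 above (assumes the module-level
# MAX_STEP/ACTION_DIM/CHARACTERS constants this env was defined with):
env = StringGameEnvV1()
ob = env.reset()
done = False
while not done:
    ob, reward, done, info = env.step(StringGameEnvV1.random_action())
print(info["reward_str"], reward)  # e.g. the action history "AAA" earns 5.0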
def __init__(self):
    self.observation_space = Tuple(
        [Discrete(5), Box(0, 5, shape=(3, ), dtype=np.float32)])
def action_space(self) -> Discrete:
    """The discrete action space produced by the action scheme."""
    return Discrete(len(self.actions))
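# A hedged sketch of how such a Discrete action space is typically consumed:
# the sampled integer indexes into the scheme's action list (`actions` below
# is a hypothetical stand-in for `self.actions`).
from gym.spaces import Discrete

actions = ["hold", "buy", "sell"]
space = Discrete(len(actions))
chosen = actions[space.sample()]  # decode the index into a concrete action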
class SawyerXYZEnv(SawyerMocapBase, metaclass=abc.ABCMeta):
    _HAND_SPACE = Box(np.array([-0.51, .38, -.05]),
                      np.array([+0.51, 1.0, .51]))

    def __init__(
            self,
            model_name,
            frame_skip=5,
            hand_low=(-0.2, 0.55, 0.05),
            hand_high=(0.2, 0.75, 0.3),
            mocap_low=None,
            mocap_high=None,
            action_scale=1. / 100,
            action_rot_scale=1.,
    ):
        super().__init__(model_name, frame_skip=frame_skip)
        self.random_init = True
        self.action_scale = action_scale
        self.action_rot_scale = action_rot_scale
        self.hand_low = np.array(hand_low)
        self.hand_high = np.array(hand_high)
        if mocap_low is None:
            mocap_low = hand_low
        if mocap_high is None:
            mocap_high = hand_high
        self.mocap_low = np.hstack(mocap_low)
        self.mocap_high = np.hstack(mocap_high)
        self.curr_path_length = 0
        self._freeze_rand_vec = True
        self._last_rand_vec = None

        # We use continuous goal space by default and
        # can discretize the goal space by calling
        # the `discretize_goal_space` method.
        self.discrete_goal_space = None
        self.discrete_goals = []
        self.active_discrete_goal = None

        self.action_space = Box(
            np.array([-1, -1, -1, -1]),
            np.array([+1, +1, +1, +1]),
        )

        self._pos_obj_max_len = 6
        self._pos_obj_possible_lens = (3, 6)

        self._set_task_called = False
        self._partially_observable = True

        self._state_goal = None  # OVERRIDE ME
        self._random_reset_space = None  # OVERRIDE ME

    def _set_task_inner(self):
        # Doesn't absorb "extra" kwargs, to ensure nothing's missed.
        pass

    def set_task(self, task):
        self._set_task_called = True
        data = pickle.loads(task.data)
        assert isinstance(self, data['env_cls'])
        del data['env_cls']
        self._last_rand_vec = data['rand_vec']
        self._freeze_rand_vec = True
        del data['rand_vec']
        self._partially_observable = data['partially_observable']
        del data['partially_observable']
        self._set_task_inner(**data)

    def set_xyz_action(self, action):
        action = np.clip(action, -1, 1)
        pos_delta = action * self.action_scale
        new_mocap_pos = self.data.mocap_pos + pos_delta[None]
        new_mocap_pos[0, :] = np.clip(
            new_mocap_pos[0, :],
            self.mocap_low,
            self.mocap_high,
        )
        self.data.set_mocap_pos('mocap', new_mocap_pos)
        self.data.set_mocap_quat('mocap', np.array([1, 0, 1, 0]))

    def discretize_goal_space(self, goals):
        assert False  # this code path is disabled
        assert len(goals) >= 1
        self.discrete_goals = goals
        # update the goal_space to a Discrete space
        self.discrete_goal_space = Discrete(len(self.discrete_goals))

    # Below are methods for using the new wrappers.
    # `sample_goals` is implemented across sawyer_xyz
    # as sampling from the task lists. This will be done
    # with the new `discrete_goals`. After all the algorithms
    # conform to this API (i.e. using the new wrapper), we can
    # just remove the underscore in all method signatures.
    def sample_goals_(self, batch_size):
        assert False
        if self.discrete_goal_space is not None:
            return [
                self.discrete_goal_space.sample() for _ in range(batch_size)
            ]
        else:
            return [self.goal_space.sample() for _ in range(batch_size)]

    def set_goal_(self, goal):
        assert False
        if self.discrete_goal_space is not None:
            self.active_discrete_goal = goal
            self.goal = self.discrete_goals[goal]
            self._state_goal_idx = np.zeros(len(self.discrete_goals))
            self._state_goal_idx[goal] = 1.
        else:
            self.goal = goal

    def _set_obj_xyz(self, pos):
        qpos = self.data.qpos.flat.copy()
        qvel = self.data.qvel.flat.copy()
        qpos[9:12] = pos.copy()
        qvel[9:15] = 0
        self.set_state(qpos, qvel)

    def get_site_pos(self, siteName):
        _id = self.model.site_names.index(siteName)
        return self.data.site_xpos[_id].copy()

    def _get_pos_objects(self):
        """Retrieves object position(s) from mujoco properties or instance
        vars

        Returns:
            np.ndarray: Flat array (usually 3 elements) representing the
                object(s)' position(s)
        """
        # Throw error rather than making this an @abc.abstractmethod so that
        # V1 environments don't have to implement it
        raise NotImplementedError

    def _get_pos_goal(self):
        """Retrieves goal position from mujoco properties or instance vars

        Returns:
            np.ndarray: Flat array (3 elements) representing the goal
                position
        """
        assert isinstance(self._state_goal, np.ndarray)
        assert self._state_goal.ndim == 1
        return self._state_goal

    def _get_obs(self):
        """Combines positions of the end effector, object(s) and goal into a
        single flat observation

        Returns:
            np.ndarray: The flat observation array (12 elements)
        """
        pos_hand = self.get_endeff_pos()

        pos_obj_padded = np.zeros(self._pos_obj_max_len)
        pos_obj = self._get_pos_objects()
        assert len(pos_obj) in self._pos_obj_possible_lens
        pos_obj_padded[:len(pos_obj)] = pos_obj

        pos_goal = self._get_pos_goal()
        if self._partially_observable:
            pos_goal = np.zeros_like(pos_goal)

        return np.hstack((pos_hand, pos_obj_padded, pos_goal))

    def _get_obs_dict(self):
        obs = self._get_obs()
        return dict(
            state_observation=obs,
            state_desired_goal=self._get_pos_goal(),
            state_achieved_goal=obs[3:-3],
        )

    @property
    def observation_space(self):
        obj_low = np.full(6, -np.inf)
        obj_high = np.full(6, +np.inf)
        return Box(
            np.hstack((self._HAND_SPACE.low, obj_low, self.goal_space.low)),
            np.hstack((self._HAND_SPACE.high, obj_high,
                       self.goal_space.high)))

    def reset(self):
        self.curr_path_length = 0
        return super().reset()

    def _get_state_rand_vec(self):
        if self._freeze_rand_vec:
            assert self._last_rand_vec is not None
            return self._last_rand_vec
        else:
            rand_vec = np.random.uniform(
                self._random_reset_space.low,
                self._random_reset_space.high,
                size=self._random_reset_space.low.size)
            self._last_rand_vec = rand_vec
            return rand_vec
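# For illustration, a hedged, self-contained sketch of what the (currently
# disabled) discrete-goal API above amounts to: a Discrete space indexing
# into a list of concrete goal vectors.
import numpy as np
from gym.spaces import Discrete

goals = [np.array([0., .6, .1]), np.array([.1, .7, .2])]  # hypothetical goals
discrete_goal_space = Discrete(len(goals))
goal = goals[discrete_goal_space.sample()]  # sample an index, look up the goal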
def __init__(self, obs_space, action_space, num_outputs, model_config, name):
    super(AutoregressiveActionsModel, self).__init__(
        obs_space, action_space, num_outputs, model_config, name)
    if action_space != Tuple([Discrete(2), Discrete(2)]):
        raise ValueError(
            "This model only supports the [2, 2] action space")

    # Inputs
    obs_input = tf.keras.layers.Input(
        shape=obs_space.shape, name="obs_input")
    a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input")
    ctx_input = tf.keras.layers.Input(
        shape=(num_outputs, ), name="ctx_input")

    # Output of the model (normally 'logits', but for an autoregressive
    # dist this is more like a context/feature layer encoding the obs)
    context = tf.keras.layers.Dense(
        num_outputs,
        name="hidden",
        activation=tf.nn.tanh,
        kernel_initializer=normc_initializer(1.0))(obs_input)

    # V(s)
    value_out = tf.keras.layers.Dense(
        1,
        name="value_out",
        activation=None,
        kernel_initializer=normc_initializer(0.01))(context)

    # P(a1 | obs)
    a1_logits = tf.keras.layers.Dense(
        2,
        name="a1_logits",
        activation=None,
        kernel_initializer=normc_initializer(0.01))(ctx_input)

    # P(a2 | a1)
    # --note: typically you'd want to implement P(a2 | a1, obs) as follows:
    # a2_context = tf.keras.layers.Concatenate(axis=1)(
    #     [ctx_input, a1_input])
    a2_context = a1_input
    a2_hidden = tf.keras.layers.Dense(
        16,
        name="a2_hidden",
        activation=tf.nn.tanh,
        kernel_initializer=normc_initializer(1.0))(a2_context)
    a2_logits = tf.keras.layers.Dense(
        2,
        name="a2_logits",
        activation=None,
        kernel_initializer=normc_initializer(0.01))(a2_hidden)

    # Base layers
    self.base_model = tf.keras.Model(obs_input, [context, value_out])
    self.register_variables(self.base_model.variables)
    self.base_model.summary()

    # Autoregressive action sampler
    self.action_model = tf.keras.Model([ctx_input, a1_input],
                                       [a1_logits, a2_logits])
    self.action_model.summary()
    self.register_variables(self.action_model.variables)
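# A hedged numpy sketch of how the two heads above are used at sampling time:
# draw a1 from P(a1 | obs), then condition the second head on a1 to draw a2.
# The logit arrays below are hypothetical stand-ins for the model outputs.
import numpy as np

def softmax(x):
    z = np.exp(x - x.max())
    return z / z.sum()

a1_logits = np.array([0.3, -0.1])                     # from the a1 head
a1 = np.random.choice(2, p=softmax(a1_logits))
a2_logits = np.array([[0.2, -0.3], [-0.5, 0.8]])[a1]  # a2 head, conditioned on a1
a2 = np.random.choice(2, p=softmax(a2_logits))
action = (a1, a2)  # a sample from Tuple([Discrete(2), Discrete(2)])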
import unittest
import traceback

import gym
from gym.spaces import Box, Discrete, Tuple
from gym.envs.registration import EnvSpec

import ray
from ray.rllib.agent import get_agent_class
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.tune.registry import register_env

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, )),
    "simple_tuple": Tuple([Box(0.0, 1.0, (5, )), Box(0.0, 1.0, (5, ))]),
    "implicit_tuple": [Box(0.0, 1.0, (5, )), Box(0.0, 1.0, (5, ))],
}

OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(0.0, 1.0, (5, )),
    "image": Box(0.0, 1.0, (80, 80, 1)),
    "atari": Box(0.0, 1.0, (210, 160, 3)),
    "atari_ram": Box(0.0, 1.0, (128, )),
    "simple_tuple": Tuple([Box(0.0, 1.0, (5, )), Box(0.0, 1.0, (5, ))]),
    "mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5, ))]),
}
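# A hedged sketch of how such space tables are typically consumed in tests:
# sample from every space and confirm the sample is contained in it.
for name, space in ACTION_SPACES_TO_TEST.items():
    if isinstance(space, list):  # "implicit_tuple" is a plain list of spaces
        space = Tuple(space)
    assert space.contains(space.sample()), name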
def __init__(self, _):
    self.observation_space = Discrete(2)
    self.action_space = Tuple([Discrete(2), Discrete(2)])
parser = argparse.ArgumentParser()
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.",
)

if __name__ == "__main__":
    args = parser.parse_args()

    # Test API wrapper for dueling Q-head.

    obs_space = Box(-1.0, 1.0, (3, ))
    action_space = Discrete(3)

    # Run in eager mode for value checking and debugging.
    tf1.enable_eager_execution()

    # __sphinx_doc_model_construct_1_begin__
    my_dueling_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=MODEL_DEFAULTS,
        framework=args.framework,
        # Providing the `model_interface` arg will make the factory
        # wrap the chosen default model with our new model API class
        # (DuelingQModel). This way, both `forward` and `get_q_values`
        # are available in the returned class.
import numpy as np
from gym import Env
from gym.spaces import Discrete

# Module-local helpers (config, Coord, Grid, Action, Obs, Moves, Rock,
# RockState, RockGui, _encode_dict) are assumed to be defined alongside
# this class.


class RockEnv(Env):
    metadata = {"render.modes": ["human", "ansi"]}

    def __init__(self, board_size=7, num_rocks=8, use_heuristic=False):
        assert board_size in list(config.keys()) and \
            num_rocks in config[board_size]['size']

        self.num_rocks = num_rocks
        self._use_heuristic = use_heuristic

        self._rock_pos = [
            Coord(*rock) for rock in config[board_size]['rock_pos']
        ]
        self._agent_pos = Coord(*config[board_size]['init_pos'])
        self.grid = Grid(board_size, board_size)

        for idx, rock in enumerate(self._rock_pos):
            self.grid.board[rock] = idx

        self.action_space = Discrete(len(Action) + self.num_rocks)
        self.observation_space = Discrete(len(Obs))
        self._discount = .95
        self._reward_range = 20
        self._penalization = -100
        self._query = 0

    def seed(self, seed=None):
        np.random.seed(seed)

    def step(self, action):
        assert self.action_space.contains(action)
        assert self.done is False
        self.last_action = action
        self._query += 1

        reward = 0
        ob = Obs.NULL.value

        if action < Action.SAMPLE.value:
            if action == Action.EAST.value:
                if self.state.agent_pos.x + 1 < self.grid.x_size:
                    self.state.agent_pos += Moves.EAST.value
                else:
                    # exiting the grid to the east ends the episode with a bonus
                    reward = 10
                    self.done = True
                    return ob, reward, self.done, {
                        "state": self._encode_state(self.state)
                    }
            elif action == Action.NORTH.value:
                if self.state.agent_pos.y + 1 < self.grid.y_size:
                    self.state.agent_pos += Moves.NORTH.value
                else:
                    reward = self._penalization
            elif action == Action.SOUTH.value:
                if self.state.agent_pos.y - 1 >= 0:
                    self.state.agent_pos += Moves.SOUTH.value
                else:
                    reward = self._penalization
            elif action == Action.WEST.value:
                if self.state.agent_pos.x - 1 >= 0:
                    self.state.agent_pos += Moves.WEST.value
                else:
                    reward = self._penalization
            else:
                raise NotImplementedError()

        if action == Action.SAMPLE.value:
            rock = self.grid[self.state.agent_pos]
            if rock >= 0 and not self.state.rocks[rock].status == 0:  # not collected yet
                if self.state.rocks[rock].status == 1:
                    reward = 10
                else:
                    reward = -10
                self.state.rocks[rock].status = 0
            else:
                reward = self._penalization

        if action > Action.SAMPLE.value:
            rock = action - Action.SAMPLE.value - 1
            assert rock < self.num_rocks
            ob = self._sample_ob(self.state.agent_pos, self.state.rocks[rock])
            self.state.rocks[rock].measured += 1

            eff = self._efficiency(self.state.agent_pos,
                                   self.state.rocks[rock].pos)

            if ob == Obs.GOOD.value:
                self.state.rocks[rock].count += 1
                self.state.rocks[rock].lkv *= eff
                self.state.rocks[rock].lkw *= (1 - eff)
            else:
                self.state.rocks[rock].count -= 1
                self.state.rocks[rock].lkw *= eff
                self.state.rocks[rock].lkv *= (1 - eff)

            denom = (.5 * self.state.rocks[rock].lkv) + (
                .5 * self.state.rocks[rock].lkw)
            self.state.rocks[rock].prob_valuable = (
                .5 * self.state.rocks[rock].lkv) / denom

        self.done = self._penalization == reward
        return ob, reward, self.done, {"state": self._encode_state(self.state)}

    def _decode_state(self, state, as_array=False):
        agent_pos = Coord(*state['agent_pos'])
        rock_state = RockState(agent_pos)
        for r in state['rocks']:
            rock = Rock(pos=0)
            rock.__dict__.update(r)
            rock_state.rocks.append(rock)

        if as_array:
            rocks = []
            for rock in rock_state.rocks:
                rocks.append(rock.status)
            return np.concatenate([[self.grid.get_index(agent_pos)], rocks])

        return rock_state

    def _encode_state(self, state):
        # use dictionary for state encoding;
        # rocks can take 3 values: -1, 1, or 0 if collected
        return _encode_dict(state)

    def render(self, mode='human', close=False):
        if close:
            return
        if mode == "human":
            if not hasattr(self, "gui"):
                start_pos = self.grid.get_index(self.state.agent_pos)
                obj_pos = [(self.grid.get_index(rock.pos), rock.status)
                           for rock in self.state.rocks]
                self.gui = RockGui((self.grid.x_size, self.grid.y_size),
                                   start_pos=start_pos, obj=obj_pos)

            if self.last_action > Action.SAMPLE.value:
                rock = self.last_action - Action.SAMPLE.value - 1
                print("Rock S: {} P:{}".format(self.state.rocks[rock].status,
                                               self.state.rocks[rock].pos))
            # msg = "Action : " + action_to_str(self.last_action) + " Step: " + str(self.t) + " Rw: " + str(self.total_rw)
            agent_pos = self.grid.get_index(self.state.agent_pos)
            self.gui.render(agent_pos)

    def reset(self):
        self.done = False
        self._query = 0
        self.last_action = Action.SAMPLE.value
        self.state = self._get_init_state(should_encode=False)
        return Obs.NULL.value

    def _set_state(self, state):
        self.done = False
        self.state = self._decode_state(state)

    def close(self):
        self.render(close=True)

    def _compute_prob(self, action, next_state, ob):
        next_state = self._decode_state(next_state)
        if action <= Action.SAMPLE.value:
            return int(ob == Obs.NULL.value)

        eff = self._efficiency(
            next_state.agent_pos,
            next_state.rocks[action - Action.SAMPLE.value - 1].pos)

        if ob == Obs.GOOD.value and \
                next_state.rocks[action - Action.SAMPLE.value - 1].status == 1:
            return eff
        elif ob == Obs.BAD.value and \
                next_state.rocks[action - Action.SAMPLE.value - 1].status == -1:
            return eff
        else:
            return 1 - eff

    def _get_init_state(self, should_encode=True):
        rock_state = RockState(self._agent_pos)
        for idx in range(self.num_rocks):
            rock_state.rocks.append(Rock(self._rock_pos[idx]))
        return self._encode_state(rock_state) if should_encode else rock_state

    def _generate_legal(self):
        legal = [Action.EAST.value]  # can always go east

        if self.state.agent_pos.y + 1 < self.grid.y_size:
            legal.append(Action.NORTH.value)
        if self.state.agent_pos.y - 1 >= 0:
            legal.append(Action.SOUTH.value)
        if self.state.agent_pos.x - 1 >= 0:
            legal.append(Action.WEST.value)

        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0:
            legal.append(Action.SAMPLE.value)

        for rock in self.state.rocks:
            assert self.grid[rock.pos] != -1
            if rock.status != 0:
                legal.append(self.grid[rock.pos] + 1 + Action.SAMPLE.value)
        return legal

    def _generate_preferred(self, history):
        if not self._use_heuristic:
            return self._generate_legal()

        actions = []

        # sample rocks with a high likelihood of being good
        rock = self.grid[self.state.agent_pos]
        if rock >= 0 and self.state.rocks[rock].status != 0 and history.size:
            total = 0
            for transition in history:
                if transition.action == rock + 1 + Action.SAMPLE.value:
                    if transition.next_observation == Obs.GOOD.value:
                        total += 1
                    elif transition.next_observation == Obs.BAD.value:
                        total -= 1
            if total > 0:
                actions.append(Action.SAMPLE.value)
                return actions

        # process the rocks
        all_bad = True
        direction = {
            "north": False,
            "south": False,
            "west": False,
            "east": False
        }
        for idx in range(self.num_rocks):
            rock = self.state.rocks[idx]
            if rock.status != 0:
                total = 0
                for transition in history:
                    if transition.action == idx + 1 + Action.SAMPLE.value:
                        if transition.next_observation == Obs.GOOD.value:
                            total += 1
                        elif transition.next_observation == Obs.BAD.value:
                            total -= 1
                if total >= 0:
                    all_bad = False
                    if rock.pos.y > self.state.agent_pos.y:
                        direction['north'] = True
                    elif rock.pos.y < self.state.agent_pos.y:
                        direction['south'] = True
                    elif rock.pos.x < self.state.agent_pos.x:
                        direction['west'] = True
                    elif rock.pos.x > self.state.agent_pos.x:
                        direction['east'] = True

        if all_bad:
            actions.append(Action.EAST.value)
            return actions

        # generate a random legal move with these constraints:
        # - do not measure a collected rock
        # - do not measure a rock too often
        # - do not measure clearly bad rocks
        # - do not move in a direction that puts you closer to bad rocks
        if self.state.agent_pos.y + 1 < self.grid.y_size and \
                direction['north']:
            actions.append(Action.NORTH.value)

        if direction['east']:
            actions.append(Action.EAST.value)

        if self.state.agent_pos.y - 1 >= 0 and direction['south']:
            actions.append(Action.SOUTH.value)

        if self.state.agent_pos.x - 1 >= 0 and direction['west']:
            actions.append(Action.WEST.value)

        for idx, rock in enumerate(self.state.rocks):
            if not rock.status == 0 and rock.measured < 5 and \
                    abs(rock.count) < 2 and 0 < rock.prob_valuable < 1:
                actions.append(idx + 1 + Action.SAMPLE.value)

        if len(actions) == 0:
            return self._generate_legal()

        return actions

    def __dict2np__(self, state):
        idx = self.grid.get_index(Coord(*state['agent_pos']))
        rocks = []
        for rock in state['rocks']:
            rocks.append(rock['status'])
        return np.concatenate([[idx], rocks])

    @staticmethod
    def _efficiency(agent_pos, rock_pos, hed=20):
        d = Grid.euclidean_distance(agent_pos, rock_pos)
        eff = (1 + pow(2, -d / hed)) * .5
        return eff

    @staticmethod
    def _select_target(rock_state, x_size):
        best_dist = x_size * 2
        best_rock = -1  # Coord(-1, -1)
        for idx, rock in enumerate(rock_state.rocks):
            if rock.status != 0 and rock.count >= 0:
                d = Grid.manhattan_distance(rock_state.agent_pos, rock.pos)
                if d < best_dist:
                    best_dist = d
                    best_rock = idx  # rock.pos
        return best_rock

    @staticmethod
    def _sample_ob(agent_pos, rock, hed=20):
        eff = RockEnv._efficiency(agent_pos, rock.pos, hed=hed)
        if np.random.binomial(1, eff):
            return Obs.GOOD.value if rock.status == 1 else Obs.BAD.value
        else:
            return Obs.BAD.value if rock.status == 1 else Obs.GOOD.value
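# Usage sketch (an assumption, not part of the original module): a random
# rollout that samples uniformly from the legal moves RockEnv exposes. The
# episode ends when the agent exits the grid to the east.
if __name__ == "__main__":
    env = RockEnv(board_size=7, num_rocks=8)
    ob = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = int(np.random.choice(env._generate_legal()))
        ob, reward, done, info = env.step(action)
        total_reward += reward
    print("undiscounted return:", total_reward)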
from collections import OrderedDict

import numpy as np
import pytest

from gym.spaces import (Box, Dict, Discrete, MultiBinary, MultiDiscrete,
                        Tuple, utils)

spaces = [
    Discrete(3),
    Box(low=0.0, high=np.inf, shape=(2, 2)),
    Box(low=0.0, high=np.inf, shape=(2, 2), dtype=np.float16),
    Tuple([Discrete(5), Discrete(10)]),
    Tuple([
        Discrete(5),
        Box(low=np.array([0.0, 0.0]),
            high=np.array([1.0, 5.0]),
            dtype=np.float64),
    ]),
    Tuple((Discrete(5), Discrete(2), Discrete(2))),
    MultiDiscrete([2, 2, 10]),
    MultiBinary(10),
    Dict({
        "position": Discrete(5),
        "velocity": Box(low=np.array([0.0, 0.0]),
                        high=np.array([1.0, 5.0]),
                        dtype=np.float64),
    }),
    Discrete(3, start=2),
    Discrete(8, start=-5),
]
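# A hedged sketch in the spirit of this test module: every space above
# should survive flattening, and the flattened vector length should equal
# utils.flatdim(space). (Illustrative only; the test name is not from the
# original file.)
@pytest.mark.parametrize("space", spaces)
def test_flatdim_matches_flatten(space):
    sample = space.sample()
    flat = utils.flatten(space, sample)
    assert flat.shape == (utils.flatdim(space),)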
def __init__(self, config):
    self.end_pos = config["corridor_length"]
    self.cur_pos = 0
    self.action_space = Discrete(2)
    self.observation_space = Box(
        0.0, self.end_pos, shape=(1, ), dtype=np.float32)
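# Hedged sketch of the step() this corridor __init__ implies (modeled on
# RLlib's SimpleCorridor example; the reward values are assumptions):
# action 1 walks toward end_pos, action 0 walks back, and the episode
# ends when the agent reaches end_pos.
def step(self, action):
    assert action in (0, 1)
    if action == 0 and self.cur_pos > 0:
        self.cur_pos -= 1
    elif action == 1:
        self.cur_pos += 1
    done = self.cur_pos >= self.end_pos
    reward = 1.0 if done else -0.1
    return np.array([self.cur_pos], dtype=np.float32), reward, done, {}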
def __init__(self, config: Config) -> None:
    spaces = {
        get_default_config().GOAL_SENSOR_UUID: Box(
            low=np.finfo(np.float32).min,
            high=np.finfo(np.float32).max,
            shape=(2,),
            dtype=np.float32,
        )
    }

    if config.INPUT_TYPE in ["depth", "rgbd"]:
        spaces["depth"] = Box(
            low=0,
            high=1,
            shape=(config.RESOLUTION, config.RESOLUTION, 1),
            dtype=np.float32,
        )

    if config.INPUT_TYPE in ["rgb", "rgbd"]:
        spaces["rgb"] = Box(
            low=0,
            high=255,
            shape=(config.RESOLUTION, config.RESOLUTION, 3),
            dtype=np.uint8,
        )

    observation_spaces = SpaceDict(spaces)
    action_spaces = Discrete(4)

    self.device = (
        torch.device("cuda:{}".format(config.PTH_GPU_ID))
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
    self.hidden_size = config.HIDDEN_SIZE

    random.seed(config.RANDOM_SEED)
    torch.random.manual_seed(config.RANDOM_SEED)
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True  # type: ignore

    self.actor_critic = PointNavBaselinePolicy(
        observation_space=observation_spaces,
        action_space=action_spaces,
        hidden_size=self.hidden_size,
    )
    self.actor_critic.to(self.device)

    if config.MODEL_PATH:
        ckpt = torch.load(config.MODEL_PATH, map_location=self.device)
        # Filter only actor_critic weights
        self.actor_critic.load_state_dict(
            {
                k[len("actor_critic."):]: v
                for k, v in ckpt["state_dict"].items()
                if "actor_critic" in k
            }
        )
    else:
        habitat.logger.error(
            "Model checkpoint wasn't loaded, evaluating a random model."
        )

    self.test_recurrent_hidden_states: Optional[torch.Tensor] = None
    self.not_done_masks: Optional[torch.Tensor] = None
    self.prev_actions: Optional[torch.Tensor] = None
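# Hedged sketch of the reset() these Optional buffers imply (mirrors the
# habitat-baselines PPO agent; net.num_recurrent_layers is assumed to be
# exposed by the policy's network):
def reset(self) -> None:
    self.test_recurrent_hidden_states = torch.zeros(
        self.actor_critic.net.num_recurrent_layers,
        1,
        self.hidden_size,
        device=self.device,
    )
    self.not_done_masks = torch.zeros(1, 1, device=self.device)
    self.prev_actions = torch.zeros(1, 1, dtype=torch.long, device=self.device)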
def __init__(self, seed, game_config, render=False, use_depth=False,
             use_rgb=True, reward_scale=1, frame_skip=4, jitter_rgb=False,
             noise_var=0.2, drop_input_prob=0.0, rotate_sensor=False,
             rotate_range=30, drop_input_freq=3, flicker_freq=1):
    # assign observation space
    self.use_rgb = use_rgb
    self.use_depth = use_depth

    channel_num = 0
    if use_depth:
        channel_num = channel_num + 1
    if use_rgb:
        channel_num = channel_num + 3

    self.observation_shape = (channel_num, 84, 84)
    self.observation_space = Box(low=0, high=255,
                                 shape=self.observation_shape)

    self.reward_scale = reward_scale
    self.jitter_rgb = jitter_rgb
    self.noise_var = noise_var
    self.drop_input_prob = drop_input_prob
    self.drop_input_freq = drop_input_freq
    self.flicker_freq = flicker_freq
    self.prepare_drop_input()
    self.rotate_sensor = rotate_sensor
    self.rotate_range = rotate_range

    game = vzd.DoomGame()
    game.load_config(game_config)

    # game input setup
    game.set_screen_resolution(vzd.ScreenResolution.RES_160X120)
    game.set_screen_format(vzd.ScreenFormat.CRCGCB)
    if use_depth:
        game.set_depth_buffer_enabled(True)

    # Adds buttons that will be allowed: action i presses only button i
    num_buttons = game.get_available_buttons_size()
    self.action_space = Discrete(num_buttons)
    actions = [([False] * num_buttons) for i in range(num_buttons)]
    for i in range(num_buttons):
        actions[i][i] = True
    self.actions = actions

    # set frame skip for taking action
    self.frame_skip = frame_skip

    game.set_seed(seed)
    random.seed(seed)
    game.set_window_visible(render)
    game.init()
    self.game = game
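# Hedged sketch of how the one-hot button lists built above would be used
# when stepping the game (step_sketch is an illustrative name, not part of
# the original class): DoomGame.make_action takes a button list and a tic
# count and returns the accumulated reward for those tics.
def step_sketch(self, action):
    reward = self.game.make_action(self.actions[action], self.frame_skip)
    done = self.game.is_episode_finished()
    return reward * self.reward_scale, done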
def test_action_space(self):
    """Test action spaces."""
    assert self.env.action_space == Discrete(2)
def test_observation_space(self):
    """Test observation spaces."""
    expected_size = len(WEATHER) * len(CAR_CONDITION) * len(ROAD_STATE)
    assert self.env.observation_space == Discrete(expected_size)
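# Hedged sketch of the mixed-radix encoding the size assertion above
# implies (encode_obs is illustrative, not part of the tested env): three
# categorical features packed into one Discrete index, which is why the
# space size is the product of the three lengths.
def encode_obs(weather_idx, car_idx, road_idx):
    return (weather_idx * len(CAR_CONDITION) + car_idx) * len(ROAD_STATE) \
        + road_idx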
if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model", TorchCentralizedCriticModel
        if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
            },
            "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
        },
        "model": {
            "custom_model": "cc_model",
        },
        "framework": "torch" if args.torch else "tf",
    }

    stop = {