def make_obs_space(embed_dim=768,
                   max_steps=None,
                   max_utterances=5,
                   max_command_length=5,
                   max_variables=10,
                   max_actions=10,
                   **kwargs):
    true_obs = {
        'dialog_history': Repeated(
            Dict({
                'sender': Discrete(3),
                'utterance': Box(-10, 10, shape=(embed_dim, ))
            }),
            max_len=max_utterances),
        'partial_command': Repeated(
            Box(-10, 10, shape=(embed_dim, )), max_len=max_command_length),
        'variables': Repeated(
            Box(-10, 10, shape=(embed_dim, )), max_len=max_variables),
    }
    if max_steps:
        true_obs['steps'] = Discrete(max_steps)
    # return Dict(true_obs)  # for calculating true_obs_shape
    return Dict({
        "true_obs": Dict(true_obs),
        '_action_mask': MultiDiscrete([2 for _ in range(max_actions)]),
        '_action_embeds': Box(-10, 10, shape=(max_actions, embed_dim)),
    })
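A quick illustrative sketch (added here, not part of the original snippet) of the space this factory returns; the argument values are arbitrary and chosen small for readability:

# Illustration only: tiny embed_dim / max_actions.
obs_space = make_obs_space(embed_dim=8, max_steps=20, max_actions=4)
sample = obs_space.sample()
# The sample combines the embedded "true" observation with an action mask
# and per-action embeddings, as used by parametric-action models.
assert set(sample) == {"true_obs", "_action_mask", "_action_embeds"}
assert sample["_action_embeds"].shape == (4, 8)
# dialog_history is a variable-length list of {'sender', 'utterance'} dicts.
assert len(sample["true_obs"]["dialog_history"]) <= 5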
def test_repeated(self):
    space = Repeated(
        gym.spaces.Box(low=-1, high=1, shape=(1, 200)), max_len=8)
    d = gym_space_to_dict(space)
    sp = gym_space_from_dict(d)
    self.assertTrue(isinstance(sp.child_space, gym.spaces.Box))
    self.assertEqual(space.max_len, sp.max_len)
    self.assertEqual(space.dtype, sp.dtype)
class SimpleRPG(gym.Env):
    """Example of a custom env with a complex, structured observation.

    The observation is a list of players, each of which is a Dict of
    attributes, and may further hold a list of items (categorical space).

    Note that the env doesn't train, it's just a dummy example to show how to
    use spaces.Repeated in a custom model (see CustomRPGModel below).
    """

    def __init__(self, config):
        self.cur_pos = 0
        self.action_space = Discrete(4)

        # Represents an item.
        self.item_space = Discrete(5)

        # Represents an effect on the player.
        self.effect_space = Box(9000, 9999, shape=(4, ))

        # Represents a player.
        self.player_space = Dict({
            "location": Box(-100, 100, shape=(2, )),
            "status": Box(-1, 1, shape=(10, )),
            "items": Repeated(self.item_space, max_len=MAX_ITEMS),
            "effects": Repeated(self.effect_space, max_len=MAX_EFFECTS),
        })

        # Observation is a list of players.
        self.observation_space = Repeated(
            self.player_space, max_len=MAX_PLAYERS)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        return self.observation_space.sample(), 1, True, {}
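For orientation, here is a minimal usage sketch (an added illustration, not from the original example) showing what a Repeated observation looks like at runtime; it assumes MAX_PLAYERS, MAX_ITEMS, and MAX_EFFECTS are defined as module-level constants (e.g. 4, 7, and 2, as in the test fixtures later in this collection):

# Minimal sketch; assumes MAX_PLAYERS = 4, MAX_ITEMS = 7, MAX_EFFECTS = 2.
env = SimpleRPG({})
obs = env.reset()
# A sample from a Repeated space is a plain Python list of child-space
# samples, with length at most max_len.
assert isinstance(obs, list) and len(obs) <= MAX_PLAYERS
for player in obs:
    assert len(player["items"]) <= MAX_ITEMS
    assert len(player["effects"]) <= MAX_EFFECTS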
def __init__(self, environment):
    self.environment = environment
    self.machine_state = MultiDiscrete([2] * self.environment.schedule_length)
    self.job_state = Repeated(
        Box(low=-self.environment.max_steps_per_iterations,
            high=self.environment.max_steps_per_iterations,
            shape=(3, ),
            dtype=int),  # np.int is a removed alias for the builtin int
        max_len=self.environment.max_job_slots)
    self.observation_space = Dict({
        'machine_state': self.machine_state,
        'job_state': self.job_state
    })
def _repeated(d: Dict) -> Repeated:
    child_space = gym_space_from_dict(d["child_space"])
    return Repeated(child_space=child_space, max_len=d["max_len"])
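A small round-trip sketch (added for illustration, assuming gym_space_to_dict is the matching serializer used in the test above and writes the "child_space" / "max_len" keys that _repeated reads):

# Hypothetical round-trip of a Repeated space through the serializers.
space = Repeated(gym.spaces.Discrete(5), max_len=3)
d = gym_space_to_dict(space)
restored = _repeated(d)  # -> Repeated(Discrete(5), max_len=3)
assert isinstance(restored.child_space, gym.spaces.Discrete)
assert restored.max_len == space.max_len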
        spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
    spaces.Discrete(5),
])
TUPLE_SAMPLES = [TUPLE_SPACE.sample() for _ in range(10)]

# Constraints on the Repeated space.
MAX_PLAYERS = 4
MAX_ITEMS = 7
MAX_EFFECTS = 2

ITEM_SPACE = spaces.Box(-5, 5, shape=(1, ))
EFFECT_SPACE = spaces.Box(9000, 9999, shape=(4, ))
PLAYER_SPACE = spaces.Dict({
    "location": spaces.Box(-100, 100, shape=(2, )),
    "items": Repeated(ITEM_SPACE, max_len=MAX_ITEMS),
    "effects": Repeated(EFFECT_SPACE, max_len=MAX_EFFECTS),
    "status": spaces.Box(-1, 1, shape=(10, )),
})
REPEATED_SPACE = Repeated(PLAYER_SPACE, max_len=MAX_PLAYERS)
REPEATED_SAMPLES = [REPEATED_SPACE.sample() for _ in range(10)]


def one_hot(i, n):
    out = [0.0] * n
    out[i] = 1.0
    return out
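For reference (an added illustration, not part of the original fixtures), one_hot builds a plain length-n list of floats with a single 1.0 at index i:

assert one_hot(2, 5) == [0.0, 0.0, 1.0, 0.0, 0.0]
assert one_hot(0, 3) == [1.0, 0.0, 0.0]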
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')
    max_num_actions = 8
    destination = ["", "starbucks", "peets"]

    # Action spaces
    passenger_actions = [
        "wait for driver", "say starbucks", "say peets", "mental starbucks",
        "mental peets"
    ]
    # passenger_actions = ["wait for driver", "say starbucks", "say peets"]
    passenger_action_space = Discrete(len(passenger_actions))
    driver_actions = ["wait for passenger", "drive starbucks", "drive peets"]
    driver_action_space = Discrete(len(driver_actions))

    # observation spaces
    # wait, say starbucks, say peets -- can be repeated at most 4 times +
    # mental state (none, starbucks, peets)
    passenger_observation_space = Dict({
        'dialog_history': Repeated(Discrete(3), max_len=max_num_actions),
        'destination': Discrete(3)
    })
    # wait, say starbucks, say peets -- can be repeated at most 4 times
    driver_observation_space = Dict(
        {'dialog_history': Repeated(Discrete(3), max_len=max_num_actions)})

    def __init__(self, config):
        destination_id = random.randint(1, 2)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.num_episodes = 0
        # self.is_supervised = config['is_supervised']

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))
        if self.num_episodes >= 10000:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1
        destination_id = random.randint(1, 2)
        if self.num_episodes >= 10000:
            logger.warning('set destination: ' +
                           NanoworldEnv.destination[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.obs = {
            'driver': self.state.make_driver_observation(),
            'passenger': self.state.make_passenger_observation()
        }
        return self.obs

    def driver_step(self, action):
        self.state.update_state(NanoworldEnv.driver_actions[action])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        self.state.update_state(NanoworldEnv.passenger_actions[action])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_passenger_reward(self):
        # if self.is_supervised:
        #     return self.compute_episode_reward_supervised()
        # else:
        return self.compute_episode_reward()

    def compute_driver_reward(self):
        # return self.compute_episode_reward()
        driver_reward = 0
        desired_destination, verbal_history, all_actions, driven_destination = \
            self.state.get_global_state()
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                if len(verbal_history) == 0:
                    # driver drives before user says anything
                    return -1
                else:
                    last_uttered_destination = verbal_history[-1].split(" ")[1]
                    if driven_destination == last_uttered_destination:
                        return 1
                    else:
                        return -1
            else:  # timeout
                return -10
        else:
            return 0

    def compute_episode_reward(self):
        desired_destination, verbal_history, all_actions, driven_destination = \
            self.state.get_global_state()
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                if desired_destination == driven_destination:
                    return 1
                else:
                    return -1
            else:  # timeout
                return -10
        else:
            return 0

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        # Initialize to None so the obs dict below doesn't raise a NameError
        # when only one agent acts in this step.
        driver_obs = None
        passenger_obs = None
        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
            if self.state.is_done():
                driver_reward = self.compute_driver_reward()
                return {'driver': driver_obs,
                        'passenger': self.state.make_passenger_observation()}, \
                    {'driver': driver_reward,
                     'passenger': self.compute_passenger_reward()}, \
                    {'__all__': self.state.is_done()}, {}
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])
        self.obs = {'driver': driver_obs, 'passenger': passenger_obs}
        self.rewards = {
            'driver': self.compute_driver_reward(),
            'passenger': self.compute_passenger_reward()
        }
        self.dones = {'__all__': self.state.is_done()}
        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')
    max_num_actions = 10
    # dummy destination '.' to be paired with 'OVER', 'YES', 'NO'
    parameters = [".", "yes", "no", "starbucks", "peets"]
    # parameters = [
    #     ".",
    #     "yes",
    #     "no",
    #     "starbucks",
    #     "peets",
    #     "ralphs",
    #     "traderjoes",
    #     "wholefoods",
    #     "walmart",
    #     "cvs",
    #     "toysrus",
    #     "applestore",
    #     "bestbuy",
    # ]

    # Action spaces
    passenger_actions = ["SAY", "OVER"]
    passenger_action_space = Tuple(
        [Discrete(len(passenger_actions)), Discrete(len(parameters))])
    driver_actions = ["CONFIRM", "DRIVE"]
    driver_action_space = Tuple(
        [Discrete(len(driver_actions)), Discrete(len(parameters))])

    # observation spaces
    # Repeated takes a single child space, so each dialog-history entry is a
    # (speaker, (action, parameter)) tuple.
    passenger_observation_space = Dict({
        'dialog_history': Repeated(
            Tuple([
                Discrete(len(agents)),
                Tuple([
                    Discrete(len(passenger_actions)),
                    Discrete(len(parameters))
                ])
            ]),
            max_len=max_num_actions),
        'destination': Discrete(len(parameters))
    })
    driver_observation_space = Dict({
        'dialog_history': Repeated(
            Tuple([
                Discrete(len(agents)),
                Tuple([
                    Discrete(len(passenger_actions)),
                    Discrete(len(parameters))
                ])
            ]),
            max_len=max_num_actions)
    })

    perfect_dialogs = [
        # ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
        #                ('DRIVE', 'starbucks')]),
        # ("peets", [('SAY', 'peets'), ('OVER', '.'), ('DRIVE', 'peets')]),
        ("starbucks", [('SAY', 'starbucks'), ('OVER', '.'),
                       ('CONFIRM', 'starbucks'), ('DRIVE', 'starbucks')]),
        ("peets", [('SAY', 'peets'), ('OVER', '.'), ('CONFIRM', 'peets'),
                   ('DRIVE', 'peets')]),
        # ("starbucks", [('SAY', 'starbucks'),
        #                ('OVER', '.'),
        #                ('CONFIRM', 'starbucks'),
        #                ('OVER', '.'),
        #                ('YES', '.'),
        #                ('OVER', '.'),
        #                ('DRIVE', 'starbucks')]),
        #
        # ("peets", [('SAY', 'peets'),
        #            ('OVER', '.'),
        #            ('CONFIRM', 'peets'),
        #            ('OVER', '.'),
        #            ('YES', '.'),
        #            ('OVER', '.'),
        #            ('DRIVE', 'peets')]),
    ]

    def __init__(self, config):
        self.is_supervised = False
        destination_id = random.randint(3, len(NanoworldEnv.parameters) - 1)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.num_episodes = 0
        self.supervised_episodes = 10000
        self.rewards = dict()
        self.print_episodes = 10000

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))
        if self.num_episodes >= self.print_episodes:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1
        # select the destination
        if self.is_supervised and self.num_episodes < self.supervised_episodes:
            a_list = [3, 4]
            distribution = [.5, .5]
            destination_id = random.choices(a_list, distribution)[0]
        else:
            destination_id = random.randint(3,
                                            len(NanoworldEnv.parameters) - 1)
        if self.num_episodes >= self.print_episodes:
            logger.warning('set destination: ' +
                           NanoworldEnv.parameters[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.parameters[destination_id])
        self.obs = {'passenger': self.state.make_passenger_observation()}
        return self.obs

    def driver_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.driver_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        a1, a2 = action
        self.state.update_state(NanoworldEnv.passenger_actions[a1],
                                NanoworldEnv.parameters[a2])
        obs = self.state.make_passenger_observation()
        return obs
    def compute_driver_reward(self):
        driver_reward = 0
        _, verbal_history, _, driven_destination = \
            self.state.get_global_state()
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:
                if driven_destination:
                    # completion through a final drive action
                    if len(verbal_history) == 0:
                        # driver drives before user says anything
                        driver_reward += -1
                    else:
                        last_uttered_destination = \
                            verbal_history[-1].split(" ")[1]
                        if driven_destination == last_uttered_destination:
                            driver_reward += 1
                        else:
                            driver_reward += -1
            else:  # timeout
                driver_reward += -10
        else:  # dialog not yet over
            driver_reward += 0
        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            driver_reward += self.compositional_supervision_reward()
        return driver_reward

    def compute_passenger_reward(self):
        desired_destination, verbal_history, _, driven_destination = \
            self.state.get_global_state()
        passenger_reward = 0
        if self.state.is_done():  # to compute at the very end
            if self.state.dialog_complete:
                # completion through a final drive action
                if desired_destination == driven_destination:
                    passenger_reward += 1
                else:
                    passenger_reward += -1
            else:  # timeout
                passenger_reward += -10
        else:  # dialog not yet over
            passenger_reward += 0
        if self.is_supervised:  # and self.num_episodes < self.supervised_episodes:
            passenger_reward += self.compositional_supervision_reward()
        return passenger_reward

    def compute_supervision_reward(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return 0

    def compositional_supervision_reward(self):
        desired_dest, _, all_actions_sofar, _ = self.state.get_global_state()
        all_actions_sofar = " ".join(
            [action.split(" ")[0] for action in all_actions_sofar])
        perfect_dialog = " ".join(
            [x[0] for x in NanoworldEnv.perfect_dialogs[0][1]])
        if perfect_dialog.startswith(all_actions_sofar):
            return 1
        else:
            return 0

    # any kind of exploration is punished ... so negative reward in
    # supervision is bad..
    def compute_supervision_reward_negative(self):
        desired_dest, _, all_actions, _ = self.state.get_global_state()
        dialog_so_far = ", ".join(all_actions)
        for dest, dialog_raw in NanoworldEnv.perfect_dialogs:
            dialog = ", ".join([a + " " + p for a, p in dialog_raw])
            if dest == desired_dest and dialog.startswith(
                    dialog_so_far):  # and len(all_actions) > 2:
                return 1
        return -1

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''
        # pdb.set_trace()
        driver_obs = None
        passenger_obs = None
        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])

        if self.state.turn == 0:
            passenger_obs = self.state.make_passenger_observation()
            driver_obs = None
        elif self.state.turn == 1:
            driver_obs = self.state.make_driver_observation()
            passenger_obs = None

        self.obs = {}
        self.rewards = {}
        if passenger_obs:
            self.obs['passenger'] = passenger_obs
            self.rewards['passenger'] = self.compute_passenger_reward()
        if driver_obs:
            self.obs['driver'] = driver_obs
            self.rewards['driver'] = self.compute_driver_reward()
        self.dones = {'__all__': self.state.is_done()}
        if self.state.is_done():
            self.obs['passenger'] = self.state.make_passenger_observation()
            self.rewards['passenger'] = self.compute_passenger_reward()
            self.obs['driver'] = self.state.make_driver_observation()
            self.rewards['driver'] = self.compute_driver_reward()
        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos