import pdb
import pickle
import random
import shelve

import gym
import numpy as np
from bsddb3 import db
from cached_property import cached_property

# NOTE: The framework import paths below are assumptions based on the
# garage / ast_toolbox layout this module appears to target; adjust them to
# match your installed versions. GoExploreParameter (used by
# GoExploreASTEnv.get_params_internal) is assumed to be defined elsewhere in
# this package.
from garage.core import Parameterized
from garage.envs.base import Step
from garage.envs.env_spec import EnvSpec

from ast_toolbox.rewards import ExampleAVReward
from ast_toolbox.simulators import ExampleAVSimulator
from ast_toolbox.spaces import ExampleAVSpaces


class ASTEnv(gym.Env):
    """Gym environment that couples a simulator and a reward function for
    adaptive stress testing (AST)."""

    def __init__(self,
                 open_loop=True,
                 blackbox_sim_state=True,
                 fixed_init_state=False,
                 s_0=None,
                 simulator=None,
                 reward_function=None,
                 spaces=None):
        # Constant hyper-parameters -- set by the user
        self.open_loop = open_loop
        self.blackbox_sim_state = blackbox_sim_state
        self.spaces = spaces

        # These are set by reset, not by the user
        self._done = False
        self._reward = 0.0
        self._info = []
        self._step = 0
        self._action = None
        self._actions = []
        self._first_step = True
        self.reward_range = (-float('inf'), float('inf'))
        self.metadata = None
        self._cum_reward = 0.0

        # The simulator must be assigned before the observation space is
        # sampled below, because the observation_space property may delegate
        # to it.
        self.simulator = simulator
        if self.simulator is None:
            self.simulator = ExampleAVSimulator()

        if s_0 is None:
            self._init_state = self.observation_space.sample()
        else:
            self._init_state = s_0
        self._fixed_init_state = fixed_init_state

        self.reward_function = reward_function
        if self.reward_function is None:
            self.reward_function = ExampleAVReward()

        # The environment is vectorized only if its simulator can run
        # batched rollouts.
        if hasattr(self.simulator, "vec_env_executor") and callable(
                getattr(self.simulator, "vec_env_executor")):
            self.vectorized = True
        else:
            self.vectorized = False

    def step(self, action):
        """
        Run one timestep of the environment's dynamics. When the end of an
        episode is reached, reset() should be called to reset the
        environment's internal state.

        Input
        -----
        action : an action provided by the environment

        Outputs
        -------
        (observation, reward, done, info)
        observation : agent's observation of the current environment
        reward [Float] : amount of reward due to the previous action
        done : a boolean, indicating whether the episode has ended
        info : a dictionary containing other diagnostic information from
            the previous action
        """
        self._action = action
        self._actions.append(action)
        action_return = self._action

        # Advance the simulation one step
        obs = self.simulator.step(self._action)
        if (obs is None) or self.open_loop or self.blackbox_sim_state:
            # Open-loop / black-box runs expose only the initial conditions
            obs = np.array(self._init_state)
        if self.simulator.is_terminal() or self.simulator.is_goal():
            self._done = True

        # Calculate the reward for this step
        self._reward = self.reward_function.give_reward(
            action=self._action,
            info=self.simulator.get_reward_info())
        self._cum_reward += self._reward

        # Update instance attributes
        self._step = self._step + 1
        self._simulator_state = self.simulator.clone_state()
        self._env_state = np.concatenate(
            (self._simulator_state,
             np.array([self._cum_reward]),
             np.array([self._step])),
            axis=0)

        if self._done:
            # Replay the full action sequence to validate the trajectory;
            # drop into the debugger if the replay is inconsistent.
            self.simulator.simulate(self._actions, self._init_state)
            if not (self.simulator.is_goal() or self.simulator.is_terminal()):
                pdb.set_trace()

        return Step(observation=obs,
                    reward=self._reward,
                    done=self._done,
                    cache=self._info,
                    actions=action_return,
                    state=self._env_state,
                    is_terminal=self.simulator.is_terminal(),
                    is_goal=self.simulator.is_goal())

    def simulate(self, actions):
        """Run a full simulation of the given action sequence."""
        if not self._fixed_init_state:
            self._init_state = self.observation_space.sample()
        self.simulator.simulate(actions, self._init_state)

    def reset(self):
        """
        Resets the state of the environment, returning an initial
        observation.

        Outputs
        -------
        observation : the initial observation of the space. (Initial reward
            is assumed to be 0.)
        """
        self._actions = []
        if not self._fixed_init_state:
            self._init_state = self.observation_space.sample()
        self._done = False
        self._reward = 0.0
        self._cum_reward = 0.0
        self._info = []
        self._action = None
        self._first_step = True
        self._step = 0

        obs = np.array(self.simulator.reset(self._init_state))
        if not self._fixed_init_state:
            # Append the sampled initial conditions so the agent observes
            # them
            obs = np.concatenate((obs, np.array(self._init_state)), axis=0)
        return obs

    @property
    def action_space(self):
        """
        Returns a Space object
        """
        if self.spaces is None:
            return self.simulator.action_space
        else:
            return self.spaces.action_space

    @property
    def observation_space(self):
        """
        Returns a Space object
        """
        if self.spaces is None:
            return self.simulator.observation_space
        else:
            return self.spaces.observation_space

    def get_cache_list(self):
        """Return the cached environment info."""
        return self._info

    def log(self):
        """Delegate logging to the simulator."""
        self.simulator.log()

    def render(self):
        """Render the simulator, if it supports rendering."""
        if hasattr(self.simulator, "render") and callable(
                getattr(self.simulator, "render")):
            return self.simulator.render()
        else:
            return None

    def close(self):
        """Close the simulator, if it supports closing."""
        if hasattr(self.simulator, "close") and callable(
                getattr(self.simulator, "close")):
            self.simulator.close()
        else:
            return None

    def vec_env_executor(self, n_envs, max_path_length):
        """Return a vectorized executor backed by the simulator."""
        return self.simulator.vec_env_executor(n_envs, max_path_length,
                                               self.reward_function,
                                               self._fixed_init_state,
                                               self._init_state,
                                               self.open_loop)

    def log_diagnostics(self, paths):
        pass

    @cached_property
    def spec(self):
        """
        Returns an EnvSpec.

        Returns:
            spec (garage.envs.EnvSpec)
        """
        return EnvSpec(observation_space=self.observation_space,
                       action_space=self.action_space)
class GoExploreASTEnv(gym.Env, Parameterized):
    """AST environment for the Go-Explore style backward algorithm, which
    restarts rollouts from cells stored in a Berkeley DB cell pool."""

    def __init__(self,
                 open_loop=True,
                 blackbox_sim_state=True,
                 fixed_init_state=False,
                 s_0=None,
                 simulator=None,
                 reward_function=None,
                 spaces=None):
        # Constant hyper-parameters -- set by the user
        self.open_loop = open_loop
        self.blackbox_sim_state = blackbox_sim_state
        self.spaces = spaces
        if spaces is None:
            self.spaces = ExampleAVSpaces()

        # These are set by reset, not by the user
        self._done = False
        self._reward = 0.0
        self._info = {}
        self._step = 0
        self._action = None
        self._actions = []
        self._first_step = True
        self.reward_range = (-float('inf'), float('inf'))
        self.metadata = None
        self.spec._entry_point = []
        self._cum_reward = 0.0
        self.root_action = None
        self.sample_limit = 10000

        self.simulator = simulator
        if self.simulator is None:
            self.simulator = ExampleAVSimulator()

        if s_0 is None:
            self._init_state = self.observation_space.sample()
        else:
            self._init_state = s_0
        self._fixed_init_state = fixed_init_state

        self.reward_function = reward_function
        if self.reward_function is None:
            self.reward_function = ExampleAVReward()

        if hasattr(self.simulator, "vec_env_executor") and callable(
                getattr(self.simulator, "vec_env_executor")):
            self.vectorized = True
        else:
            self.vectorized = False

        # Go-Explore bookkeeping, exposed as GoExploreParameters so parallel
        # workers can share them (see get_params_internal below)
        self.params_set = False
        self.db_filename = 'database.dat'
        self.key_list = []
        self.max_value = 0
        self.robustify_state = []
        self.robustify = False

        # Always call the Parameterized constructor last
        Parameterized.__init__(self)

    def sample(self, population):
        """
        Sample a cell from the population with probability proportional to
        its fitness, using stochastic acceptance:
        https://arxiv.org/pdf/1109.3627.pdf
        https://jbn.github.io/fast_proportional_selection/
        """
        # Draw a uniform candidate and accept it with probability
        # fitness / max_fitness
        attempts = 0
        while attempts < self.sample_limit:
            attempts += 1
            candidate = population[random.choice(self.p_key_list.value)]
            if random.random() < (candidate.fitness / self.p_max_value.value):
                return candidate
        # Fall back to uniform sampling over cells with positive fitness
        attempts = 0
        while attempts < self.sample_limit:
            attempts += 1
            candidate = population[random.choice(self.p_key_list.value)]
            if candidate.fitness > 0:
                print("Returning Uniform Random Sample - Max Attempts Reached!")
                return candidate
        print("Failed to find a valid state for reset!")
        raise ValueError("Failed to find a valid state for reset")

    def get_first_cell(self):
        """Return the observation and packed state of the root cell."""
        obs = self.env_reset()
        if self.blackbox_sim_state:
            obs = self.simulator.get_first_action()
        # Pack (simulator state, cumulative reward, step index); the root
        # cell uses step index -1
        state = np.concatenate(
            (self.simulator.clone_state(),
             np.array([self._cum_reward]),
             np.array([-1])),
            axis=0)
        return obs, state

    def step(self, action):
        """
        Run one timestep of the environment's dynamics. When the end of an
        episode is reached, reset() should be called to reset the
        environment's internal state.

        Input
        -----
        action : an action provided by the environment

        Outputs
        -------
        (observation, reward, done, info)
        observation : agent's observation of the current environment
        reward [Float] : amount of reward due to the previous action
        done : a boolean, indicating whether the episode has ended
        info : a dictionary containing other diagnostic information from
            the previous action
        """
        self._action = action
        self._actions.append(action)
        action_return = self._action

        # Advance the simulation one step
        obs = self.simulator.step(self._action)
        if (obs is None) or self.open_loop or self.blackbox_sim_state:
            obs = np.array(self._init_state)
        if self.simulator.is_terminal() or self.simulator.is_goal():
            self._done = True

        # Calculate the reward for this step
        self._reward = self.reward_function.give_reward(
            action=self._action,
            info=self.simulator.get_reward_info())
        self._cum_reward += self._reward

        # Update instance attributes
        self._step = self._step + 1
        self._simulator_state = self.simulator.clone_state()
        self._env_state = np.concatenate(
            (self._simulator_state,
             np.array([self._cum_reward]),
             np.array([self._step])),
            axis=0)

        return Step(observation=obs,
                    reward=self._reward,
                    done=self._done,
                    cache=self._info,
                    actions=action_return,
                    state=self._env_state,
                    root_action=self.root_action,
                    is_terminal=self.simulator.is_terminal(),
                    is_goal=self.simulator.is_goal())

    def simulate(self, actions):
        """Run a full simulation of the given action sequence."""
        if not self._fixed_init_state:
            self._init_state = self.observation_space.sample()
        return self.simulator.simulate(actions, self._init_state)

    def reset(self, **kwargs):
        """
        Resets the environment, either by restoring a robustification state,
        by restoring a cell sampled from the Go-Explore pool, or from
        scratch via env_reset(). Accepting **kwargs also suppresses a
        deprecation warning thrown by gym.Wrapper when reset is called on a
        wrapped env.
        """
        try:
            if self.p_robustify_state is not None and \
                    self.p_robustify_state.value is not None and \
                    len(self.p_robustify_state.value) > 0:
                # Robustification: restore the exact state under test. Note
                # that this branch returns the rollout's initial conditions
                # rather than the simulator observation.
                state = self.p_robustify_state.value
                print('-----------Robustify Init: ', state,
                      ' -----------------')
                self.simulator.restore_state(state[:-2])
                obs = self.simulator._get_obs()
                self._done = False
                self._cum_reward = state[-2]
                self._step = state[-1]
                self.robustify = True
                return self._init_state

            # Sample a cell from the pool (opened read-only) to restart from
            pool_DB = db.DB()
            pool_DB.open(self.p_db_filename.value,
                         dbname=None,
                         dbtype=db.DB_HASH,
                         flags=db.DB_RDONLY)
            dd_pool = shelve.Shelf(pool_DB, protocol=pickle.HIGHEST_PROTOCOL)
            cell = self.sample(dd_pool)
            dd_pool.close()
            pool_DB.close()

            if cell.state is not None:
                if np.all(cell.state == 0):
                    print("-------DEFORMED CELL STATE-------")
                    obs = self.env_reset()
                else:
                    # Restore the simulator to the sampled cell's state
                    self.simulator.restore_state(cell.state[:-2])
                    if self.simulator.is_terminal() or self.simulator.is_goal():
                        print('-------SAMPLED TERMINAL STATE-------')
                        pdb.set_trace()
                        obs = self.env_reset()
                    else:
                        if cell.score == 0.0 and cell.parent is not None:
                            print("Reset to cell with score 0.0 ---- terminal: ",
                                  self.simulator.is_terminal(),
                                  " goal: ", self.simulator.is_goal(),
                                  " obs: ", cell.observation)
                        obs = self.simulator._get_obs()
                        self._done = False
                        self._cum_reward = cell.state[-2]
                        self._step = cell.state[-1]
                        self.root_action = cell.observation
            else:
                print("Reset from start")
                obs = self.env_reset()

        except (db.DBBusyError, db.DBLockNotGrantedError,
                db.DBLockDeadlockError, db.DBForeignConflictError,
                db.DBAccessError, db.DBPermissionsError,
                db.DBNoSuchFileError, db.DBError) as err:
            # Any Berkeley DB error: fall back to a fresh reset. (The
            # original code listed these handlers separately, and one clause
            # used `except A or B:`, which only catches A; the tuple form
            # catches both.)
            print("Database error on reset: ", type(err).__name__)
            obs = self.env_reset()
        except BaseException:
            print("Failed to get state from database")
            pdb.set_trace()
            obs = self.env_reset()

        return obs

    def env_reset(self):
        """
        Resets the state of the environment from scratch, returning an
        initial observation.

        Outputs
        -------
        observation : the initial observation of the space. (Initial reward
            is assumed to be 0.)
        """
        self._actions = []
        if not self._fixed_init_state:
            self._init_state = self.observation_space.sample()
        self._done = False
        self._reward = 0.0
        self._cum_reward = 0.0
        self._info = {'actions': []}
        self._action = self.simulator.get_first_action()
        self._first_step = True
        self._step = 0

        obs = np.array(self.simulator.reset(self._init_state))
        if not self.blackbox_sim_state:
            # Append the sampled initial conditions so the agent observes
            # them
            obs = np.concatenate((obs, np.array(self._init_state)), axis=0)
        self.root_action = self._action
        return obs

    @property
    def action_space(self):
        """
        Returns a Space object
        """
        if self.spaces is None:
            return self.simulator.action_space
        else:
            return self.spaces.action_space

    @property
    def observation_space(self):
        """
        Returns a Space object
        """
        if self.spaces is None:
            return self.simulator.observation_space
        else:
            return self.spaces.observation_space

    def get_cache_list(self):
        """Return the cached environment info."""
        return self._info

    def log(self):
        """Delegate logging to the simulator."""
        self.simulator.log()

    def render(self, **kwargs):
        """Render the simulator, if it supports rendering."""
        if hasattr(self.simulator, "render") and callable(
                getattr(self.simulator, "render")):
            return self.simulator.render(**kwargs)
        else:
            return None

    def close(self):
        """Close the simulator, if it supports closing."""
        if hasattr(self.simulator, "close") and callable(
                getattr(self.simulator, "close")):
            self.simulator.close()
        else:
            return None

    def vec_env_executor(self, n_envs, max_path_length):
        """Return a vectorized executor backed by the simulator."""
        return self.simulator.vec_env_executor(n_envs, max_path_length,
                                               self.reward_function,
                                               self._fixed_init_state,
                                               self._init_state,
                                               self.open_loop)

    def log_diagnostics(self, paths):
        pass

    @cached_property
    def spec(self):
        """
        Returns an EnvSpec.

        Returns:
            spec (garage.envs.EnvSpec)
        """
        return EnvSpec(observation_space=self.observation_space,
                       action_space=self.action_space)

    def get_params_internal(self, **tags):
        """Lazily create the GoExploreParameters shared with workers, then
        return those selected by the given tags."""
        if not self.params_set:
            self.p_db_filename = GoExploreParameter("db_filename",
                                                    self.db_filename)
            self.p_key_list = GoExploreParameter("key_list", self.key_list)
            self.p_max_value = GoExploreParameter("max_value", self.max_value)
            self.p_robustify_state = GoExploreParameter("robustify_state",
                                                        self.robustify_state)
            self.params_set = True

        if tags.pop("db_filename", False):
            return [self.p_db_filename]
        if tags.pop("key_list", False):
            return [self.p_key_list]
        if tags.pop("max_value", False):
            return [self.p_max_value]
        if tags.pop("robustify_state", False):
            return [self.p_robustify_state]

        return [self.p_db_filename,
                self.p_key_list,
                self.p_max_value,
                self.p_robustify_state]

    def set_param_values(self, param_values, **tags):
        """Set the values of the selected parameters."""
        debug = tags.pop("debug", False)
        for param, value in zip(self.get_params(**tags), param_values):
            param.set_value(value)
            if debug:
                print("setting value of %s" % param.name)

    def get_param_values(self, **tags):
        """Get the values of the selected parameters."""
        return [param.get_value(borrow=True)
                for param in self.get_params(**tags)]

    def downsample(self, obs):
        """Hook for downsampling observations into cell keys; the base
        implementation is the identity."""
        return obs

    def _get_obs(self):
        return self.simulator._get_obs()
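
# ---------------------------------------------------------------------------
# Minimal self-contained sketch of the stochastic-acceptance rule used by
# GoExploreASTEnv.sample() above (Lipowski & Lipowska,
# https://arxiv.org/pdf/1109.3627.pdf). The Cell namedtuple and the toy
# fitness values are hypothetical stand-ins for the cells stored in the
# shelve-backed pool; only the accept/reject loop mirrors the method above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import collections

    Cell = collections.namedtuple('Cell', ['name', 'fitness'])
    population = {name: Cell(name, fitness)
                  for name, fitness in [('a', 1.0), ('b', 3.0), ('c', 6.0)]}
    key_list = list(population)
    max_value = max(cell.fitness for cell in population.values())

    def sample_proportional():
        # Draw uniformly, then accept with probability fitness / max_value;
        # accepted cells are distributed proportionally to fitness without
        # ever summing fitness over the whole population.
        while True:
            candidate = population[random.choice(key_list)]
            if random.random() < candidate.fitness / max_value:
                return candidate

    counts = collections.Counter(sample_proportional().name
                                 for _ in range(10000))
    print(counts)  # expect roughly a 1:3:6 ratio for 'a', 'b', 'c'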