def _convert_state(self, state):
    if isinstance(state, (float, int)):
        state = ch.totensor(state)
    if isinstance(state, dict):
        state = {k: self._convert_state(state[k]) for k in state}
    if isinstance(state, np.ndarray):
        state = ch.totensor(state)
    if self.is_vectorized and isinstance(state, th.Tensor):
        state = state.squeeze(0)
    return state
def _convert_state(self, state):
    if isinstance(state, (float, int)):
        state = ch.totensor(state)
    if isinstance(state, dict):
        state = {k: self._convert_state(state[k]) for k in state}
    if isinstance(state, np.ndarray):
        state = ch.totensor(state)
    # We need to check for num_envs because self.is_vectorized returns
    # False when num_envs=1, but the state still needs squeezing.
    if hasattr(self, 'num_envs') and isinstance(state, th.Tensor):
        state = state.squeeze(0)
    return state
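# A minimal sketch of why the num_envs check above matters, assuming a
# vectorized env with num_envs=1 still returns observations with a leading
# batch dimension. DummyWrapper and its attributes are hypothetical
# stand-ins for the wrapper this method belongs to; only the branches of
# _convert_state relevant to the squeeze are reproduced.
import numpy as np
import torch as th
import cherry as ch

class DummyWrapper:
    num_envs = 1           # vectorized env running a single copy
    is_vectorized = False  # reports False when num_envs == 1

    def _convert_state(self, state):
        if isinstance(state, np.ndarray):
            state = ch.totensor(state)
        if hasattr(self, 'num_envs') and isinstance(state, th.Tensor):
            state = state.squeeze(0)
        return state

state = np.zeros((1, 4), dtype=np.float32)         # (num_envs, obs_dim)
print(DummyWrapper()._convert_state(state).shape)  # torch.Size([4])
# Gating on is_vectorized instead would skip the squeeze and leave (1, 4).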
def test_config(n_envs, base_env, use_torch, use_logger, return_info):
    config = 'n_envs' + str(n_envs) + '-base_env' + str(base_env) \
        + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
        + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env
        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {'policy': policy(x)[0]})
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, steps=NUM_STEPS)

    # Pre-compute some shapes
    shape = (NUM_STEPS, n_envs)
    state_shape = vec_env.observation_space.sample()[0]
    if isinstance(state_shape, (int, float)):
        state_shape = tuple()
    else:
        state_shape = state_shape.shape
    action_shape = vec_env.action_space.sample()[0]
    if isinstance(action_shape, (int, float)):
        action_shape = (1, )
    else:
        action_shape = action_shape.shape
    done_shape = tuple()

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, (NUM_STEPS, ) + action_shape, config)
def append(self,
           state=None,
           action=None,
           reward=None,
           next_state=None,
           done=None,
           **infos):
    """
    **Description**

    Appends a new transition to the ExperienceReplay.

    **Arguments**

    * **state** (tensor/ndarray/list) - Originating state.
    * **action** (tensor/ndarray/list) - Executed action.
    * **reward** (tensor/ndarray/list) - Observed reward.
    * **next_state** (tensor/ndarray/list) - Resulting state.
    * **done** (tensor/bool) - Is `next_state` a terminal (absorbing) state?
    * **infos** (dict, *optional*, default=None) - Additional information on the transition.

    **Example**

    ~~~python
    replay.append(state, action, reward, next_state, done, info={
        'density': density,
        'log_prob': density.log_prob(action),
    })
    ~~~
    """
    for key in infos:
        if _istensorable(infos[key]):
            infos[key] = ch.totensor(infos[key])
    sars = Transition(ch.totensor(state),
                      ch.totensor(action),
                      ch.totensor(reward),
                      ch.totensor(next_state),
                      ch.totensor(done),
                      **infos)
    self._storage.append(sars.to(self.device))
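# A minimal usage sketch for append(), assuming the class is exposed as
# ch.ExperienceReplay and can be constructed with no arguments; the
# per-field accessors (replay.state(), len(replay)) follow the tests in
# this section.
import torch as th
import cherry as ch

replay = ch.ExperienceReplay()
for step in range(3):
    state = th.randn(4)
    next_state = th.randn(4)
    action = th.tensor([1])
    # Extra keyword arguments are stored as per-transition infos.
    replay.append(state, action, 1.0, next_state, step == 2, step=step)

print(len(replay))           # 3 transitions stored
print(replay.state().shape)  # states stacked along the first dimension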
def test_config(n_envs, n_episodes, base_env, use_torch, use_logger, return_info, retry):
    config = 'n_envs' + str(n_envs) + '-n_eps' + str(n_episodes) \
        + '-base_env' + str(base_env) \
        + '-torch' + str(use_torch) + '-logger' + str(use_logger) \
        + '-info' + str(return_info)
    if isinstance(base_env, str):
        env = vec_env = gym.vector.make(base_env, num_envs=n_envs)
    else:
        def make_env():
            env = base_env()
            return env
        env_fns = [make_env for _ in range(n_envs)]
        env = vec_env = AsyncVectorEnv(env_fns)
    if use_logger:
        env = envs.Logger(env, interval=5, logger=self.logger)
    if use_torch:
        env = envs.Torch(env)
        policy = lambda x: ch.totensor(vec_env.action_space.sample())
    else:
        policy = lambda x: vec_env.action_space.sample()
    if return_info:
        agent = lambda x: (policy(x), {
            'policy': policy(x)[0],
            'act': policy(x),
        })
    else:
        agent = policy

    # Gather experience
    env = envs.Runner(env)
    replay = env.run(agent, episodes=n_episodes)
    if retry:
        replay = env.run(agent, episodes=n_episodes)

    # Pre-compute some shapes
    shape = (len(replay), )
    state_shape = vec_env.observation_space.sample().shape[1:]
    action_shape = np.array(vec_env.action_space.sample())[0].shape
    if len(action_shape) == 0:
        action_shape = (1, )
    done_shape = (1, )

    # Check shapes
    states = replay.state()
    self.assertEqual(states.shape, shape + state_shape, config)
    actions = replay.action()
    self.assertEqual(actions.shape, shape + action_shape, config)
    dones = replay.done()
    self.assertEqual(dones.shape, shape + done_shape, config)
    if return_info:
        policies = replay.policy()
        self.assertEqual(policies.shape, shape + action_shape, config)
        acts = replay.act()
        self.assertEqual(acts.shape, (len(replay), n_envs) + action_shape, config)
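# A condensed end-to-end sketch of the pattern the two tests above
# exercise. The 'CartPole-v0' env id is an arbitrary choice for
# illustration; envs is assumed to refer to cherry.envs, as in the tests.
import gym
import cherry as ch
import cherry.envs as envs

vec_env = gym.vector.make('CartPole-v0', num_envs=2)
env = envs.Torch(vec_env)  # converts interactions to torch tensors
env = envs.Runner(env)     # adds env.run(agent, steps=... / episodes=...)

agent = lambda x: ch.totensor(vec_env.action_space.sample())
replay = env.run(agent, episodes=2)
print(replay.state().shape)  # per-field accessors return stacked tensors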