def get_half_filled_buffer():
    """ Return a simple preinitialized buffer filled to half its capacity """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))

    # Store 10 transitions into a buffer of capacity 20
    for i in range(10):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    return buffer
def get_filled_buffer_extra_info():
    """ Return a simple preinitialized buffer that also stores extra 'neglogp' data for each transition """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(
        20, observation_space, action_space,
        extra_data={'neglogp': np.zeros(20, dtype=float)}
    )

    v1 = np.ones(4).reshape((2, 2, 1))

    for i in range(30):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False, extra_info={'neglogp': i / 30.0})

    return buffer
def get_filled_buffer_with_dones():
    """ Return a simple preinitialized buffer with some done flags set """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))
    done_set = {2, 5, 10, 13, 18, 22, 28}

    for i in range(30):
        if i in done_set:
            buffer.store_transition(v1 * (i+1), 0, float(i)/2, True)
        else:
            buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    return buffer
def get_filled_buffer1x1_history():
    """ Return simple preinitialized buffer """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 1), dtype=int)
    action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)

    buffer = DequeBufferBackend(20, observation_space=observation_space, action_space=action_space)

    v1 = np.ones(2).reshape((2, 1))
    a1 = np.arange(2).reshape((2,))

    for i in range(30):
        item = v1.copy()
        item[0] *= (i+1)
        item[1] *= 10 * (i+1)

        buffer.store_transition(item, a1 * i, float(i)/2, False)

    return buffer
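# A minimal sketch of how the helpers above might be used in a test. It assumes that
# sample_batch_uniform/get_batch behave as they do in the rollers further below (returning
# 'states', 'states+1', 'dones', 'rewards' and 'actions' keys); the asserted shapes are
# illustrative, not verified against the backend implementation.
def test_sample_batch_from_half_filled_buffer():
    """ Check that a half-filled buffer reports its size and can be sampled from """
    buffer = get_half_filled_buffer()

    # Ten transitions were stored into a buffer of capacity twenty
    t.eq_(buffer.current_size, 10)

    indexes = buffer.sample_batch_uniform(4, history_length=1)
    batch = buffer.get_batch(indexes, history_length=1)

    # Each requested transition should come back with its observation and scalar fields
    t.eq_(batch['states'].shape[0], 4)
    t.eq_(batch['rewards'].shape[0], 4)
    t.eq_(batch['dones'].shape[0], 4)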
def test_buffer_filling_size():
    """ Check if buffer size is properly updated when we add items """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))

    t.eq_(buffer.current_size, 0)

    buffer.store_transition(v1, 0, 0, False)
    buffer.store_transition(v1, 0, 0, False)

    t.eq_(buffer.current_size, 2)

    for i in range(30):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    t.eq_(buffer.current_size, buffer.buffer_capacity)
def test_simple_get_frame():
    """ Check if get_frame returns frames from a partially filled buffer """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))
    v2 = v1 * 2
    v3 = v1 * 3

    buffer.store_transition(v1, 0, 0, False)
    buffer.store_transition(v2, 0, 0, False)
    buffer.store_transition(v3, 0, 0, False)

    # A history of length 4 is zero-padded for frames stored before the buffer had enough data
    assert np.all(buffer.get_frame(0, 4).max(0).max(0) == np.array([0, 0, 0, 1]))
    assert np.all(buffer.get_frame(1, 4).max(0).max(0) == np.array([0, 0, 1, 2]))
    assert np.all(buffer.get_frame(2, 4).max(0).max(0) == np.array([0, 1, 2, 3]))

    # Indices beyond the stored data should raise
    with t.assert_raises(VelException):
        buffer.get_frame(3, 4)

    with t.assert_raises(VelException):
        buffer.get_frame(4, 4)
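# A hedged companion sketch: sampling a batch from the buffer with done flags and checking
# that the returned 'dones' column is boolean-like. The batch key names follow the rollers
# below; the dtype behaviour asserted here is an assumption about the backend, not verified.
def test_sample_batch_with_dones():
    """ Check that sampling from a buffer containing done flags returns a dones column """
    buffer = get_filled_buffer_with_dones()

    indexes = buffer.sample_batch_uniform(8, history_length=1)
    batch = buffer.get_batch(indexes, history_length=1)

    t.eq_(batch['dones'].shape[0], 8)

    # Every sampled done flag should be either 0 or 1 once cast to float
    dones = batch['dones'].astype(np.float32)
    assert np.all((dones == 0.0) | (dones == 1.0))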
class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Environment roller with an experience replay buffer, rolling out a **single** environment
    with an Ornstein–Uhlenbeck noise process added to the actions
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations

        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]

        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space), float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = (
            RunningMeanStd(shape=self.environment.observation_space.shape)
            if normalize_observations else None
        )
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        # Perturb the policy action with noise and keep it inside the action space bounds
        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # As usual, reset the environment (and the noise process) when the episode is done
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize observation """
        if self.ob_rms is not None:
            obs = np.clip(
                (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8),
                -self.clip_obs, self.clip_obs
            )
            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
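# A minimal usage sketch for the roller above, with a hypothetical stub model standing in for
# a real DDPG-style policy. 'Pendulum-v0' and the StubModel class are illustrative assumptions;
# any continuous-action environment and any model whose step() returns 'actions' and 'values'
# tensors should work the same way.
class StubModel:
    def step(self, observations):
        batch_size = observations.shape[0]
        return {
            'actions': torch.zeros(batch_size, 1),  # a fixed zero action, perturbed by OU noise in the roller
            'values': torch.zeros(batch_size)
        }


def run_ou_roller_sketch():
    env = gym.make('Pendulum-v0')

    roller = DequeReplayRollerOuNoise(
        environment=env, device=torch.device('cpu'), batch_size=32,
        buffer_capacity=1000, buffer_initial_size=100, noise_std_dev=0.2
    )

    model = StubModel()

    # Roll single steps until the buffer holds enough transitions, then draw a training batch
    while not roller.is_ready_for_sampling():
        roller.rollout(batch_info={}, model=model)

    return roller.sample(batch_info={}, model=model)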
class DequeReplayRollerEpsGreedy(ReplayEnvRollerBase):
    """
    Environment roller for action-value models using experience replay.

    Simplest buffer implementation, just holding up to a given number of samples.

    Because framestack is implemented directly in the buffer, we can use *much* less space to hold samples in
    memory for very little additional cost.
    """

    def __init__(self, environment, device, epsilon_schedule: Schedule, batch_size: int, buffer_capacity: int,
                 buffer_initial_size: int, frame_stack: int):
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack = frame_stack
        self.device = device

        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    def epsgreedy_action(self, policy_samples, epsilon):
        """ Sample an epsilon-greedy action using the current policy and epsilon value """
        random_samples = torch.randint_like(policy_samples, self.environment.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)
        # With probability epsilon take the random action, otherwise the policy action
        return torch.where(selector > epsilon, policy_samples, random_samples)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        # Stack the last (frame_stack - 1) frames from the buffer together with the current observation
        last_observation = np.concatenate([
            self.backend.get_frame(self.backend.current_idx, self.frame_stack - 1),
            self.last_observation
        ], axis=-1)

        observation_tensor = torch.from_numpy(last_observation[None]).to(self.device)
        step = model.step(observation_tensor)

        epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
        action = epsgreedy_step.item()

        observation, reward, done, info = self.environment.step(action)

        self.backend.store_transition(self.last_observation, action, reward, done)

        # As usual, reset the environment when the episode is done
        if done:
            observation = self.environment.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': epsgreedy_step.unsqueeze(0),
                'values': step['values']
            },
            extra_data={'epsilon': epsilon_value}
        )

    def metrics(self):
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("epsilon"),
        ]

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=None,
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions,
                'weights': torch.ones_like(rewards)
            }
        )
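# A small sketch of the epsilon schedule interface the roller above relies on: it only calls
# epsilon_schedule.value(batch_info['progress']). LinearDecaySchedule is a hypothetical
# stand-in written for illustration, not the library's own Schedule implementation.
class LinearDecaySchedule:
    """ Linearly decay epsilon from a start to an end value over the course of training """

    def __init__(self, start=1.0, end=0.1):
        self.start = start
        self.end = end

    def value(self, progress):
        # progress is expected in [0, 1]; clamp just in case
        progress = min(max(progress, 0.0), 1.0)
        return self.start + (self.end - self.start) * progress


# e.g. DequeReplayRollerEpsGreedy(env, device, LinearDecaySchedule(), batch_size=32,
#      buffer_capacity=100_000, buffer_initial_size=10_000, frame_stack=4)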