class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment wrapper that normalizes observations and discounted returns
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply a batch of actions to the batch of environments.

        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each returned observation starts a new
        episode (i.e. the previous episode has just ended).
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)

        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)

        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs.astype(np.float32)
        else:
            return obs

    def reset(self):
        """ Reset all environments """
        obs = self.venv.reset()
        return self._obfilt(obs)
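# The normalization wrappers and replay rollers in this section all rely on a RunningMeanStd helper
# that tracks the running mean and variance of a stream of samples. A minimal sketch of such a
# helper is shown below, assuming the parallel-moments update used by OpenAI baselines (Chan et al.);
# the actual implementation in the library may differ in details.
import numpy as np


class RunningMeanStd:
    """ Track the running mean and variance of a stream of (batched) samples """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)

        if x.shape == self.mean.shape:
            x = x[None]  # Promote a single sample (or scalar) to a batch of one

        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        # Chan et al. parallel algorithm for combining the moments of two sample sets
        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count

        self.mean = new_mean
        self.var = m_2 / total_count
        self.count = total_count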
class EnvNormalize(gym.Wrapper):
    """
    Single-environment observation and return normalization, based on VecNormalize from OpenAI baselines
    """

    def __init__(self, env, normalize_observations=True, normalize_returns=True,
                 clip_observations=10., clip_rewards=10., gamma=0.99, epsilon=1e-8):
        super().__init__(env)

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if normalize_observations else None
        self.ret_rms = RunningMeanStd(shape=()) if normalize_returns else None
        self.clipob = clip_observations
        self.cliprew = clip_rewards
        self.ret = 0.0
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action):
        """
        Apply an action to the wrapped environment.

        action -> (observation, reward, done, info)

        where 'done' indicates whether the episode has just ended.
        """
        obs, rews, news, infos = self.env.step(action)
        self.ret = self.ret * self.gamma + rews
        obs = self._filter_observation(obs)

        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)

        if news:
            # Restart the discounted return accumulator when the episode ends
            self.ret = 0.0

        return obs, rews, news, infos

    def _filter_observation(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs.astype(np.float32)
        else:
            return obs

    def reset(self):
        """ Reset the wrapped environment """
        # Also restart the discounted return accumulator
        self.ret = 0.0
        obs = self.env.reset()
        return self._filter_observation(obs)
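# A minimal usage sketch for EnvNormalize. It assumes gym is installed, that "Pendulum-v1" is an
# available environment id, and that the classic gym step API returning (obs, reward, done, info)
# is in use; any continuous-control environment would work the same way.
import gym

env = EnvNormalize(gym.make("Pendulum-v1"))
obs = env.reset()

for _ in range(200):
    obs, reward, done, info = env.step(env.action_space.sample())

    if done:
        obs = env.reset()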
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)

        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)

        # Restart the return accumulator for environments that have just finished an episode
        self.ret[news] = 0.

        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
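# A minimal usage sketch for the vectorized wrapper. DummyVecEnv and its import path below come from
# OpenAI baselines and are an assumption here; any VecEnv implementation with the same interface
# would work.
import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v1") for _ in range(4)]))
obs = venv.reset()

for _ in range(200):
    actions = np.stack([venv.action_space.sample() for _ in range(venv.num_envs)])
    obs, rews, news, infos = venv.step(actions)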
class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Environment roller with an experience replay buffer, rolling out a **single** environment
    with an Ornstein-Uhlenbeck noise process added to the actions
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations
        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]
        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space),
            float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = RunningMeanStd(shape=self.environment.observation_space.shape) if normalize_observations else None
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll out the environment for a single step and return the transition """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)
        step = model.step(observation_tensor[None])

        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # As usual, reset the environment and the noise process when the episode is done
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize the observation """
        if self.ob_rms is not None:
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8),
                          -self.clip_obs, self.clip_obs)
            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from the replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
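# DequeReplayRollerOuNoise perturbs the policy's actions with temporally correlated
# Ornstein-Uhlenbeck noise. A minimal sketch of such a noise process is shown below; the parameter
# names and the theta/dt defaults are assumptions, not the library's exact implementation.
import numpy as np


class OrnsteinUhlenbeckNoiseProcess:
    """ Ornstein-Uhlenbeck process: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1) """

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float32)
        self.sigma = np.asarray(sigma, dtype=np.float32)
        self.theta = theta
        self.dt = dt
        self.reset()

    def __call__(self):
        # Euler-Maruyama discretization of the OU stochastic differential equation
        x = (self.previous
             + self.theta * (self.mu - self.previous) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.previous = x
        return x

    def reset(self):
        # Restart the process from its mean at the start of every episode
        self.previous = np.copy(self.mu)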
class TransitionReplayEnvRoller(ReplayEnvRollerBase):
    """
    Calculate environment rollouts and store them in a replay buffer for experience replay.

    The replay buffer is parametrized. Samples drawn from it are individual frame transitions.
    """

    def __init__(self, environment, device, replay_buffer: ReplayBuffer,
                 discount_factor: typing.Optional[float]=None, normalize_returns: bool=False,
                 forward_steps: int=1, action_noise: typing.Optional[nn.Module]=None):
        self._environment = environment
        self.device = device
        self.replay_buffer = replay_buffer
        self.normalize_returns = normalize_returns
        self.forward_steps = forward_steps
        self.discount_factor = discount_factor
        self.action_noise = action_noise.to(self.device) if action_noise is not None else None

        if self.normalize_returns:
            assert self.discount_factor is not None, \
                "TransitionReplayEnvRoller must have a discount factor defined if normalize_returns is turned on"

        if self.forward_steps > 1:
            assert self.discount_factor is not None, \
                "TransitionReplayEnvRoller must have a discount factor defined if forward_steps is larger than one"

        self.ret_rms = RunningMeanStd(shape=()) if normalize_returns else None

        # Initial observation
        self.last_observation_cpu = torch.from_numpy(self.environment.reset()).clone()
        self.last_observation = self.last_observation_cpu.to(self.device)

        # Return normalization
        self.clip_obs = 5.0
        self.accumulated_returns = np.zeros(environment.num_envs, dtype=np.float32)

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    @torch.no_grad()
    def rollout(self, batch_info: BatchInfo, model: RlModel, number_of_steps: int) -> Rollout:
        """ Calculate env rollout """
        assert not model.is_recurrent, "Replay env roller does not support recurrent models"

        accumulator = TensorAccumulator()
        episode_information = []  # List of dictionaries with episode information

        for step_idx in range(number_of_steps):
            step = model.step(self.last_observation)

            if self.action_noise is not None:
                step['actions'] = self.action_noise(step['actions'], batch_info=batch_info)

            replay_extra_information = {}

            accumulator.add('observations', self.last_observation_cpu)

            # Add step to the tensor accumulator
            for name, tensor in step.items():
                tensor_cpu = tensor.cpu()
                accumulator.add(name, tensor_cpu)

                if name != 'actions':
                    replay_extra_information[name] = tensor_cpu.numpy()

            actions_numpy = step['actions'].detach().cpu().numpy()
            new_obs, new_rewards, new_dones, new_infos = self.environment.step(actions_numpy)

            # Store rollout in the experience replay buffer
            self.replay_buffer.store_transition(
                frame=self.last_observation_cpu.numpy(),
                action=actions_numpy,
                reward=new_rewards,
                done=new_dones,
                extra_info=replay_extra_information
            )

            if self.ret_rms is not None:
                self.accumulated_returns = new_rewards + self.discount_factor * self.accumulated_returns
                self.ret_rms.update(self.accumulated_returns)

            # Done is flagged true when the episode has ended AND the frame we see is already a first
            # frame from the next episode
            dones_tensor = torch.from_numpy(new_dones.astype(np.float32)).clone()
            accumulator.add('dones', dones_tensor)

            if self.action_noise is not None:
                self.action_noise.reset_training_state(dones_tensor, batch_info=batch_info)

            self.accumulated_returns = self.accumulated_returns * (1.0 - new_dones.astype(np.float32))

            self.last_observation_cpu = torch.from_numpy(new_obs).clone()
            self.last_observation = self.last_observation_cpu.to(self.device)

            if self.ret_rms is not None:
                new_rewards = np.clip(new_rewards / np.sqrt(self.ret_rms.var + 1e-8),
                                      -self.clip_obs, self.clip_obs)

            accumulator.add('rewards', torch.from_numpy(new_rewards.astype(np.float32)).clone())

            episode_information.append(new_infos)

        accumulated_tensors = accumulator.result()

        return Trajectories(
            num_steps=accumulated_tensors['observations'].size(0),
            num_envs=accumulated_tensors['observations'].size(1),
            environment_information=episode_information,
            transition_tensors=accumulated_tensors,
            rollout_tensors={}
        ).to_transitions()

    def sample(self, batch_info: BatchInfo, model: RlModel, number_of_steps: int) -> Rollout:
        """ Sample experience from the replay buffer and return a batch """
        if self.forward_steps > 1:
            transitions = self.replay_buffer.sample_forward_transitions(
                batch_size=number_of_steps, batch_info=batch_info,
                forward_steps=self.forward_steps, discount_factor=self.discount_factor
            )
        else:
            transitions = self.replay_buffer.sample_transitions(batch_size=number_of_steps, batch_info=batch_info)

        if self.ret_rms is not None:
            rewards = transitions.transition_tensors['rewards']
            new_rewards = torch.clamp(rewards / np.sqrt(self.ret_rms.var + 1e-8), -self.clip_obs, self.clip_obs)
            transitions.transition_tensors['rewards'] = new_rewards

        return transitions

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.replay_buffer.is_ready_for_sampling()

    def initial_memory_size_hint(self) -> typing.Optional[int]:
        """ Hint how much data is needed to begin sampling, required only for diagnostics """
        return self.replay_buffer.initial_memory_size_hint()

    def update(self, rollout, batch_info):
        """ Perform update of the internal state of the buffer - e.g. for the prioritized replay weights """
        self.replay_buffer.update(rollout, batch_info)
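# TransitionReplayEnvRoller only assumes that action_noise is an nn.Module that can be called with a
# batch of actions plus batch_info, and that it exposes reset_training_state(dones, batch_info=...).
# A minimal, hypothetical module satisfying that interface is sketched below; the Gaussian scheme and
# all names are illustrative assumptions, not part of the library.
import torch
import torch.nn as nn


class GaussianActionNoise(nn.Module):
    """ Add zero-mean Gaussian noise to a batch of continuous actions (illustrative sketch) """

    def __init__(self, std: float = 0.1, clip: float = 1.0):
        super().__init__()
        self.std = std
        self.clip = clip

    def forward(self, actions: torch.Tensor, batch_info=None) -> torch.Tensor:
        noise = torch.randn_like(actions) * self.std
        return torch.clamp(actions + noise, -self.clip, self.clip)

    def reset_training_state(self, dones: torch.Tensor, batch_info=None) -> None:
        # This noise is stateless, so there is nothing to reset. A stateful process (e.g. an
        # Ornstein-Uhlenbeck module) would reset its state for the environments where dones == 1.
        pass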