Example #3
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        # Zero the accumulated return for environments whose episode just ended
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs.astype(np.float32)
        else:
            return obs

    def reset(self):
        """
        Reset all environments and the accumulated returns
        """
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
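
All of the examples on this page rely on a RunningMeanStd helper that tracks a running mean and variance over batches of values. Its definition is not shown here; the following is a minimal sketch consistent with the parallel-variance update used in OpenAI baselines (the epsilon initial count and float64 accumulators are assumptions, not taken from any particular project):

import numpy as np


class RunningMeanStd:
    """ Track a running mean and variance over batches of values (sketch) """

    def __init__(self, epsilon: float = 1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon  # Small initial count avoids division by zero on the first update

    def update(self, x):
        """ Update the statistics from a batch of values (first axis is the batch axis) """
        x = np.asarray(x, dtype=np.float64)
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        batch_count = x.shape[0]

        # Chan et al. parallel algorithm for merging means and variances of two samples
        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        new_mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / total_count

        self.mean = new_mean
        self.var = m_2 / total_count
        self.count = total_count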
Example #5
class EnvNormalize(gym.Wrapper):
    """
    Single environment normalization based on VecNormalize from OpenAI baselines
    """
    def __init__(self, env, normalize_observations=True, normalize_returns=True,
                 clip_observations=10., clip_rewards=10., gamma=0.99, epsilon=1e-8):
        super().__init__(env)

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if normalize_observations else None
        self.ret_rms = RunningMeanStd(shape=()) if normalize_returns else None
        self.clipob = clip_observations
        self.cliprew = clip_rewards
        self.ret = 0.0
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, action):
        """
        Apply a single action to the wrapped environment

        action -> (observation, reward, done, info)

        with the observation and (optionally) the reward normalized on the fly.
        """
        obs, rews, news, infos = self.env.step(action)

        self.ret = self.ret * self.gamma + rews

        obs = self._filter_observation(obs)

        if self.ret_rms:
            # RunningMeanStd expects a batch, so wrap the scalar accumulated return
            self.ret_rms.update(np.array([self.ret]))
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)

        if news:
            # Reset the accumulated return at the end of an episode
            self.ret = 0.0

        return obs, rews, news, infos

    def _filter_observation(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)

            return obs.astype(np.float32)
        else:
            return obs

    def reset(self):
        """
        Reset the wrapped environment and the accumulated return
        """
        self.ret = 0.0
        obs = self.env.reset()
        return self._filter_observation(obs)
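
A brief usage sketch for EnvNormalize, assuming the classic gym API in which step returns a four-tuple and reset returns only the observation (the environment id is purely illustrative):

import gym

# Wrap a single environment so observations and rewards are normalized on the fly
env = EnvNormalize(gym.make('Pendulum-v0'), normalize_observations=True, normalize_returns=True)

obs = env.reset()

for _ in range(1000):
    obs, reward, done, info = env.step(env.action_space.sample())

    if done:
        obs = env.reset()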
Example #6
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
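
A brief usage sketch for this VecNormalize variant, assuming the baselines-style DummyVecEnv (the import path and environment id are illustrative and may differ between versions):

import gym
import numpy as np

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

# Wrap four copies of an environment and normalize their observations and returns
venv = VecNormalize(DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)]))

obs = venv.reset()

for _ in range(100):
    actions = np.stack([venv.action_space.sample() for _ in range(venv.num_envs)])
    obs, rewards, dones, infos = venv.step(actions)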
Example #7
class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Enrionment roller with experience replay buffer rolling out a **single** environment
    with Ornstein–Uhlenbeck noise process
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations

        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]

        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space), float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = RunningMeanStd(shape=self.environment.observation_space.shape) if normalize_observations else None
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        # Normalize the observation the same way sampled observations are normalized before acting
        observation_tensor = torch.from_numpy(self._filter_observation(self.last_observation)).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # As usual, reset the environment (and the noise process) when the episode ends
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize observation """
        if self.ob_rms is not None:
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8), -self.clip_obs, self.clip_obs)

            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
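
DequeReplayRollerOuNoise delegates exploration noise to an OrnsteinUhlenbeckNoiseProcess that is called once per step and reset at episode boundaries. Below is a minimal sketch with the same interface; the theta and dt defaults are assumptions rather than values taken from the library:

import numpy as np


class OrnsteinUhlenbeckNoiseProcess:
    """ Temporally correlated exploration noise, as commonly used with DDPG (sketch) """

    def __init__(self, mu, sigma, theta: float = 0.15, dt: float = 1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma = np.asarray(sigma, dtype=np.float64)
        self.theta = theta
        self.dt = dt
        self.reset()

    def __call__(self):
        """ Draw the next sample: Euler-Maruyama step of dx = theta * (mu - x) dt + sigma dW """
        x = (
            self.x_prev
            + self.theta * (self.mu - self.x_prev) * self.dt
            + self.sigma * np.sqrt(self.dt) * np.random.standard_normal(self.mu.shape)
        )
        self.x_prev = x
        return x

    def reset(self):
        """ Reset the process state, e.g. at the end of an episode """
        self.x_prev = np.zeros_like(self.mu)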
class TransitionReplayEnvRoller(ReplayEnvRollerBase):
    """
    Calculate environment rollouts using a replay buffer for experience replay.
    Replay buffer is parametrized
    Samples transitions from the replay buffer (individual frame transitions)
    """

    def __init__(self, environment, device, replay_buffer: ReplayBuffer, discount_factor: typing.Optional[float]=None,
                 normalize_returns: bool=False, forward_steps: int=1, action_noise: typing.Optional[nn.Module]=None):
        self._environment = environment
        self.device = device
        self.replay_buffer = replay_buffer
        self.normalize_returns = normalize_returns
        self.forward_steps = forward_steps
        self.discount_factor = discount_factor
        self.action_noise = action_noise.to(self.device) if action_noise is not None else None

        if self.normalize_returns:
            assert self.discount_factor is not None, \
                "TransitionReplayEnvRoller must have a discount factor defined if normalize_returns is turned on"

        if self.forward_steps > 1:
            assert self.discount_factor is not None, \
                "TransitionReplayEnvRoller must have a discount factor defined if forward_steps is larger than one"

        self.ret_rms = RunningMeanStd(shape=()) if normalize_returns else None

        # Initial observation
        self.last_observation_cpu = torch.from_numpy(self.environment.reset()).clone()
        self.last_observation = self.last_observation_cpu.to(self.device)

        # Return normalization (note: clip_obs here is used as the reward clipping range)
        self.clip_obs = 5.0
        self.accumulated_returns = np.zeros(environment.num_envs, dtype=np.float32)

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    @torch.no_grad()
    def rollout(self, batch_info: BatchInfo, model: RlModel, number_of_steps: int) -> Rollout:
        """ Calculate env rollout """
        assert not model.is_recurrent, "Replay env roller does not support recurrent models"

        accumulator = TensorAccumulator()
        episode_information = []  # List of dictionaries with episode information

        for step_idx in range(number_of_steps):
            step = model.step(self.last_observation)

            if self.action_noise is not None:
                step['actions'] = self.action_noise(step['actions'], batch_info=batch_info)

            replay_extra_information = {}

            accumulator.add('observations', self.last_observation_cpu)

            # Add step to the tensor accumulator
            for name, tensor in step.items():
                tensor_cpu = tensor.cpu()
                accumulator.add(name, tensor_cpu)

                if name != 'actions':
                    replay_extra_information[name] = tensor_cpu.numpy()

            actions_numpy = step['actions'].detach().cpu().numpy()
            new_obs, new_rewards, new_dones, new_infos = self.environment.step(actions_numpy)

            # Store rollout in the experience replay buffer
            self.replay_buffer.store_transition(
                frame=self.last_observation_cpu.numpy(),
                action=actions_numpy,
                reward=new_rewards,
                done=new_dones,
                extra_info=replay_extra_information
            )

            if self.ret_rms is not None:
                self.accumulated_returns = new_rewards + self.discount_factor * self.accumulated_returns
                self.ret_rms.update(self.accumulated_returns)

            # Done is flagged true when the episode has ended AND the frame we see is already the first frame of the
            # next episode
            dones_tensor = torch.from_numpy(new_dones.astype(np.float32)).clone()
            accumulator.add('dones', dones_tensor)

            if self.action_noise is not None:
                self.action_noise.reset_training_state(dones_tensor, batch_info=batch_info)

            self.accumulated_returns = self.accumulated_returns * (1.0 - new_dones.astype(np.float32))

            self.last_observation_cpu = torch.from_numpy(new_obs).clone()
            self.last_observation = self.last_observation_cpu.to(self.device)

            if self.ret_rms is not None:
                new_rewards = np.clip(new_rewards / np.sqrt(self.ret_rms.var + 1e-8), -self.clip_obs, self.clip_obs)

            accumulator.add('rewards', torch.from_numpy(new_rewards.astype(np.float32)).clone())

            episode_information.append(new_infos)

        accumulated_tensors = accumulator.result()

        return Trajectories(
            num_steps=accumulated_tensors['observations'].size(0),
            num_envs=accumulated_tensors['observations'].size(1),
            environment_information=episode_information,
            transition_tensors=accumulated_tensors,
            rollout_tensors={}
        ).to_transitions()

    def sample(self, batch_info: BatchInfo, model: RlModel, number_of_steps: int) -> Rollout:
        """ Sample experience from replay buffer and return a batch """
        if self.forward_steps > 1:
            transitions = self.replay_buffer.sample_forward_transitions(
                batch_size=number_of_steps, batch_info=batch_info, forward_steps=self.forward_steps,
                discount_factor=self.discount_factor
            )
        else:
            transitions = self.replay_buffer.sample_transitions(batch_size=number_of_steps, batch_info=batch_info)

        if self.ret_rms is not None:
            rewards = transitions.transition_tensors['rewards']
            new_rewards = torch.clamp(rewards / np.sqrt(self.ret_rms.var + 1e-8), -self.clip_obs, self.clip_obs)
            transitions.transition_tensors['rewards'] = new_rewards

        return transitions

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.replay_buffer.is_ready_for_sampling()

    def initial_memory_size_hint(self) -> typing.Optional[int]:
        """ Hint how much data is needed to begin sampling, required only for diagnostics """
        return self.replay_buffer.initial_memory_size_hint()

    def update(self, rollout, batch_info):
        """ Perform update of the internal state of the buffer - e.g. for the prioritized replay weights """
        self.replay_buffer.update(rollout, batch_info)
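
For reference, the reward scaling used by TransitionReplayEnvRoller mirrors the VecNormalize scheme: a discounted running return is maintained per environment, its variance is tracked with RunningMeanStd(shape=()), and raw rewards are divided by the resulting standard deviation before clipping. The standalone helper below illustrates that logic; its name and signature are hypothetical and not part of the class above:

import numpy as np


def scale_rewards(rewards, dones, accumulated_returns, ret_rms, discount_factor, clip=5.0, epsilon=1e-8):
    """ Hypothetical helper mirroring the return-based reward scaling in TransitionReplayEnvRoller

    rewards, dones: arrays of shape (num_envs,) from a single vectorized environment step
    accumulated_returns: per-environment discounted running return, updated in place
    ret_rms: a RunningMeanStd(shape=()) instance tracking the variance of those returns
    """
    # Update the discounted running return and its statistics
    accumulated_returns[:] = rewards + discount_factor * accumulated_returns
    ret_rms.update(accumulated_returns)

    # Zero the running return for environments whose episode just ended
    accumulated_returns[:] *= 1.0 - dones.astype(np.float32)

    # Scale rewards by the running standard deviation of the returns and clip
    return np.clip(rewards / np.sqrt(ret_rms.var + epsilon), -clip, clip)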