def __init__(self, environment: VecEnv, device, number_of_steps,
                 discount_factor, buffer_capacity, buffer_initial_size,
                 frame_stack_compensation):
        """
        Keep the roller settings, reset the environment and create the replay buffer.

        :param environment: vectorized environment; it is reset here and its spaces describe the buffer
        :param device: stored on the instance as ``self.device``
        :param number_of_steps: stored on the instance unchanged
        :param discount_factor: stored on the instance unchanged
        :param buffer_capacity: capacity handed to the replay buffer backend
        :param buffer_initial_size: stored on the instance unchanged
        :param frame_stack_compensation: stored unchanged; the buffer backend receives ``True``
            for its own ``frame_stack_compensation`` flag whenever this value is not None
        """
        self._environment = environment
        self.device = device

        # Rollout settings
        self.number_of_steps = number_of_steps
        self.discount_factor = discount_factor

        # Replay buffer settings
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack_compensation = frame_stack_compensation

        # Start from a fresh environment: keep both the raw observation and its tensor form
        initial_frame = self.environment.reset()
        self.last_observation_cpu = initial_frame
        self.last_observation = self._to_tensor(initial_frame)

        # Extra per-transition storage: one float32 row of size action_space.n per buffer slot and env
        env = self.environment
        logits_storage = np.zeros(
            (self.buffer_capacity, env.num_envs, env.action_space.n),
            dtype=np.float32)
        compensate = self.frame_stack_compensation is not None

        self.replay_buffer = DequeMultiEnvBufferBackend(
            buffer_capacity=self.buffer_capacity,
            num_envs=env.num_envs,
            observation_space=env.observation_space,
            action_space=env.action_space,
            extra_data={'action_logits': logits_storage},
            frame_stack_compensation=compensate)
Beispiel #2
0
def get_half_filled_buffer():
    """ Build a buffer of capacity 20 for two environments and store ten transitions in it """
    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=int),
        action_space=gym.spaces.Discrete(4),
    )

    for step in range(10):
        scale = step + 1

        # Fresh frame of ones each step; the two slices along the leading axis get different scales
        frame = np.ones((2, 2, 2, 1))
        frame[0] *= scale
        frame[1] *= 10 * scale

        backend.store_transition(frame, 0, float(step) / 2, False)

    return backend
Beispiel #3
0
def get_filled_buffer3x3():
    """ Build a buffer of capacity 20 with Box observations and Box actions and store 30 transitions in it """
    frame_shape = (2, 2, 2)

    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=frame_shape, dtype=int),
        action_space=gym.spaces.Box(low=-1.0, high=1.0, shape=frame_shape, dtype=float),
    )

    # Integer ramp 0..15; every stored action is this ramp multiplied by the step index
    action_ramp = np.arange(16).reshape((2, 2, 2, 2))

    for step in range(30):
        scale = step + 1

        # Fresh frame of ones each step; index 0 and index 1 of the second axis get different scales
        frame = np.ones((2, 2, 2, 2))
        frame[:, 0] *= scale
        frame[:, 1] *= 10 * scale

        backend.store_transition(frame, step * action_ramp, float(step) / 2, False)

    return backend
Beispiel #4
0
def get_filled_buffer_with_dones():
    """ Build a buffer of capacity 20, store 30 transitions and mark some of them as done """
    terminal_steps = {2, 5, 10, 13, 18, 22, 28}

    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=int),
        action_space=gym.spaces.Discrete(4),
    )

    for step in range(30):
        scale = step + 1

        frame = np.ones((2, 2, 2, 1))
        frame[0] *= scale
        frame[1] *= 10 * scale

        # First flag follows the step index, second flag follows the index shifted by one
        flags = np.array([step in terminal_steps, scale in terminal_steps], dtype=bool)
        backend.store_transition(frame, 0, float(step) / 2, flags)

    return backend
Beispiel #5
0
def get_filled_buffer_extra_info():
    """ Build a buffer of capacity 20 with an extra 'neglogp' field and store 30 transitions in it """
    neglogp_storage = np.zeros((20, 2), dtype=float)

    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=int),
        action_space=gym.spaces.Discrete(4),
        extra_data={'neglogp': neglogp_storage},
    )

    for step in range(30):
        scale = step + 1

        frame = np.ones((2, 2, 2, 1))
        frame[0] *= scale
        frame[1] *= 10 * scale

        # Two extra values per transition, matching the second dimension of the 'neglogp' storage
        extras = {'neglogp': np.array([step / 30.0, scale / 30.0])}
        backend.store_transition(frame, 0, float(step) / 2, False, extra_info=extras)

    return backend
Beispiel #6
0
def test_buffer_filling_size():
    """ Check that current_size follows the number of stored items and stops at the buffer capacity """
    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=int),
        action_space=gym.spaces.Discrete(4),
    )

    ones_frame = np.ones((2, 2, 2, 1))

    # Nothing stored yet
    t.eq_(backend.current_size, 0)

    for _ in range(2):
        backend.store_transition(ones_frame, 0, 0, False)

    t.eq_(backend.current_size, 2)

    # Store more transitions than the capacity of 20
    for step in range(30):
        backend.store_transition(ones_frame * (step + 1), 0, float(step) / 2, False)

    t.eq_(backend.current_size, backend.buffer_capacity)
Beispiel #7
0
def test_simple_get_frame():
    """ Check get_frame on a partially filled buffer, including indexes that must raise """
    backend = DequeMultiEnvBufferBackend(
        20,
        num_envs=2,
        observation_space=gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=int),
        action_space=gym.spaces.Discrete(4),
    )

    # Ones in the first slice of the leading axis, twos in the second slice
    base_frame = np.ones((2, 2, 2, 1))
    base_frame[1] *= 2

    for factor in (1, 2, 3):
        backend.store_transition(base_frame * factor, 0, 0, False)

    # (env index, expected per-channel maxima for frame indexes 0, 1 and 2) with a history of 4
    expectations = [
        (0, ([0, 0, 0, 1], [0, 0, 1, 2], [0, 1, 2, 3])),
        (1, ([0, 0, 0, 2], [0, 0, 2, 4], [0, 2, 4, 6])),
    ]

    for env_idx, expected_rows in expectations:
        for frame_idx, expected in enumerate(expected_rows):
            stacked = backend.get_frame(frame_idx, env_idx, 4)
            assert np.all(stacked.max(0).max(0) == np.array(expected))

    # Only three transitions were stored, so frame indexes 3 and 4 must be rejected
    for env_idx in (0, 1):
        for frame_idx in (3, 4):
            with t.assert_raises(VelException):
                backend.get_frame(frame_idx, env_idx, 4)
class ReplayQEnvRoller(ReplayEnvRollerBase):
    """
    Env roller that steps a vectorized environment, records every transition in a replay buffer
    and can later draw rollouts back out of that buffer.
    Data handed to the model is kept as pytorch tensors on the configured device.
    """
    def __init__(self, environment: VecEnv, device, number_of_steps,
                 discount_factor, buffer_capacity, buffer_initial_size,
                 frame_stack_compensation):
        """
        Keep the roller settings, reset the environment and create the replay buffer.

        :param environment: vectorized environment; it is reset here and its spaces describe the buffer
        :param device: device that tensors created by this roller are moved to
        :param number_of_steps: number of environment steps per rollout and length of sampled rollouts
        :param discount_factor: stored on the instance unchanged
        :param buffer_capacity: capacity handed to the replay buffer backend
        :param buffer_initial_size: minimum buffer size before ``is_ready_for_sampling`` returns True
        :param frame_stack_compensation: passed as ``history_length`` when sampling; the buffer backend
            receives ``True`` for its own ``frame_stack_compensation`` flag whenever this is not None
        """
        self._environment = environment
        self.device = device

        # Rollout settings
        self.number_of_steps = number_of_steps
        self.discount_factor = discount_factor

        # Replay buffer settings
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack_compensation = frame_stack_compensation

        # Start from a fresh environment: keep both the raw observation and its tensor form
        initial_frame = self.environment.reset()
        self.last_observation_cpu = initial_frame
        self.last_observation = self._to_tensor(initial_frame)

        # Extra per-transition storage: one float32 row of size action_space.n per buffer slot and env
        env = self.environment
        logits_storage = np.zeros(
            (self.buffer_capacity, env.num_envs, env.action_space.n),
            dtype=np.float32)
        compensate = self.frame_stack_compensation is not None

        self.replay_buffer = DequeMultiEnvBufferBackend(
            buffer_capacity=self.buffer_capacity,
            num_envs=env.num_envs,
            observation_space=env.observation_space,
            action_space=env.action_space,
            extra_data={'action_logits': logits_storage},
            frame_stack_compensation=compensate)

    @property
    def environment(self):
        """ Environment this roller is stepping """
        return self._environment

    def _to_tensor(self, numpy_array):
        """ Wrap a numpy array as a tensor and move it to this roller's device """
        tensor = torch.from_numpy(numpy_array)
        return tensor.to(self.device)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """
        Step the environment ``number_of_steps`` times with actions from ``model.step``.

        Every transition is also written to the replay buffer. The model's 'logprobs' output
        is what gets stored under the buffer's 'action_logits' field.
        ``batch_info`` is not used by this method.

        :return: Trajectories with the stacked per-step tensors and the model's value estimate
            for the observation that follows the last step
        """
        # Per-step device tensors, keyed by the name they get in the returned trajectories
        per_step = {
            'observations': [],
            'rewards': [],
            'dones': [],
            'actions': [],
            'logprobs': [],
        }
        infos_per_step = []  # Python objects, one entry per step

        for _ in range(self.number_of_steps):
            policy_output = model.step(self.last_observation)
            chosen_actions = policy_output['actions']
            chosen_logprobs = policy_output['logprobs']

            per_step['observations'].append(self.last_observation)
            per_step['actions'].append(chosen_actions)
            per_step['logprobs'].append(chosen_logprobs)

            env_actions = chosen_actions.detach().cpu().numpy()
            next_frame, rewards, dones, infos = self.environment.step(env_actions)

            # The buffer receives the observation the action was chosen for, not the new one
            self.replay_buffer.store_transition(
                frame=self.last_observation_cpu,
                action=env_actions,
                reward=rewards,
                done=dones,
                extra_info={
                    'action_logits': chosen_logprobs.detach().cpu().numpy(),
                })

            # Done is flagged true when the episode has ended AND the frame we see is already
            # a first frame from the next episode
            self.last_observation_cpu = next_frame[:]
            self.last_observation = self._to_tensor(self.last_observation_cpu)

            per_step['dones'].append(self._to_tensor(dones.astype(np.float32)))
            per_step['rewards'].append(self._to_tensor(rewards.astype(np.float32)))

            infos_per_step.append(infos)

        bootstrap_values = model.value(self.last_observation)

        return Trajectories(
            num_steps=self.number_of_steps,
            num_envs=self.environment.num_envs,
            environment_information=infos_per_step,
            transition_tensors={
                name: torch.stack(chunks) for name, chunks in per_step.items()
            },
            rollout_tensors={'final_estimated_values': bootstrap_values})

    def is_ready_for_sampling(self) -> bool:
        """ Whether the buffer already holds at least ``buffer_initial_size`` entries """
        enough_data = self.replay_buffer.current_size >= self.buffer_initial_size
        return enough_data

    @torch.no_grad()
    def sample(self, batch_info, model):
        """
        Draw a rollout of ``number_of_steps`` steps from the replay buffer.

        ``batch_info`` is not used by this method.

        :return: Trajectories built from the buffered data, with ``model.value`` evaluated on the
            last entry of the buffer's 'states+1' field
        """
        window = {
            'rollout_length': self.number_of_steps,
            'history_length': self.frame_stack_compensation,
        }

        indexes = self.replay_buffer.sample_batch_rollout(**window)
        batch = self.replay_buffer.get_rollout(indexes, **window)

        transition_tensors = {
            'observations': self._to_tensor(batch['states']),
            'dones': self._to_tensor(batch['dones'].astype(np.float32)),
            'rewards': self._to_tensor(batch['rewards']),
            'actions': self._to_tensor(batch['actions']),
            'logprobs': self._to_tensor(batch['action_logits'])
        }

        last_next_state = self._to_tensor(batch['states+1'][-1])
        bootstrap_values = model.value(last_next_state)

        return Trajectories(
            num_steps=self.number_of_steps,
            num_envs=self.environment.num_envs,
            environment_information=None,
            transition_tensors=transition_tensors,
            rollout_tensors={'final_estimated_values': bootstrap_values})
Beispiel #9
0
class ReplayQEnvRoller(ReplayEnvRollerBase):
    """
    Class calculating env rollouts and storing them in a buffer for experience replay
    Idea behind this class is to store as much as we can as pytorch tensors to minimize tensor copying.
    """
    def __init__(self, environment: VecEnv, device, number_of_steps,
                 discount_factor, buffer_capacity, buffer_initial_size,
                 frame_stack_compensation):
        """
        Keep the roller settings, reset the environment and create the replay buffer.

        :param environment: vectorized environment; it is reset here and its spaces describe the buffer
        :param device: device that tensors created by this roller are moved to
        :param number_of_steps: number of environment steps per rollout and length of sampled rollouts
        :param discount_factor: stored on the instance unchanged
        :param buffer_capacity: capacity handed to the replay buffer backend
        :param buffer_initial_size: minimum buffer size before ``is_ready_for_sampling`` returns True
        :param frame_stack_compensation: passed as ``history_length`` when sampling; the buffer backend
            receives ``True`` for its own ``frame_stack_compensation`` flag whenever this is not None
        """
        self._environment = environment
        self.device = device
        self.number_of_steps = number_of_steps
        self.discount_factor = discount_factor
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack_compensation = frame_stack_compensation

        # Initial observation
        self.last_observation = self._to_tensor(self.environment.reset())

        # One "done" flag per environment, all cleared. The dtype is uint8 on purpose: rollout()
        # replaces this tensor with uint8 flags after every step and later stacks the initial
        # tensor together with those, so all of them have to share a single dtype.
        self.dones = torch.zeros(
            self.last_observation.shape[0],
            dtype=torch.uint8,
            device=self.device)

        # Shape of one rollout once the step and environment axes are merged into a batch axis
        self.batch_observation_shape = (
            (self.last_observation.shape[0] * self.number_of_steps, ) +
            self.environment.observation_space.shape)

        # Replay buffer
        self.replay_buffer = DequeMultiEnvBufferBackend(
            buffer_capacity=self.buffer_capacity,
            num_envs=self.environment.num_envs,
            observation_space=self.environment.observation_space,
            action_space=self.environment.action_space,
            extra_data={
                'action_logits':
                np.zeros((self.buffer_capacity, self.environment.num_envs,
                          self.environment.action_space.n),
                         dtype=np.float32)
            },
            frame_stack_compensation=self.frame_stack_compensation is not None)

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def _to_tensor(self, numpy_array):
        """ Convert numpy array to a tensor """
        return torch.from_numpy(numpy_array).to(self.device)

    @torch.no_grad()
    def rollout(self, batch_info, model):
        """
        Step the environment ``number_of_steps`` times with actions from ``model.step``.

        Every transition is also written to the replay buffer. ``batch_info`` is not used.

        :return: dict with keys 'size', 'observations', 'masks', 'dones', 'rewards', 'actions',
            'episode_information', 'action_logits' and 'final_values'; the per-step tensors have
            their step and environment axes merged into a single batch axis
        """
        observation_accumulator = []  # Device tensors
        action_accumulator = []  # Device tensors
        action_logit_accumulator = []  # Device tensors
        dones_accumulator = []  # Device tensors
        rewards_accumulator = []  # Device tensors
        episode_information = []  # Python objects

        for _ in range(self.number_of_steps):
            step = model.step(self.last_observation)

            actions = step['actions']

            observation_accumulator.append(self.last_observation)
            action_accumulator.append(actions)
            # Flags recorded here are the ones produced by the previous step
            dones_accumulator.append(self.dones)

            action_logits = step['action_logits']
            action_logit_accumulator.append(action_logits)

            actions_numpy = actions.detach().cpu().numpy()
            new_obs, new_rewards, new_dones, new_infos = self.environment.step(
                actions_numpy)

            # Store rollout in the experience replay buffer
            self.replay_buffer.store_transition(
                frame=self.last_observation.detach().cpu().numpy(),
                action=actions_numpy,
                reward=new_rewards,
                done=new_dones,
                extra_info={
                    'action_logits': action_logits.detach().cpu().numpy(),
                })

            # Done is flagged true when the episode has ended AND the frame we see is already a first frame from the
            # Next episode
            self.dones = self._to_tensor(new_dones.astype(np.uint8))
            self.last_observation = self._to_tensor(new_obs[:])

            rewards_accumulator.append(
                self._to_tensor(new_rewards.astype(np.float32)))

            # Keep only the 'episode' entries that are present and truthy
            for info in new_infos:
                maybe_episode_info = info.get('episode')

                if maybe_episode_info:
                    episode_information.append(maybe_episode_info)

        final_values = model.value(self.last_observation)

        # One more entry than steps, so the stack can be split into "before" and "after" views
        dones_accumulator.append(self.dones)

        observation_buffer = torch.stack(observation_accumulator)
        rewards_buffer = torch.stack(rewards_accumulator)
        actions_buffer = torch.stack(action_accumulator)
        dones_buffer = torch.stack(dones_accumulator)
        action_logit_buffer = torch.stack(action_logit_accumulator)

        # Dones and masks are basically the same, just shifted by 1
        masks_buffer = dones_buffer[:-1, :]
        dones_buffer = dones_buffer[1:, :]

        batch_action_shape = (action_logit_buffer.size(0) *
                              action_logit_buffer.size(1),
                              action_logit_buffer.size(2))

        # Reshape into final batch size
        return {
            'size': self.batch_observation_shape[0],
            'observations': observation_buffer.reshape(self.batch_observation_shape),
            'masks': masks_buffer.flatten(),
            'dones': dones_buffer.flatten(),
            'rewards': rewards_buffer.flatten(),
            'actions': actions_buffer.flatten(),
            'episode_information': episode_information,
            'action_logits': action_logit_buffer.reshape(batch_action_shape),
            'final_values': final_values
        }

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.replay_buffer.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def sample(self, batch_info, model):
        """
        Sample experience from replay buffer and return a batch.

        ``batch_info`` is not used by this method.

        :return: dict with keys 'observations', 'dones', 'rewards', 'actions', 'action_logits'
            and 'final_values'; ``model.value`` is evaluated on the last entry of the buffer's
            'states+1' field
        """
        rollout_idx = self.replay_buffer.sample_batch_rollout(
            rollout_length=self.number_of_steps,
            history_length=self.frame_stack_compensation)

        rollout = self.replay_buffer.get_rollout(
            rollout_idx,
            rollout_length=self.number_of_steps,
            history_length=self.frame_stack_compensation)

        action_logits_tensor = self._to_tensor(rollout['action_logits'])
        batch_action_shape = (action_logits_tensor.size(0) *
                              action_logits_tensor.size(1),
                              action_logits_tensor.size(2))

        final_values = model.value(self._to_tensor(rollout['states+1'][-1]))

        # reshape (as in rollout()) instead of view: view raises when the memory layout of the
        # array coming out of the buffer does not allow a zero-copy reinterpretation
        return {
            'observations': self._to_tensor(rollout['states']).reshape(
                self.batch_observation_shape),
            'dones': self._to_tensor(rollout['dones'].astype(np.uint8)).flatten(),
            'rewards': self._to_tensor(rollout['rewards']).flatten(),
            'actions': self._to_tensor(rollout['actions']).flatten(),
            'action_logits': action_logits_tensor.reshape(batch_action_shape),
            'final_values': final_values
        }