コード例 #1
0
def get_half_filled_buffer():
    """ Return simple preinitialized buffer """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))

    for i in range(10):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    return buffer
コード例 #2
0
def get_filled_buffer_extra_info():
    """ Return simple preinitialized buffer """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)

    buffer = DequeBufferBackend(20, observation_space, action_space, extra_data={
        'neglogp': np.zeros(20, dtype=float)
    })

    v1 = np.ones(4).reshape((2, 2, 1))

    for i in range(30):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False, extra_info={'neglogp': i / 30.0})

    return buffer
コード例 #3
0
def get_filled_buffer_with_dones():
    """ Return simple preinitialized buffer with some done's in there """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)
    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))

    done_set = {2, 5, 10, 13, 18, 22, 28}

    for i in range(30):
        if i in done_set:
            buffer.store_transition(v1 * (i+1), 0, float(i)/2, True)
        else:
            buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    return buffer
コード例 #4
0
def get_filled_buffer1x1_history():
    """ Return simple preinitialized buffer """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 1), dtype=int)
    action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=float)

    buffer = DequeBufferBackend(20, observation_space=observation_space, action_space=action_space)

    v1 = np.ones(2).reshape((2, 1))
    a1 = np.arange(2).reshape((2,))

    for i in range(30):
        item = v1.copy()
        item[0] *= (i+1)
        item[1] *= 10 * (i+1)

        buffer.store_transition(item, a1 * i, float(i)/2, False)

    return buffer
コード例 #5
0
def test_buffer_filling_size():
    """ Check if buffer size is properly updated when we add items """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)
    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))

    t.eq_(buffer.current_size, 0)

    buffer.store_transition(v1, 0, 0, False)
    buffer.store_transition(v1, 0, 0, False)

    t.eq_(buffer.current_size, 2)

    for i in range(30):
        buffer.store_transition(v1 * (i+1), 0, float(i)/2, False)

    t.eq_(buffer.current_size, buffer.buffer_capacity)
コード例 #6
0
def test_simple_get_frame():
    """ Check if get_frame returns frames from a buffer partially full """
    observation_space = gym.spaces.Box(low=0, high=255, shape=(2, 2, 1), dtype=np.uint8)
    action_space = gym.spaces.Discrete(4)
    buffer = DequeBufferBackend(20, observation_space, action_space)

    v1 = np.ones(4).reshape((2, 2, 1))
    v2 = v1 * 2
    v3 = v1 * 3

    buffer.store_transition(v1, 0, 0, False)
    buffer.store_transition(v2, 0, 0, False)
    buffer.store_transition(v3, 0, 0, False)

    assert np.all(buffer.get_frame(0, 4).max(0).max(0) == np.array([0, 0, 0, 1]))
    assert np.all(buffer.get_frame(1, 4).max(0).max(0) == np.array([0, 0, 1, 2]))
    assert np.all(buffer.get_frame(2, 4).max(0).max(0) == np.array([0, 1, 2, 3]))

    with t.assert_raises(VelException):
        buffer.get_frame(3, 4)

    with t.assert_raises(VelException):
        buffer.get_frame(4, 4)
コード例 #7
0
class DequeReplayRollerOuNoise(ReplayEnvRollerBase):
    """
    Enrionment roller with experience replay buffer rolling out a **single** environment
    with Ornstein–Uhlenbeck noise process
    """

    def __init__(self, environment, device, batch_size, buffer_capacity, buffer_initial_size, noise_std_dev,
                 normalize_observations=False):
        self.device = device
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.normalize_observations = normalize_observations

        self.device = device
        self._environment = environment

        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space
        )

        self.last_observation = self.environment.reset()

        len_action_space = self.environment.action_space.shape[-1]

        self.noise_process = OrnsteinUhlenbeckNoiseProcess(
            np.zeros(len_action_space), float(noise_std_dev) * np.ones(len_action_space)
        )

        self.ob_rms = RunningMeanStd(shape=self.environment.observation_space.shape) if normalize_observations else None
        self.clip_obs = 10.0

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # Usual, reset on done
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )

    def _filter_observation(self, obs):
        """ Potentially normalize observation """
        if self.ob_rms is not None:
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + 1e-8), -self.clip_obs, self.clip_obs)

            return obs.astype(np.float32)
        else:
            return obs

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
コード例 #8
0
class DequeReplayRollerEpsGreedy(ReplayEnvRollerBase):
    """
    Environment roller for action-value models using experience replay.
    Simplest buffer implementation just holding up to given number of samples.

    Because framestack is implemented directly in the buffer, we can use *much* less space to hold samples in
    memory for very little additional cost.
    """
    def __init__(self, environment, device, epsilon_schedule: Schedule,
                 batch_size: int, buffer_capacity: int,
                 buffer_initial_size: int, frame_stack: int):
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.buffer_initial_size = buffer_initial_size
        self.frame_stack = frame_stack

        self.device = device
        self._environment = environment
        self.backend = DequeBufferBackend(
            buffer_capacity=self.buffer_capacity,
            observation_space=environment.observation_space,
            action_space=environment.action_space)

        self.last_observation = self.environment.reset()

    @property
    def environment(self):
        """ Return environment of this env roller """
        return self._environment

    def is_ready_for_sampling(self) -> bool:
        """ If buffer is ready for drawing samples from it (usually checks if there is enough data) """
        return self.backend.current_size >= self.buffer_initial_size

    def epsgreedy_action(self, policy_samples, epsilon):
        """ Sample e-greedy action using curreny policy and epsilon value """
        random_samples = torch.randint_like(policy_samples,
                                            self.environment.action_space.n)
        selector = torch.rand_like(random_samples, dtype=torch.float32)
        return torch.where(selector > epsilon, policy_samples, random_samples)

    @torch.no_grad()
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        last_observation = np.concatenate([
            self.backend.get_frame(self.backend.current_idx,
                                   self.frame_stack - 1), self.last_observation
        ],
                                          axis=-1)

        observation_tensor = torch.from_numpy(last_observation[None]).to(
            self.device)
        step = model.step(observation_tensor)

        epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
        action = epsgreedy_step.item()

        observation, reward, done, info = self.environment.step(action)
        self.backend.store_transition(self.last_observation, action, reward,
                                      done)

        # Usual, reset on done
        if done:
            observation = self.environment.reset()

        self.last_observation = observation

        return Transitions(size=1,
                           environment_information=[info],
                           transition_tensors={
                               'actions': epsgreedy_step.unsqueeze(0),
                               'values': step['values']
                           },
                           extra_data={'epsilon': epsilon_value})

    def metrics(self):
        """ List of metrics to track for this learning process """
        return [
            AveragingNamedMetric("epsilon"),
        ]

    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size,
                                                    self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(
            self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(
            self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(
            self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(size=self.batch_size,
                           environment_information=None,
                           transition_tensors={
                               'observations': observations,
                               'observations_next': observations_plus1,
                               'dones': dones,
                               'rewards': rewards,
                               'actions': actions,
                               'weights': torch.ones_like(rewards)
                           })