    def rollout(self, batch_info, model) -> Rollout:
        """ Roll out the environment for a single step and return the transition """
        observation_tensor = torch.from_numpy(self.last_observation).to(self.device)

        step = model.step(observation_tensor[None])
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(
            action + noise, self.environment.action_space.low, self.environment.action_space.high
        )

        observation, reward, done, info = self.environment.step(action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        self.backend.store_transition(self.last_observation, action_perturbed, reward, done)

        # As usual, reset the environment and the noise process when the episode ends
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )
Example #2
    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size,
                                                    self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(
            self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(
            self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(
            self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(size=self.batch_size,
                           environment_information=None,
                           transition_tensors={
                               'observations': observations,
                               'observations_next': observations_plus1,
                               'dones': dones,
                               'rewards': rewards,
                               'actions': actions,
                               'weights': torch.ones_like(rewards)
                           })
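A hedged sketch of how a uniformly sampled batch like the one above is typically consumed by a one-step Q-learning update; q_net, target_net and gamma are illustrative placeholders, not names taken from the snippet.

import torch


def dqn_loss(q_net, target_net, batch, gamma=0.99):
    """ Weighted one-step TD error on a sampled Transitions batch (illustration only) """
    q_taken = q_net(batch['observations']).gather(
        1, batch['actions'].long().unsqueeze(1)
    ).squeeze(1)

    with torch.no_grad():
        next_q = target_net(batch['observations_next']).max(dim=1).values
        target = batch['rewards'] + gamma * (1.0 - batch['dones']) * next_q

    # The per-sample weights are all ones here, but the same loss also serves prioritized replay
    return (batch['weights'] * (q_taken - target).pow(2)).mean()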
Example #3
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        epsilon_value = self.epsilon_schedule.value(batch_info['progress'])
        batch_info['epsilon'] = epsilon_value

        last_observation = np.concatenate(
            [
                self.backend.get_frame(self.backend.current_idx, self.frame_stack - 1),
                self.last_observation
            ],
            axis=-1
        )

        observation_tensor = torch.from_numpy(last_observation[None]).to(
            self.device)
        step = model.step(observation_tensor)

        epsgreedy_step = self.epsgreedy_action(step['actions'], epsilon_value)
        action = epsgreedy_step.item()

        observation, reward, done, info = self.environment.step(action)
        self.backend.store_transition(self.last_observation, action, reward,
                                      done)

        # As usual, reset the environment when the episode ends
        if done:
            observation = self.environment.reset()

        self.last_observation = observation

        return Transitions(size=1,
                           environment_information=[info],
                           transition_tensors={
                               'actions': epsgreedy_step.unsqueeze(0),
                               'values': step['values']
                           },
                           extra_data={'epsilon': epsilon_value})
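The rollout relies on self.epsgreedy_action to trade off exploration against exploitation. A minimal free-standing sketch of epsilon-greedy selection, assuming a (batch, num_actions) tensor of action values; the actual method on the roller class may take different arguments.

import torch


def epsgreedy_action(q_values: torch.Tensor, epsilon: float) -> torch.Tensor:
    """ With probability epsilon take a uniformly random action, otherwise the greedy one """
    greedy = q_values.argmax(dim=-1)
    random_actions = torch.randint(q_values.size(-1), greedy.shape, device=q_values.device)
    explore = torch.rand(greedy.shape, device=q_values.device) < epsilon
    return torch.where(explore, random_actions, greedy)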
    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        probs, indexes, tree_idxs = self.backend.sample_batch_prioritized(self.batch_size, self.frame_stack)
        batch = self.backend.get_batch(indexes, self.frame_stack)

        # Compute importance-sampling weights and normalize them by the maximum weight
        priority_weight = self.priority_weight_schedule.value(batch_info['progress'])

        probs = np.stack(probs) / self.backend.segment_tree.total()
        capacity = self.backend.deque.current_size
        weights = (capacity * probs) ** (-priority_weight)
        weights = weights / weights.max()

        observations = torch.from_numpy(batch['states']).to(self.device)
        observations_plus1 = torch.from_numpy(batch['states+1']).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)
        weights = torch.from_numpy(weights.astype(np.float32)).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=None,
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions,
                'weights': weights,
            },
            extra_data={
                'tree_idxs': tree_idxs
            }
        )
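The weights computed above follow the usual prioritized-replay importance-sampling correction w_i = (N * P(i)) ** (-beta), normalized by the largest weight so that updates are only ever scaled down. A small self-contained numeric check with made-up probabilities:

import numpy as np

probs = np.array([0.4, 0.3, 0.2, 0.1])   # hypothetical P(i) for a buffer of capacity N = 4
capacity = 4
beta = 0.5                               # priority_weight, typically annealed towards 1.0

weights = (capacity * probs) ** (-beta)
weights = weights / weights.max()
print(weights)  # the rarest transition (P = 0.1) keeps weight 1.0, frequent ones are scaled down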
Example #5
    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size,
                                                    history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = self._observation_list_to_tensor(batch['states'])
        observations_plus1 = self._observation_list_to_tensor(
            batch['states+1'])

        rewards = batch['rewards'].astype(np.float32)

        if self.ret_rms is not None:
            rewards = np.clip(rewards / np.sqrt(self.ret_rms.var + 1e-8),
                              -self.clip_obs, self.clip_obs)

        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(
            self.device)
        rewards = torch.from_numpy(rewards).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(size=self.batch_size,
                           environment_information=[],
                           transition_tensors={
                               'observations': observations,
                               'observations_next': observations_plus1,
                               'dones': dones,
                               'rewards': rewards,
                               'actions': actions
                           })
Example #6
    def rollout(self, batch_info, model) -> Rollout:
        """ Roll-out the environment and return it """
        # observation_tensor = self._observation_to_tensor(self.last_observation[None])
        observation_tensor = self._observation_to_tensor(self.last_observation)

        step = model.step(observation_tensor)
        action = step['actions'].detach().cpu().numpy()[0]
        noise = self.noise_process()

        action_perturbed = np.clip(action + noise,
                                   self.environment.action_space.low,
                                   self.environment.action_space.high)

        observation, reward, done, info = self.environment.step(
            action_perturbed)

        if self.ob_rms is not None:
            self.ob_rms.update(observation)

        if self.ret_rms is not None:
            self.accumulated_return = reward + self.discount_factor * self.accumulated_return

            self.ret_rms.update(np.array([self.accumulated_return]))

        self.backend.store_transition(self.last_observation, action_perturbed,
                                      reward, done)

        # As usual, reset the environment, the noise process and the accumulated return when the episode ends
        if done:
            observation = self.environment.reset()
            self.noise_process.reset()
            self.accumulated_return = 0.0

        self.last_observation = observation

        return Transitions(
            size=1,
            environment_information=[info],
            transition_tensors={
                'actions': step['actions'],
                'values': step['values']
            },
        )
    def sample(self, batch_info, model) -> Transitions:
        """ Sample experience from replay buffer and return a batch """
        indexes = self.backend.sample_batch_uniform(self.batch_size, history_length=1)
        batch = self.backend.get_batch(indexes, history_length=1)

        observations = torch.from_numpy(self._filter_observation(batch['states'])).to(self.device)
        observations_plus1 = torch.from_numpy(self._filter_observation(batch['states+1'])).to(self.device)
        dones = torch.from_numpy(batch['dones'].astype(np.float32)).to(self.device)
        rewards = torch.from_numpy(batch['rewards'].astype(np.float32)).to(self.device)
        actions = torch.from_numpy(batch['actions']).to(self.device)

        return Transitions(
            size=self.batch_size,
            environment_information=[],
            transition_tensors={
                'observations': observations,
                'observations_next': observations_plus1,
                'dones': dones,
                'rewards': rewards,
                'actions': actions
            }
        )
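Several of the rollers above keep running statistics in self.ob_rms and self.ret_rms for observation and return normalization. A minimal sketch of such a tracker in the style of a baselines-like RunningMeanStd, assuming a mean/variance/count update over batches of samples; the exact class used by the snippets may differ.

import numpy as np


class RunningMeanStd:
    """ Tracks a running mean and variance over batches of samples """

    def __init__(self, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4  # avoids division by zero before the first update

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        # Parallel-variance combination of the old statistics with the new batch
        delta = batch_mean - self.mean
        total_count = self.count + batch_count

        self.mean = self.mean + delta * batch_count / total_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta ** 2 * self.count * batch_count / total_count
        self.var = m2 / total_count
        self.count = total_count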