Example #1
def _get_experience(self):
    return Experience(
        state0=self.exp_state0,  # NOTE: here state0 is always None
        action=self.exp_action,
        reward=self.exp_reward,
        state1=self._preprocess_state(self.exp_state1),
        terminal1=self.exp_terminal1)
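This and the following examples all construct the same Experience container. A minimal sketch of what it presumably looks like, assuming a plain namedtuple whose fields match the keyword arguments used above; the real library may define it differently.

from collections import namedtuple

# Hypothetical definition of the container used in these examples; the field
# names match the keyword arguments passed in the snippets on this page.
Experience = namedtuple('Experience',
                        'state0, action, reward, state1, terminal1')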
Example #2
    def sample(self, batch_size, batch_idxs=None):
        if batch_idxs is None:
            # Draw random indexes such that we have at least a single entry before each
            # index.
            batch_idxs = sample_batch_indexes(0,
                                              self.nb_entries - 1,
                                              size=batch_size)
        batch_idxs = np.array(batch_idxs) + 1
        assert np.min(batch_idxs) >= 1
        assert np.max(batch_idxs) < self.nb_entries
        assert len(batch_idxs) == batch_size

        # Create experiences
        experiences = []
        for idx in batch_idxs:
            terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            while terminal0:
                # Skip this transition because the environment was reset here. Select a new, random
                # transition and use this instead. This may cause the batch to contain the same
                # transition twice.
                idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
                terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            assert 1 <= idx < self.nb_entries

            # This code is slightly complicated by the fact that subsequent observations might be
            # from different episodes. We ensure that an experience never spans multiple episodes.
            # This is probably not that important in practice but it seems cleaner.
            state0 = [self.observations[idx - 1]]
            for offset in range(0, self.window_length - 1):
                current_idx = idx - 2 - offset
                current_terminal = self.terminals[
                    current_idx - 1] if current_idx - 1 > 0 else False
                if current_idx < 0 or (not self.ignore_episode_boundaries
                                       and current_terminal):
                    # The previously handled observation was terminal, don't add the current one.
                    # Otherwise we would leak into a different episode.
                    break
                state0.insert(0, self.observations[current_idx])
            while len(state0) < self.window_length:
                state0.insert(0, zeroed_observation(state0[0]))
            action = self.actions[idx - 1]
            reward = self.rewards[idx - 1]
            terminal1 = self.terminals[idx - 1]

            # Okay, now we need to create the follow-up state. This is state0 shifted one timestep
            # to the right. Again, we need to be careful not to include an observation from the next
            # episode if the last state is terminal.
            state1 = [np.copy(x) for x in state0[1:]]
            state1.append(self.observations[idx])

            assert len(state0) == self.window_length
            assert len(state1) == len(state0)
            experiences.append(
                Experience(state0=state0,
                           action=action,
                           reward=reward,
                           state1=state1,
                           terminal1=terminal1))
        assert len(experiences) == batch_size
        return experiences
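A hedged usage sketch for the sample() method above, assuming `memory` is an instance of the class this method belongs to: the returned Experience tuples are typically unpacked into per-field arrays before a training update.

import numpy as np

# Hypothetical call site; `memory` is an assumed instance exposing sample().
experiences = memory.sample(batch_size=32)

# Unpack the namedtuples into batched arrays, one per field.
state0_batch = np.array([e.state0 for e in experiences])
action_batch = np.array([e.action for e in experiences])
reward_batch = np.array([e.reward for e in experiences])
state1_batch = np.array([e.state1 for e in experiences])
terminal1_batch = np.array([e.terminal1 for e in experiences])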
Example #3
    def sample(self, batch_size, batch_idxs=None):
        if batch_idxs is None:
            batch_idxs = sample_batch_indexes(0,
                                              self.nb_entries - 1,
                                              size=batch_size)
        batch_idxs = np.array(batch_idxs) + 1

        experiences = []
        for idx in batch_idxs:
            terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            while terminal0:
                idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
                terminal0 = self.terminals[idx - 2] if idx >= 2 else False
            assert 1 <= idx < self.nb_entries

            state0 = [self.observations[idx - 1]]
            for offset in range(0, self.window_length - 1):
                current_idx = idx - 2 - offset
                current_terminal = self.terminals[
                    current_idx - 1] if current_idx - 1 > 0 else False
                if current_idx < 0 or (not self.ignore_episode_boundaries
                                       and current_terminal):
                    break
                state0.insert(0, self.observations[current_idx])
            while len(state0) < self.window_length:
                state0.insert(0, zeroed_observation(state0[0]))
            action = self.actions[idx - 1]
            reward = self.rewards[idx - 1]
            terminal1 = self.terminals[idx - 1]
            state1 = [np.copy(x) for x in state0[1:]]
            state1.append(self.observations[idx])
            experiences.append(
                Experience(state0, action, reward, state1, terminal1))
        return experiences
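Both sampling examples call a sample_batch_indexes helper that is not shown. A minimal sketch of such a helper, assuming it draws `size` indexes from the half-open range [low, high), without replacement when enough entries exist and with replacement otherwise; the original library's version may behave differently.

import random
import numpy as np

def sample_batch_indexes(low, high, size):
    if high - low >= size:
        # Enough entries available: sample without replacement.
        return random.sample(range(low, high), size)
    # Not enough entries yet: fall back to sampling with replacement.
    return np.random.randint(low, high, size=size).tolist()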
Example #4
def _get_experience(self):
    if self.hist_len == 1:
        return Experience(
            state0=self.exp_state0,  # NOTE: here state0 is always None
            action=self.exp_action,
            reward=self.exp_reward,
            state1=self._preprocessState(self.exp_state1),
            terminal1=self.exp_terminal1,
            extras=self.extras)
    else:
        return Experience(
            state0=self.exp_state0,  # NOTE: here state0 is always None
            action=self.exp_action,
            reward=self.exp_reward,
            state1=self.state_buffer,
            terminal1=self.exp_terminal1,
            extras=self.extras)
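Example #4 passes an extras field that the five-field tuple sketched earlier does not have. A hypothetical extension matching its keyword arguments:

from collections import namedtuple

# Assumed variant of Experience with an extra slot, to match Example #4.
Experience = namedtuple('Experience',
                        'state0, action, reward, state1, terminal1, extras')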
Example #5
def _reset_experience(self):
    # Reset so that one fresh set of observations is collected from the env
    # for every action taken.
    self.experience = Experience(state0=None,
                                 action=None,
                                 reward=None,
                                 state1=None,
                                 terminal1=False)
Example #6
def _reset_experience(self):
    self.experience = Experience(state0=None,
                                 action=None,
                                 reward=None,
                                 state1=None,
                                 terminal1=False)
Example #7
def _get_experience(self):
    return Experience(state0=self.exp_state0,
                      action=self.exp_action,
                      reward=self.exp_reward,
                      state1=self._preprocessState(self.exp_state1),
                      terminal1=self.exp_terminal1)
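A hedged sketch of how a per-step loop might drive the _get_experience() pattern above; the `agent` attributes and the gym-style `env` API are assumptions for illustration, not taken from the examples.

# Hypothetical per-step loop around the _get_experience() pattern above.
observation = env.reset()
action = agent.act(observation)  # hypothetical action-selection call
next_observation, reward, done, _ = env.step(action)
agent.exp_state0 = None          # as the NOTE comments say, state0 stays None here
agent.exp_action = action
agent.exp_reward = reward
agent.exp_state1 = next_observation
agent.exp_terminal1 = done
experience = agent._get_experience()  # builds the Experience shown above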