def _get_experience(self):
    return Experience(
        state0=self.exp_state0,  # NOTE: here state0 is always None
        action=self.exp_action,
        reward=self.exp_reward,
        state1=self._preprocess_state(self.exp_state1),
        terminal1=self.exp_terminal1)

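# None of these snippets show the Experience container itself. A minimal
# sketch, assuming it is a plain namedtuple whose fields are exactly the
# keyword arguments passed above (the variant further below that also passes
# `extras` would need an additional field):
from collections import namedtuple

Experience = namedtuple('Experience',
                        ['state0', 'action', 'reward', 'state1', 'terminal1'])
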
def sample(self, batch_size, batch_idxs=None):
    if batch_idxs is None:
        # Draw random indexes such that we have at least a single entry before each
        # index.
        batch_idxs = sample_batch_indexes(0, self.nb_entries - 1, size=batch_size)
    batch_idxs = np.array(batch_idxs) + 1
    assert np.min(batch_idxs) >= 1
    assert np.max(batch_idxs) < self.nb_entries
    assert len(batch_idxs) == batch_size

    # Create experiences
    experiences = []
    for idx in batch_idxs:
        terminal0 = self.terminals[idx - 2] if idx >= 2 else False
        while terminal0:
            # Skip this transition because the environment was reset here. Select a new, random
            # transition and use this instead. This may cause the batch to contain the same
            # transition twice.
            idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
            terminal0 = self.terminals[idx - 2] if idx >= 2 else False
        assert 1 <= idx < self.nb_entries

        # This code is slightly complicated by the fact that subsequent observations might be
        # from different episodes. We ensure that an experience never spans multiple episodes.
        # This is probably not that important in practice but it seems cleaner.
        state0 = [self.observations[idx - 1]]
        for offset in range(0, self.window_length - 1):
            current_idx = idx - 2 - offset
            current_terminal = self.terminals[current_idx - 1] if current_idx - 1 > 0 else False
            if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
                # The previously handled observation was terminal, don't add the current one.
                # Otherwise we would leak into a different episode.
                break
            state0.insert(0, self.observations[current_idx])
        while len(state0) < self.window_length:
            state0.insert(0, zeroed_observation(state0[0]))
        action = self.actions[idx - 1]
        reward = self.rewards[idx - 1]
        terminal1 = self.terminals[idx - 1]

        # Okay, now we need to create the follow-up state. This is state0 shifted by one
        # timestep to the right. Again, we need to be careful to not include an observation
        # from the next episode if the last state is terminal.
        state1 = [np.copy(x) for x in state0[1:]]
        state1.append(self.observations[idx])

        assert len(state0) == self.window_length
        assert len(state1) == len(state0)
        experiences.append(Experience(state0=state0, action=action, reward=reward,
                                      state1=state1, terminal1=terminal1))
    assert len(experiences) == batch_size
    return experiences

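# The sample() implementations above call two helpers that are not shown in
# this section: sample_batch_indexes and zeroed_observation. A minimal sketch
# of what they need to do, assuming observations are numpy arrays or nested
# lists of them (the exact library implementations may differ):
import random

import numpy as np


def sample_batch_indexes(low, high, size):
    # Draw `size` indexes from [low, high): without replacement when the
    # range is large enough, with replacement otherwise.
    if high - low >= size:
        return random.sample(range(low, high), size)
    return [random.randint(low, high - 1) for _ in range(size)]


def zeroed_observation(observation):
    # Build an all-zero structure shaped like `observation`, used to left-pad
    # a state window at the start of an episode.
    if hasattr(observation, 'shape'):
        return np.zeros(observation.shape)
    if hasattr(observation, '__iter__'):
        return [zeroed_observation(x) for x in observation]
    return 0.0
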
def sample(self, batch_size, batch_idxs=None):
    if batch_idxs is None:
        batch_idxs = sample_batch_indexes(0, self.nb_entries - 1, size=batch_size)
    batch_idxs = np.array(batch_idxs) + 1
    experiences = []
    for idx in batch_idxs:
        terminal0 = self.terminals[idx - 2] if idx >= 2 else False
        while terminal0:
            # The environment was reset here; draw a new random transition instead.
            idx = sample_batch_indexes(1, self.nb_entries, size=1)[0]
            terminal0 = self.terminals[idx - 2] if idx >= 2 else False
        assert 1 <= idx < self.nb_entries
        state0 = [self.observations[idx - 1]]
        for offset in range(0, self.window_length - 1):
            current_idx = idx - 2 - offset
            current_terminal = self.terminals[current_idx - 1] if current_idx - 1 > 0 else False
            if current_idx < 0 or (not self.ignore_episode_boundaries and current_terminal):
                # Don't let the window cross an episode boundary.
                break
            state0.insert(0, self.observations[current_idx])
        while len(state0) < self.window_length:
            state0.insert(0, zeroed_observation(state0[0]))
        action = self.actions[idx - 1]
        reward = self.rewards[idx - 1]
        terminal1 = self.terminals[idx - 1]
        # state1 is state0 shifted one timestep to the right.
        state1 = [np.copy(x) for x in state0[1:]]
        state1.append(self.observations[idx])
        experiences.append(Experience(state0, action, reward, state1, terminal1))
    return experiences

def _get_experience(self):
    if self.hist_len == 1:
        return Experience(
            state0=self.exp_state0,  # NOTE: here state0 is always None
            action=self.exp_action,
            reward=self.exp_reward,
            state1=self._preprocessState(self.exp_state1),
            terminal1=self.exp_terminal1,
            extras=self.extras)
    else:
        return Experience(
            state0=self.exp_state0,  # NOTE: here state0 is always None
            action=self.exp_action,
            reward=self.exp_reward,
            state1=self.state_buffer,
            terminal1=self.exp_terminal1,
            extras=self.extras)

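# In the hist_len > 1 branch above, state1 comes from self.state_buffer, i.e.
# the memory keeps a rolling window of the last hist_len preprocessed frames.
# A sketch of how such a buffer is commonly maintained; the deque, its maxlen,
# and append_frame are illustrative assumptions, not taken from the snippet:
from collections import deque

import numpy as np

hist_len = 4
state_buffer = deque(maxlen=hist_len)


def append_frame(frame):
    # Push the newest frame; the deque drops the oldest one automatically
    # once hist_len frames are stored.
    state_buffer.append(np.asarray(frame))
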
def _reset_experience(self):
    # Reset the slot that collects one set of observations from the env for every action taken.
    self.experience = Experience(state0=None,
                                 action=None,
                                 reward=None,
                                 state1=None,
                                 terminal1=False)

def _reset_experience(self):
    self.experience = Experience(state0=None,
                                 action=None,
                                 reward=None,
                                 state1=None,
                                 terminal1=False)

def _get_experience(self):
    return Experience(state0=self.exp_state0,
                      action=self.exp_action,
                      reward=self.exp_reward,
                      state1=self._preprocessState(self.exp_state1),
                      terminal1=self.exp_terminal1)

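# A minimal usage sketch showing how experiences returned by sample() are
# typically turned into training arrays. The `memory` argument and
# build_batch itself are illustrative assumptions; only the Experience fields
# come from the snippets above.
import numpy as np


def build_batch(memory, batch_size=32):
    experiences = memory.sample(batch_size)
    state0_batch = np.array([e.state0 for e in experiences])
    state1_batch = np.array([e.state1 for e in experiences])
    action_batch = np.array([e.action for e in experiences])
    reward_batch = np.array([e.reward for e in experiences])
    # 0.0 marks a terminal follow-up state so the bootstrap term can be masked out.
    terminal1_batch = np.array([0.0 if e.terminal1 else 1.0 for e in experiences])
    return state0_batch, action_batch, reward_batch, state1_batch, terminal1_batch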