def estimate(self, batch: SampleBatchType) -> List[OffPolicyEstimate]:
    self.check_can_estimate_for(batch)
    estimates = []
    for sub_batch in batch.split_by_episode():
        rewards, old_prob = sub_batch["rewards"], sub_batch["action_prob"]
        new_prob = np.exp(self.action_log_likelihood(sub_batch))

        # calculate importance ratios
        p = []
        for t in range(sub_batch.count):
            if t == 0:
                pt_prev = 1.0
            else:
                pt_prev = p[t - 1]
            p.append(pt_prev * new_prob[t] / old_prob[t])

        # calculate stepwise IS estimate
        v_old = 0.0
        v_new = 0.0
        for t in range(sub_batch.count):
            v_old += rewards[t] * self.gamma**t
            v_new += p[t] * rewards[t] * self.gamma**t

        estimates.append(
            OffPolicyEstimate(
                self.name,
                {
                    "v_old": v_old,
                    "v_new": v_new,
                    "v_gain": v_new / max(1e-8, v_old),
                },
            )
        )
    return estimates
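# Hedged illustration (not part of the estimator above): a minimal, self-contained
# numpy sketch of the per-step importance-sampling math used in `estimate`. The
# arrays below are made-up toy data; `gamma`, `rewards`, `old_prob`, and `new_prob`
# stand in for the per-episode values the estimator reads out of the SampleBatch.
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 2.0])      # per-step rewards of one toy episode
old_prob = np.array([0.5, 0.4, 0.6])     # behavior policy action probabilities
new_prob = np.array([0.6, 0.5, 0.5])     # target policy action probabilities

# Cumulative importance ratios p_t = prod_{t' <= t} new_prob[t'] / old_prob[t']
p = np.cumprod(new_prob / old_prob)

discounts = gamma ** np.arange(len(rewards))
v_old = np.sum(discounts * rewards)      # behavior-policy discounted return
v_new = np.sum(discounts * p * rewards)  # stepwise IS estimate for the target policy
v_gain = v_new / max(1e-8, v_old)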
def k_fold_cv(
    batch: SampleBatchType, k: int, should_train: bool = True
) -> Generator[Tuple[List[SampleBatch], List[SampleBatch]], None, None]:
    """Utility function that returns a k-fold cross validation generator
    over episodes from the given batch.

    If the number of episodes in the batch is less than `k` or `should_train`
    is set to False, yields an empty list for train_episodes and all the
    episodes in test_episodes.

    Args:
        batch: A SampleBatch of episodes to split
        k: Number of cross-validation splits
        should_train: True by default. If False, yield [], [episodes].

    Returns:
        A tuple with two lists of SampleBatches (train_episodes, test_episodes)
    """
    episodes = batch.split_by_episode()
    n_episodes = len(episodes)
    if n_episodes < k or not should_train:
        yield [], episodes
        return
    n_fold = n_episodes // k
    for i in range(k):
        train_episodes = episodes[: i * n_fold] + episodes[(i + 1) * n_fold :]
        if i != k - 1:
            test_episodes = episodes[i * n_fold : (i + 1) * n_fold]
        else:
            # Append remaining episodes onto the last test_episodes
            test_episodes = episodes[i * n_fold :]
        yield train_episodes, test_episodes
    return
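# Hedged illustration: how the fold arithmetic in `k_fold_cv` partitions episodes.
# Plain Python lists stand in for the SampleBatch episodes. With 10 toy episodes
# and k=3, n_fold = 3 and the last fold absorbs the remainder.
episodes = list(range(10))
k = 3
n_fold = len(episodes) // k
for i in range(k):
    train = episodes[: i * n_fold] + episodes[(i + 1) * n_fold :]
    test = episodes[i * n_fold : (i + 1) * n_fold] if i != k - 1 else episodes[i * n_fold :]
    print(train, test)
# test folds: [0, 1, 2], [3, 4, 5], [6, 7, 8, 9] (last fold takes the remainder)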
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]: """Compute off-policy estimates. Args: batch: The SampleBatch to run off-policy estimation on Returns: A dict consists of the following metrics: - v_behavior: The discounted return averaged over episodes in the batch - v_behavior_std: The standard deviation corresponding to v_behavior - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - v_gain_std: The standard deviation corresponding to v_gain """ batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) estimates = {"v_behavior": [], "v_target": [], "v_gain": []} for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] log_likelihoods = compute_log_likelihoods_from_input_dict( self.policy, episode) new_prob = np.exp(convert_to_numpy(log_likelihoods)) # calculate importance ratios p = [] for t in range(episode.count): if t == 0: pt_prev = 1.0 else: pt_prev = p[t - 1] p.append(pt_prev * new_prob[t] / old_prob[t]) for t, v in enumerate(p): if t >= len(self.filter_values): self.filter_values.append(v) self.filter_counts.append(1.0) else: self.filter_values[t] += v self.filter_counts[t] += 1.0 # calculate stepwise weighted IS estimate v_behavior = 0.0 v_target = 0.0 for t in range(episode.count): v_behavior += rewards[t] * self.gamma**t w_t = self.filter_values[t] / self.filter_counts[t] v_target += p[t] / w_t * rewards[t] * self.gamma**t estimates["v_behavior"].append(v_behavior) estimates["v_target"].append(v_target) estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) estimates["v_behavior"] = np.mean(estimates["v_behavior"]) estimates["v_target_std"] = np.std(estimates["v_target"]) estimates["v_target"] = np.mean(estimates["v_target"]) estimates["v_gain_std"] = np.std(estimates["v_gain"]) estimates["v_gain"] = np.mean(estimates["v_gain"]) return estimates
def _add_to_underlying_buffer(
    self, policy_id: PolicyID, batch: SampleBatchType, **kwargs
) -> None:
    """Add a batch of experiences to the underlying buffer of a policy.

    If the storage unit is `timesteps`, cut the batch into timeslices
    before adding them to the appropriate buffer. Otherwise, let the
    underlying buffer decide how to slice batches.

    Args:
        policy_id: ID of the policy that corresponds to the underlying buffer
        batch: SampleBatch to add to the underlying buffer
        ``**kwargs``: Forward compatibility kwargs.
    """
    # Merge kwargs, overwriting standard call arguments
    kwargs = merge_dicts_with_warning(self.underlying_buffer_call_args, kwargs)

    # For the storage unit `timesteps`, the underlying buffer will
    # simply store the samples how they arrive. For sequences and
    # episodes, the underlying buffer may split them itself.
    if self.storage_unit is StorageUnit.TIMESTEPS:
        timeslices = batch.timeslices(1)
    elif self.storage_unit is StorageUnit.SEQUENCES:
        timeslices = timeslice_along_seq_lens_with_overlap(
            sample_batch=batch,
            seq_lens=batch.get(SampleBatch.SEQ_LENS)
            if self.replay_sequence_override
            else None,
            zero_pad_max_seq_len=self.replay_sequence_length,
            pre_overlap=self.replay_burn_in,
            zero_init_states=self.replay_zero_init_states,
        )
    elif self.storage_unit == StorageUnit.EPISODES:
        timeslices = []
        for eps in batch.split_by_episode():
            if (
                eps.get(SampleBatch.T)[0] == 0
                and eps.get(SampleBatch.DONES)[-1] == True  # noqa E712
            ):
                # Only add full episodes to the buffer
                timeslices.append(eps)
            else:
                if log_once("only_full_episodes"):
                    logger.info(
                        "This buffer uses episodes as a storage "
                        "unit and thus allows only full episodes "
                        "to be added to it. Some samples may be "
                        "dropped."
                    )
    elif self.storage_unit == StorageUnit.FRAGMENTS:
        timeslices = [batch]
    else:
        raise ValueError("Unknown `storage_unit={}`".format(self.storage_unit))

    for slice in timeslices:
        self.replay_buffers[policy_id].add(slice, **kwargs)
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]: """Compute off-policy estimates. Args: batch: The SampleBatch to run off-policy estimation on Returns: A dict consists of the following metrics: - v_behavior: The discounted return averaged over episodes in the batch - v_behavior_std: The standard deviation corresponding to v_behavior - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - v_gain_std: The standard deviation corresponding to v_gain """ batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) estimates = {"v_behavior": [], "v_target": [], "v_gain": []} # Calculate doubly robust OPE estimates for episode in batch.split_by_episode(): rewards, old_prob = episode["rewards"], episode["action_prob"] log_likelihoods = compute_log_likelihoods_from_input_dict( self.policy, episode ) new_prob = np.exp(convert_to_numpy(log_likelihoods)) v_behavior = 0.0 v_target = 0.0 q_values = self.model.estimate_q(episode) q_values = convert_to_numpy(q_values) v_values = self.model.estimate_v(episode) v_values = convert_to_numpy(v_values) assert q_values.shape == v_values.shape == (episode.count,) for t in reversed(range(episode.count)): v_behavior = rewards[t] + self.gamma * v_behavior v_target = v_values[t] + (new_prob[t] / old_prob[t]) * ( rewards[t] + self.gamma * v_target - q_values[t] ) v_target = v_target.item() estimates["v_behavior"].append(v_behavior) estimates["v_target"].append(v_target) estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) estimates["v_behavior"] = np.mean(estimates["v_behavior"]) estimates["v_target_std"] = np.std(estimates["v_target"]) estimates["v_target"] = np.mean(estimates["v_target"]) estimates["v_gain_std"] = np.std(estimates["v_gain"]) estimates["v_gain"] = np.mean(estimates["v_gain"]) return estimates
def add(self, batch: SampleBatchType): """Splits a SampleBatch into episodes and adds episodes to the episode buffer. Args: batch: SampleBatch to be added """ self.timesteps += batch.count episodes = batch.split_by_episode() self.episodes.extend(episodes) if len(self.episodes) > self.max_length: delta = len(self.episodes) - self.max_length # Drop oldest episodes self.episodes = self.episodes[delta:]
def _postprocess_if_needed(self, batch: SampleBatchType) -> SampleBatchType:
    if not self.ioctx.config.get("postprocess_inputs"):
        return batch

    if isinstance(batch, SampleBatch):
        out = []
        for sub_batch in batch.split_by_episode():
            out.append(self.default_policy.postprocess_trajectory(sub_batch))
        return SampleBatch.concat_samples(out)
    else:
        # TODO(ekl) this is trickier since the alignments between agent
        #  trajectories in the episode are not available any more.
        raise NotImplementedError(
            "Postprocessing of multi-agent data not implemented yet."
        )
def add(self, batch: SampleBatchType, **kwargs) -> None: """Adds a batch of experiences to this buffer. Also splits experiences into chunks of timesteps, sequences or episodes, depending on self._storage_unit. Calls self._add_single_batch. Args: batch: Batch to add to this buffer's storage. **kwargs: Forward compatibility kwargs. """ assert batch.count > 0, batch warn_replay_capacity(item=batch, num_items=self.capacity / batch.count) if (type(batch) == MultiAgentBatch and self._storage_unit != StorageUnit.TIMESTEPS): raise ValueError("Can not add MultiAgentBatch to ReplayBuffer " "with storage_unit {}" "".format(str(self._storage_unit))) if self._storage_unit == StorageUnit.TIMESTEPS: self._add_single_batch(batch, **kwargs) elif self._storage_unit == StorageUnit.SEQUENCES: timestep_count = 0 for seq_len in batch.get(SampleBatch.SEQ_LENS): start_seq = timestep_count end_seq = timestep_count + seq_len self._add_single_batch(batch[start_seq:end_seq], **kwargs) timestep_count = end_seq elif self._storage_unit == StorageUnit.EPISODES: for eps in batch.split_by_episode(): if (eps.get(SampleBatch.T)[0] == 0 and eps.get(SampleBatch.DONES)[-1] == True # noqa E712 ): # Only add full episodes to the buffer self._add_single_batch(eps, **kwargs) else: if log_once("only_full_episodes"): logger.info("This buffer uses episodes as a storage " "unit and thus allows only full episodes " "to be added to it. Some samples may be " "dropped.") elif self._storage_unit == StorageUnit.FRAGMENTS: self._add_single_batch(batch, **kwargs)
def add(self, batch: SampleBatchType, **kwargs) -> None: """Adds a batch of experiences to this buffer. Splits batch into chunks of timesteps, sequences or episodes, depending on `self._storage_unit`. Calls `self._add_single_batch` to add resulting slices to the buffer storage. Args: batch: Batch to add. ``**kwargs``: Forward compatibility kwargs. """ if not batch.count > 0: return warn_replay_capacity(item=batch, num_items=self.capacity / batch.count) if self.storage_unit == StorageUnit.TIMESTEPS: timeslices = batch.timeslices(1) for t in timeslices: self._add_single_batch(t, **kwargs) elif self.storage_unit == StorageUnit.SEQUENCES: timestep_count = 0 for seq_len in batch.get(SampleBatch.SEQ_LENS): start_seq = timestep_count end_seq = timestep_count + seq_len self._add_single_batch(batch[start_seq:end_seq], **kwargs) timestep_count = end_seq elif self.storage_unit == StorageUnit.EPISODES: for eps in batch.split_by_episode(): if (eps.get(SampleBatch.T)[0] == 0 and eps.get(SampleBatch.DONES)[-1] == True # noqa E712 ): # Only add full episodes to the buffer self._add_single_batch(eps, **kwargs) else: if log_once("only_full_episodes"): logger.info("This buffer uses episodes as a storage " "unit and thus allows only full episodes " "to be added to it. Some samples may be " "dropped.") elif self.storage_unit == StorageUnit.FRAGMENTS: self._add_single_batch(batch, **kwargs)
def train_test_split(
    batch: SampleBatchType,
    train_test_split_val: float = 0.0,
    k: int = 0,
) -> Generator[Tuple[List[SampleBatch], List[SampleBatch]], None, None]:
    """Utility function that returns either a train/test split or
    a k-fold cross validation generator over episodes from the given batch.

    By default, `train_test_split_val` and `k` are both 0, in which case
    eval_batch is set to the whole batch and train_batch to an empty
    SampleBatch.

    Args:
        batch: A SampleBatch of episodes to split
        train_test_split_val: Split the batch into a training batch with
            `train_test_split_val * n_episodes` episodes and an evaluation batch
            with `(1 - train_test_split_val) * n_episodes` episodes. If not
            specified, use `k` for k-fold cross validation instead.
        k: k-fold cross validation for training model and evaluating OPE.

    Returns:
        A tuple with two lists of episode SampleBatches
        (eval_episodes, train_episodes)
    """
    if not train_test_split_val and not k:
        logger.warning(
            "`train_test_split_val` and `k` are both 0; "
            "not generating training batch"
        )
        yield [batch], [SampleBatch()]
        return
    episodes = batch.split_by_episode()
    n_episodes = len(episodes)
    # Train-test split
    if train_test_split_val:
        train_episodes = episodes[: int(n_episodes * train_test_split_val)]
        eval_episodes = episodes[int(n_episodes * train_test_split_val) :]
        yield eval_episodes, train_episodes
        return
    # k-fold cv
    assert n_episodes >= k, f"Not enough eval episodes in batch for {k}-fold cv!"
    n_fold = n_episodes // k
    for i in range(k):
        train_episodes = episodes[: i * n_fold] + episodes[(i + 1) * n_fold :]
        if i != k - 1:
            eval_episodes = episodes[i * n_fold : (i + 1) * n_fold]
        else:
            # Append remaining episodes onto the last eval_episodes
            eval_episodes = episodes[i * n_fold :]
        yield eval_episodes, train_episodes
    return
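# Hedged illustration: the train/test branch of `train_test_split` with
# train_test_split_val=0.8 on 10 toy episodes: the first 8 episodes become the
# training episodes and the remaining 2 the evaluation episodes.
episodes = list(range(10))
val = 0.8
train_episodes = episodes[: int(len(episodes) * val)]
eval_episodes = episodes[int(len(episodes) * val) :]
# train_episodes == [0, 1, 2, 3, 4, 5, 6, 7], eval_episodes == [8, 9]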
def estimate(self, batch: SampleBatchType) -> List[OffPolicyEstimate]:
    self.check_can_estimate_for(batch)
    estimates = []
    for sub_batch in batch.split_by_episode():
        rewards, old_prob = sub_batch["rewards"], sub_batch["action_prob"]
        new_prob = np.exp(self.action_log_likelihood(sub_batch))

        # calculate importance ratios
        p = []
        for t in range(sub_batch.count):
            if t == 0:
                pt_prev = 1.0
            else:
                pt_prev = p[t - 1]
            p.append(pt_prev * new_prob[t] / old_prob[t])
        for t, v in enumerate(p):
            if t >= len(self.filter_values):
                self.filter_values.append(v)
                self.filter_counts.append(1.0)
            else:
                self.filter_values[t] += v
                self.filter_counts[t] += 1.0

        # calculate stepwise weighted IS estimate
        v_old = 0.0
        v_new = 0.0
        for t in range(sub_batch.count):
            v_old += rewards[t] * self.gamma ** t
            w_t = self.filter_values[t] / self.filter_counts[t]
            v_new += p[t] / w_t * rewards[t] * self.gamma ** t

        estimates.append(
            OffPolicyEstimate(
                self.name,
                {
                    "v_old": v_old,
                    "v_new": v_new,
                    "v_gain": v_new / max(1e-8, v_old),
                },
            )
        )
    return estimates
def estimate(self, batch: SampleBatchType) -> Dict[str, Any]: """Compute off-policy estimates. Args: batch: The SampleBatch to run off-policy estimation on Returns: A dict consists of the following metrics: - v_behavior: The discounted return averaged over episodes in the batch - v_behavior_std: The standard deviation corresponding to v_behavior - v_target: The estimated discounted return for `self.policy`, averaged over episodes in the batch - v_target_std: The standard deviation corresponding to v_target - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes - v_gain_std: The standard deviation corresponding to v_gain """ batch = self.convert_ma_batch_to_sample_batch(batch) self.check_action_prob_in_batch(batch) estimates = {"v_behavior": [], "v_target": [], "v_gain": []} # Calculate Direct Method OPE estimates for episode in batch.split_by_episode(): rewards = episode["rewards"] v_behavior = 0.0 v_target = 0.0 for t in range(episode.count): v_behavior += rewards[t] * self.gamma ** t init_step = episode[0:1] v_target = self.model.estimate_v(init_step) v_target = convert_to_numpy(v_target).item() estimates["v_behavior"].append(v_behavior) estimates["v_target"].append(v_target) estimates["v_gain"].append(v_target / max(v_behavior, 1e-8)) estimates["v_behavior_std"] = np.std(estimates["v_behavior"]) estimates["v_behavior"] = np.mean(estimates["v_behavior"]) estimates["v_target_std"] = np.std(estimates["v_target"]) estimates["v_target"] = np.mean(estimates["v_target"]) estimates["v_gain_std"] = np.std(estimates["v_gain"]) estimates["v_gain"] = np.mean(estimates["v_gain"]) return estimates
def _add_to_underlying_buffer(
    self, policy_id: PolicyID, batch: SampleBatchType, **kwargs
) -> None:
    """Add a batch of experiences to the underlying buffer of a policy.

    If the storage unit is `timesteps`, cut the batch into timeslices
    before adding them to the appropriate buffer. Otherwise, let the
    underlying buffer decide how to slice batches.

    Args:
        policy_id: ID of the policy that corresponds to the underlying buffer
        batch: SampleBatch to add to the underlying buffer
        ``**kwargs``: Forward compatibility kwargs.
    """
    # Merge kwargs, overwriting standard call arguments
    kwargs = merge_dicts_with_warning(self.underlying_buffer_call_args, kwargs)

    # For the storage unit `timesteps`, the underlying buffer will
    # simply store the samples how they arrive. For sequences and
    # episodes, the underlying buffer may split them itself.
    if self.storage_unit is StorageUnit.TIMESTEPS:
        timeslices = batch.timeslices(1)
    elif self.storage_unit is StorageUnit.SEQUENCES:
        timeslices = timeslice_along_seq_lens_with_overlap(
            sample_batch=batch,
            seq_lens=batch.get(SampleBatch.SEQ_LENS)
            if self.replay_sequence_override
            else None,
            zero_pad_max_seq_len=self.replay_sequence_length,
            pre_overlap=self.replay_burn_in,
            zero_init_states=self.replay_zero_init_states,
        )
    elif self.storage_unit == StorageUnit.EPISODES:
        timeslices = []
        for eps in batch.split_by_episode():
            if (
                eps.get(SampleBatch.T)[0] == 0
                and eps.get(SampleBatch.DONES)[-1] == True  # noqa E712
            ):
                # Only add full episodes to the buffer
                timeslices.append(eps)
            else:
                if log_once("only_full_episodes"):
                    logger.info(
                        "This buffer uses episodes as a storage "
                        "unit and thus allows only full episodes "
                        "to be added to it. Some samples may be "
                        "dropped."
                    )
    elif self.storage_unit == StorageUnit.FRAGMENTS:
        timeslices = [batch]
    else:
        raise ValueError("Unknown `storage_unit={}`".format(self.storage_unit))

    for slice in timeslices:
        # If SampleBatch has prio-replay weights, average
        # over these to use as a weight for the entire
        # sequence.
        if self.replay_mode is ReplayMode.INDEPENDENT:
            if "weights" in slice and len(slice["weights"]):
                weight = np.mean(slice["weights"])
            else:
                weight = None
            if "weight" in kwargs and weight is not None:
                if log_once("overwrite_weight"):
                    logger.warning(
                        "Adding batches with column "
                        "`weights` to this buffer while "
                        "providing weights as a call argument "
                        "to the add method results in the "
                        "column being overwritten."
                    )
            kwargs = {"weight": weight, **kwargs}
        else:
            if "weight" in kwargs:
                if log_once("lockstep_no_weight_allowed"):
                    logger.warning(
                        "Setting weights for batches in "
                        "lockstep mode is not allowed. "
                        "Weights are being ignored."
                    )
            kwargs = {**kwargs, "weight": None}
        self.replay_buffers[policy_id].add(slice, **kwargs)