def one_step_discounted_return(rewards, values, step_types, discounts):
    """Calculate the one step discounted return for the first T-1 steps.

    The return is ``next_reward + next_discount * next_value`` if the current
    step is not the last step; otherwise the return is set to
    ``current_discount * current_value``.

    Note: Input tensors must be time major.

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T, B] (or [T]) representing values.
        step_types (Tensor): shape is [T, B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
    Returns:
        A tensor with shape [T-1, B] (or [T-1]) representing the discounted
        returns.
    """
    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    discounted_values = discounts * values
    rets = (1 - is_lasts[:-1]) * (rewards[1:] + discounted_values[1:]) + \
        is_lasts[:-1] * discounted_values[:-1]
    return rets.detach()
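

# Illustrative usage sketch (not part of the library). The helper name and the
# constant rewards/values/discounts below are made up for demonstration; it
# relies on the module's existing `torch`, `StepType` and
# `one_step_discounted_return` names and the [T, B] time-major convention.
def _example_one_step_discounted_return():
    T, B = 4, 2
    rewards = torch.ones(T, B)
    values = torch.full((T, B), 10.0)
    discounts = torch.full((T, B), 0.99)
    step_types = torch.full((T, B), int(StepType.MID), dtype=torch.int32)
    step_types[2, 1] = int(StepType.LAST)  # trajectory 1 ends at step 2
    rets = one_step_discounted_return(rewards, values, step_types, discounts)
    # rets[t] = r_{t+1} + gamma_{t+1} * V_{t+1}, except where step t is LAST,
    # in which case rets[t] = gamma_t * V_t.
    assert rets.shape == (T - 1, B)
    assert torch.allclose(rets[2, 1], discounts[2, 1] * values[2, 1])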
def discounted_return(rewards, values, step_types, discounts, time_major=True):
    """Computes discounted return for the first T-1 steps.

    The difference between this function and the one in
    tf_agents.utils.value_ops is that the accumulated_discounted_reward is
    replaced by value for is_last steps in this function.

    ```
    Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} + gamma^(T-t+1)*final_value.
    ```

    Define abbreviations:
        (B) batch size representing number of trajectories
        (T) number of steps per trajectory

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T, B] (or [T]) representing values.
        step_types (Tensor): shape is [T, B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        time_major (bool): Whether input tensors are time major. False means
            input tensors have shape [B, T].
    Returns:
        A tensor with shape [T-1, B] (or [T-1]) representing the discounted
        returns. Shape is [B, T-1] when time_major is false.
    """
    if not time_major:
        discounts = discounts.transpose(0, 1)
        rewards = rewards.transpose(0, 1)
        values = values.transpose(0, 1)
        step_types = step_types.transpose(0, 1)

    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    rets = torch.zeros_like(values)
    rets[-1] = values[-1]
    with torch.no_grad():
        for t in reversed(range(rewards.shape[0] - 1)):
            acc_value = rets[t + 1] * discounts[t + 1] + rewards[t + 1]
            rets[t] = is_lasts[t] * values[t] + (1 - is_lasts[t]) * acc_value
    rets = rets[:-1]

    if not time_major:
        rets = rets.transpose(0, 1)

    return rets.detach()
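

# Illustrative usage sketch (not part of the library). The helper name and the
# constants below are made up; it relies on the module's `torch`, `StepType`
# and `discounted_return` names. With a constant reward r, constant discount
# gamma and no LAST steps, the backward recursion gives
# rets[t] = r * (1 + gamma + ... + gamma^(T-2-t)) + gamma^(T-1-t) * V_{T-1}.
def _example_discounted_return():
    T, B = 3, 1
    gamma, r, v = 0.9, 1.0, 10.0
    rewards = torch.full((T, B), r)
    values = torch.full((T, B), v)
    discounts = torch.full((T, B), gamma)
    step_types = torch.full((T, B), int(StepType.MID), dtype=torch.int32)
    rets = discounted_return(rewards, values, step_types, discounts)
    # rets[1] = r + gamma * v; rets[0] = r + gamma * (r + gamma * v)
    assert torch.allclose(rets[1, 0], torch.tensor(r + gamma * v))
    assert torch.allclose(rets[0, 0], torch.tensor(r + gamma * (r + gamma * v)))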
def _sample(a, noise):
    # With probability `epsilon_greedy` (a variable from the enclosing scope),
    # add exploration noise to a batch entry's action; otherwise keep the
    # action unchanged. `epsilon_greedy >= 1.0` means noise is always added.
    if epsilon_greedy >= 1.0:
        return a + noise
    else:
        choose_random_action = (torch.rand(a.shape[:1]) < epsilon_greedy)
        return torch.where(
            common.expand_dims_as(choose_random_action, a), a + noise, a)
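

# Standalone sketch of the epsilon-greedy noise pattern above (illustrative;
# in the original, `epsilon_greedy` is a closure variable, here it is an
# explicit argument, and the helper name is hypothetical). Each batch entry
# independently receives the exploration noise with probability
# `epsilon_greedy`.
def _epsilon_greedy_noise_sketch(action, noise, epsilon_greedy):
    if epsilon_greedy >= 1.0:
        return action + noise
    choose = torch.rand(action.shape[:1]) < epsilon_greedy  # [B]
    choose = choose.reshape(-1, *([1] * (action.ndim - 1)))  # [B, 1, ..., 1]
    return torch.where(choose, action + noise, action)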
def read(self, keys, scale=None):
    """Read from memory.

    Read the memory for the given keys. For each key in keys we will get one
    result as `r = sum_i M[i] a[i]` where `M[i]` is the memory content at
    location i and `a[i]` is the attention weight for the key at location i.
    `a` is calculated as the softmax of a scaled similarity between the key
    and each memory content:
    `a[i] = exp(scale*sim[i]) / (sum_j exp(scale*sim[j]))`

    Args:
        keys (Tensor): shape[-1] is dim. For single key read, the shape is
            (batch_size, dim). For multiple key read, the shape is
            (batch_size, k, dim), where k is the number of keys.
        scale (None|float|Tensor): shape is () or keys.shape[:-1]. The cosine
            similarities are multiplied with `scale` before softmax is
            applied. If None, use the scale provided at constructor.
    Returns:
        result (Tensor): shape is same as keys. result[..., i] is the read
            result for the corresponding key.
    """
    if not self._built:
        self.build(keys.shape[0])
    assert 2 <= len(keys.shape) <= 3
    assert keys.shape[0] == self._batch_size
    assert keys.shape[-1] == self.dim

    if scale is None:
        scale = self._scale
    else:
        if isinstance(scale, (int, float)):
            pass
        else:  # assuming it's Tensor
            scale = expand_dims_as(scale, keys)
    sim = layers.dot([keys, self._memory],
                     axes=-1,
                     normalize=self._normalize)
    sim = sim * scale
    attention = activations.softmax(sim)
    result = layers.dot([attention, self._memory], axes=(-1, 1))

    if len(sim.shape) > 2:  # multiple read keys
        usage = tf.reduce_sum(attention,
                              axis=tf.range(1, len(sim.shape) - 1))
    else:
        usage = attention

    if self._snapshot_only:
        self._usage.assign_add(usage)
    else:
        self._usage = self._usage + usage

    return result
def generalized_advantage_estimation(rewards,
                                     values,
                                     step_types,
                                     discounts,
                                     td_lambda=1.0,
                                     time_major=True):
    """Computes generalized advantage estimation (GAE) for the first T-1 steps.

    For theory, see "High-Dimensional Continuous Control Using Generalized
    Advantage Estimation" by John Schulman, Philipp Moritz et al. See
    https://arxiv.org/abs/1506.02438 for the full paper.

    The difference between this function and the one in
    tf_agents.utils.value_ops is that the accumulated_td is reset to 0 for
    is_last steps in this function.

    Define abbreviations:
        (B) batch size representing number of trajectories
        (T) number of steps per trajectory

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T, B] (or [T]) representing values.
        step_types (Tensor): shape is [T, B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        td_lambda (float): A scalar in [0, 1]. It's used for variance
            reduction in temporal difference.
        time_major (bool): Whether input tensors are time major. False means
            input tensors have shape [B, T].
    Returns:
        A tensor with shape [T-1, B] representing advantages. Shape is
        [B, T-1] when time_major is false.
    """
    if not time_major:
        discounts = discounts.transpose(0, 1)
        rewards = rewards.transpose(0, 1)
        values = values.transpose(0, 1)
        step_types = step_types.transpose(0, 1)

    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    weighted_discounts = discounts[1:] * td_lambda

    advs = torch.zeros_like(values)
    delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
    with torch.no_grad():
        for t in reversed(range(rewards.shape[0] - 1)):
            advs[t] = (1 - is_lasts[t]) * \
                (delta[t] + weighted_discounts[t] * advs[t + 1])
    advs = advs[:-1]

    if not time_major:
        advs = advs.transpose(0, 1)

    return advs.detach()
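

# Illustrative usage sketch (not part of the library). The helper name and the
# constants below are made up; it relies on the module's `torch`, `StepType`
# and `generalized_advantage_estimation` names. With constant reward, value
# and discount and no LAST steps, every TD error delta is the same and the
# advantages reduce to a geometric sum in gamma * td_lambda.
def _example_generalized_advantage_estimation():
    T, B = 3, 1
    gamma, lam, r, v = 0.9, 0.95, 1.0, 5.0
    rewards = torch.full((T, B), r)
    values = torch.full((T, B), v)
    discounts = torch.full((T, B), gamma)
    step_types = torch.full((T, B), int(StepType.MID), dtype=torch.int32)
    advs = generalized_advantage_estimation(
        rewards, values, step_types, discounts, td_lambda=lam)
    delta = r + gamma * v - v
    # advs[1] = delta; advs[0] = delta * (1 + gamma * lam)
    assert torch.allclose(advs[1, 0], torch.tensor(delta))
    assert torch.allclose(advs[0, 0], torch.tensor(delta * (1 + gamma * lam)))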
def read(self, keys, scale=None):
    r"""Read from memory.

    Read the memory for the given keys. For each key in keys we will get one
    result as :math:`r = \sum_i M_i a_i` where :math:`M_i` is the memory
    content at location i and :math:`a_i` is the attention weight for the key
    at location i. :math:`a` is calculated as the softmax of a scaled
    similarity between the key and each memory content:
    :math:`a_i = \frac{\exp(scale \cdot sim_i)}{\sum_j \exp(scale \cdot sim_j)}`

    Args:
        keys (Tensor): shape[-1] is dim. For single key read, the shape is
            (batch_size, dim). For multiple key read, the shape is
            (batch_size, k, dim), where k is the number of keys.
        scale (None|float|Tensor): shape is () or keys.shape[:-1]. The cosine
            similarities are multiplied with ``scale`` before softmax is
            applied. If None, use the scale provided at constructor.
    Returns:
        result (Tensor): shape is same as keys. result[..., i] is the read
            result for the corresponding key.
    """
    if not self._built:
        self.build(keys.shape[0])
    assert 2 <= keys.ndim <= 3
    assert keys.shape[0] == self._batch_size
    assert keys.shape[-1] == self.dim
    multikey = keys.ndim == 3
    if not multikey:
        keys = keys.unsqueeze(1)

    # B: batch size, K: number of keys, N: memory size, D: dimension of the memory
    sim = torch.bmm(keys, self._memory.transpose(1, 2))  # [B, K, N]
    if self._normalize:
        key_norm = 1 / (1e-30 + keys.norm(dim=2))  # [B, K]
        mem_norm = 1 / (1e-30 + self._memory.norm(dim=2))  # [B, N]
        key_norm = key_norm.unsqueeze(-1)  # [B, K, 1]
        mem_norm = mem_norm.unsqueeze(1)  # [B, 1, N]
        sim = sim * key_norm * mem_norm

    if scale is None:
        scale = self._scale
    else:
        if isinstance(scale, (int, float)):
            pass
        else:  # assuming it's Tensor
            scale = expand_dims_as(scale, sim)
    sim = sim * scale  # [B, K, N]
    attention = F.softmax(sim, dim=2)
    result = torch.bmm(attention, self._memory)  # [B, K, D]

    if multikey:
        usage = attention.sum(1)  # [B, N]
    else:
        usage = attention.squeeze(1)

    if self._snapshot_only:
        self._usage.add_(usage.detach())
    else:
        self._usage = self._usage + usage

    if not multikey:
        result = result.squeeze(1)
    return result
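

# Standalone sketch of the scaled-attention read above (illustrative; the
# class holding `self._memory` is not shown here, so the memory is passed in
# as a plain tensor and the helper name is hypothetical). It reproduces the
# [B, K, N] similarity -> softmax -> weighted-sum pattern for the multi-key
# case, relying on the module's existing `torch` and `F` imports.
def _attention_read_sketch(keys, memory, scale=1.0, normalize=True):
    # keys: [B, K, D], memory: [B, N, D]
    sim = torch.bmm(keys, memory.transpose(1, 2))  # [B, K, N]
    if normalize:
        key_norm = (1 / (1e-30 + keys.norm(dim=2))).unsqueeze(-1)  # [B, K, 1]
        mem_norm = (1 / (1e-30 + memory.norm(dim=2))).unsqueeze(1)  # [B, 1, N]
        sim = sim * key_norm * mem_norm
    attention = F.softmax(scale * sim, dim=2)  # [B, K, N]
    return torch.bmm(attention, memory)  # [B, K, D]

# e.g. keys = torch.randn(2, 3, 8) and memory = torch.randn(2, 16, 8) give a
# result of shape [2, 3, 8]: one read vector per key.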
def _update(tgt, updt):
    # Scatter the rows of `updt` into `tgt` along the first dimension at the
    # positions given by `gather_indices` (a variable from the enclosing
    # scope): row i of `updt` is written to `tgt[gather_indices[i]]`.
    scatter_indices = common.expand_dims_as(gather_indices, updt)
    scatter_indices = scatter_indices.expand_as(updt)
    return tgt.scatter(0, scatter_indices, updt)
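

# Standalone sketch of the scatter pattern above (illustrative; in the
# original, `gather_indices` comes from the enclosing scope, here it is an
# explicit argument and the helper name is hypothetical). Row i of `updt` is
# written into row `indices[i]` of a copy of `tgt`; all other rows are kept.
def _scatter_rows_sketch(tgt, updt, indices):
    # indices: [n], updt: [n, ...], tgt: [m, ...] with the same trailing dims.
    idx = indices.reshape(-1, *([1] * (updt.ndim - 1))).expand_as(updt)
    return tgt.scatter(0, idx, updt)

# e.g. tgt = torch.zeros(5, 3), updt = torch.ones(2, 3) and
# indices = torch.tensor([1, 4]) make rows 1 and 4 of the result all ones.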