Example #1
def one_step_discounted_return(rewards, values, step_types, discounts):
    """Calculate the one step discounted return  for the first T-1 steps.

    return = next_reward + next_discount * next_value if is not the last step;
    otherwise will set return = current_discount * current_value.

    Note: Input tensors must be time major.

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T,B] (or [T]) representing values.
        step_types (Tensor): shape is [T,B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
    Returns:
        A tensor with shape [T-1, B] (or [T-1]) representing the discounted
        returns.
    """
    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    discounted_values = discounts * values
    rets = (1 - is_lasts[:-1]) * (rewards[1:] + discounted_values[1:]) + \
                 is_lasts[:-1] * discounted_values[:-1]
    return rets.detach()
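A minimal self-contained sketch of the same one-step return computation, for a single [T]-shaped trajectory with is_lasts supplied directly as a 0/1 tensor instead of being derived from step_types (an assumption made purely so the snippet runs on its own):

import torch

# Toy trajectory of length T=4; the episode ends at step 2 (is_lasts[2] == 1).
rewards = torch.tensor([0.0, 1.0, 2.0, 0.5])
values = torch.tensor([1.0, 1.5, 0.8, 1.2])
discounts = torch.tensor([0.99, 0.99, 0.0, 0.99])
is_lasts = torch.tensor([0.0, 0.0, 1.0, 0.0])

discounted_values = discounts * values
# Non-last steps: next_reward + next_discount * next_value;
# last steps: current_discount * current_value.
rets = (1 - is_lasts[:-1]) * (rewards[1:] + discounted_values[1:]) + \
       is_lasts[:-1] * discounted_values[:-1]
print(rets.shape)  # torch.Size([3]), i.e. [T-1]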
Example #2
def discounted_return(rewards, values, step_types, discounts, time_major=True):
    """Computes discounted return for the first T-1 steps.

    The difference between this function and the one in tf_agents.utils.value_ops
    is that the accumulated_discounted_reward is replaced by the value at is_last
    steps in this function.

    ```
    Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} + gamma^(T-t+1)*final_value.
    ```

    Define abbreviations:
    (B) batch size representing number of trajectories
    (T) number of steps per trajectory

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T,B] (or [T]) representing values.
        step_types (Tensor): shape is [T,B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        time_major (bool): Whether input tensors are time major.
            False means input tensors have shape [B, T].

    Returns:
        A tensor with shape [T-1, B] (or [T-1]) representing the discounted
        returns. Shape is [B, T-1] when time_major is false.
    """
    if not time_major:
        discounts = discounts.transpose(0, 1)
        rewards = rewards.transpose(0, 1)
        values = values.transpose(0, 1)
        step_types = step_types.transpose(0, 1)

    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    rets = torch.zeros_like(values)
    rets[-1] = values[-1]

    with torch.no_grad():
        for t in reversed(range(rewards.shape[0] - 1)):
            acc_value = rets[t + 1] * discounts[t + 1] + rewards[t + 1]
            rets[t] = is_lasts[t] * values[t] + (1 - is_lasts[t]) * acc_value

    rets = rets[:-1]

    if not time_major:
        rets = rets.transpose(0, 1)

    return rets.detach()
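A self-contained sketch of the same backward recursion for a single [T]-shaped trajectory; is_lasts is given directly as a 0/1 tensor and batching is dropped (both assumptions for brevity), so the bootstrap from the final value estimate is easy to check by hand:

import torch

rewards = torch.tensor([0.0, 1.0, 1.0, 1.0])
values = torch.tensor([2.5, 2.0, 1.5, 1.0])
discounts = torch.tensor([0.9, 0.9, 0.9, 0.9])
is_lasts = torch.tensor([0.0, 0.0, 0.0, 0.0])  # no episode boundary here

rets = torch.zeros_like(values)
rets[-1] = values[-1]  # bootstrap from the final value estimate
for t in reversed(range(rewards.shape[0] - 1)):
    acc_value = rets[t + 1] * discounts[t + 1] + rewards[t + 1]
    # At is_last steps the accumulated return is replaced by the value estimate.
    rets[t] = is_lasts[t] * values[t] + (1 - is_lasts[t]) * acc_value
rets = rets[:-1]
# rets[0] == 1 + 0.9 * (1 + 0.9 * (1 + 0.9 * 1.0)) == 3.439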
Example #3
def _sample(a, noise):
    if epsilon_greedy >= 1.0:
        return a + noise
    else:
        choose_random_action = (torch.rand(a.shape[:1]) <
                                epsilon_greedy)
        return torch.where(
            common.expand_dims_as(choose_random_action, a),
            a + noise, a)
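The closure above captures epsilon_greedy from its enclosing scope. A standalone sketch of the same masking pattern, with epsilon_greedy passed in explicitly and the [B] mask broadcast by a plain reshape instead of common.expand_dims_as (both assumptions to keep the snippet self-contained):

import torch

def sample_with_noise(a, noise, epsilon_greedy):
    if epsilon_greedy >= 1.0:
        return a + noise
    # One Bernoulli(epsilon_greedy) draw per batch element.
    choose_random_action = torch.rand(a.shape[:1]) < epsilon_greedy
    # Reshape the [B] mask to [B, 1, ..., 1] so it broadcasts against a.
    mask = choose_random_action.reshape(-1, *([1] * (a.ndim - 1)))
    return torch.where(mask, a + noise, a)

a = torch.zeros(5, 3)
noise = 0.1 * torch.randn(5, 3)
print(sample_with_noise(a, noise, epsilon_greedy=0.4))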
Example #4
    def read(self, keys, scale=None):
        """Read from memory.

        Read the memory given the keys. For each key in keys we will get one
        result as `r = sum_i M[i] a[i]` where `M[i]` is the memory content
        at location i and `a[i]` is the attention weight for the key at location i.
        `a` is calculated as the softmax of a scaled similarity between the key
        and each memory content: `a[i] = exp(scale*sim[i]) / sum_j exp(scale*sim[j])`

        Args:
            keys (Tensor): shape[-1] is dim.
              For single key read, the shape is (batch_size, dim).
              For multiple key read, the shape is (batch_size, k, dim), where
              k is the number of keys.
            scale (None|float|Tensor): shape is () or keys.shape[:-1]. The
              cosine similarities are multiplied with `scale` before softmax
              is applied. If None, use the scale provided to the constructor.
        Returns:
            result (Tensor): shape is same as keys. result[..., i] is the read
              result for the corresponding key.

        """
        if not self._built:
            self.build(keys.shape[0])
        assert 2 <= len(keys.shape) <= 3
        assert keys.shape[0] == self._batch_size
        assert keys.shape[-1] == self.dim

        if scale is None:
            scale = self._scale
        else:
            if isinstance(scale, (int, float)):
                pass
            else:  # assuming it's Tensor
                scale = expand_dims_as(scale, keys)
        sim = layers.dot([keys, self._memory],
                         axes=-1,
                         normalize=self._normalize)
        sim = sim * scale

        attention = activations.softmax(sim)
        result = layers.dot([attention, self._memory], axes=(-1, 1))

        if len(sim.shape) > 2:  # multiple read keys
            usage = tf.reduce_sum(attention,
                                  axis=tf.range(1,
                                                len(sim.shape) - 1))
        else:
            usage = attention

        if self._snapshot_only:
            self._usage.assign_add(usage)
        else:
            self._usage = self._usage + usage

        return result
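The method above is written against TensorFlow/Keras. A minimal PyTorch rendering of the documented read equation for a single key, using plain tensor ops and made-up placeholder shapes (an illustrative sketch, not the class's actual implementation):

import torch
import torch.nn.functional as F

memory = torch.randn(10, 4)  # N=10 slots, dim D=4
key = torch.randn(4)
scale = 5.0

# Cosine similarity between the key and every memory slot: sim has shape [N].
sim = F.cosine_similarity(memory, key.unsqueeze(0), dim=1)
attention = F.softmax(scale * sim, dim=0)  # a[i] = exp(scale*sim[i]) / sum_j exp(scale*sim[j])
result = attention @ memory                # r = sum_i a[i] * M[i], shape [D]
usage = attention                          # single-key read: usage is just the attention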
Example #5
def generalized_advantage_estimation(rewards,
                                     values,
                                     step_types,
                                     discounts,
                                     td_lambda=1.0,
                                     time_major=True):
    """Computes generalized advantage estimation (GAE) for the first T-1 steps.

    For theory, see
    "High-Dimensional Continuous Control Using Generalized Advantage Estimation"
    by John Schulman, Philipp Moritz et al.
    See https://arxiv.org/abs/1506.02438 for full paper.

    The difference between this function and the one in tf_agents.utils.value_ops
    is that the accumulated_td is reset to 0 at is_last steps in this function.

    Define abbreviations:
        (B) batch size representing number of trajectories
        (T) number of steps per trajectory

    Args:
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T,B] (or [T]) representing values.
        step_types (Tensor): shape is [T,B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        td_lambda (float): A scalar in [0, 1]. It is used for variance
            reduction in temporal difference learning.
        time_major (bool): Whether input tensors are time major.
            False means input tensors have shape [B, T].

    Returns:
        A tensor with shape [T-1, B] representing advantages. Shape is [B, T-1]
        when time_major is false.
    """

    if not time_major:
        discounts = discounts.transpose(0, 1)
        rewards = rewards.transpose(0, 1)
        values = values.transpose(0, 1)
        step_types = step_types.transpose(0, 1)

    assert values.shape[0] >= 2, ("The sequence length needs to be "
                                  "at least 2. Got {s}".format(
                                      s=values.shape[0]))

    is_lasts = (step_types == StepType.LAST).to(dtype=torch.float32)
    is_lasts = common.expand_dims_as(is_lasts, values)
    discounts = common.expand_dims_as(discounts, values)

    weighted_discounts = discounts[1:] * td_lambda

    advs = torch.zeros_like(values)
    delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]

    with torch.no_grad():
        for t in reversed(range(rewards.shape[0] - 1)):
            advs[t] = (1 - is_lasts[t]) * \
                      (delta[t] + weighted_discounts[t] * advs[t + 1])
        advs = advs[:-1]

    if not time_major:
        advs = advs.transpose(0, 1)

    return advs.detach()
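A self-contained sketch of the same GAE recursion for a single [T]-shaped trajectory (is_lasts given directly as a 0/1 tensor, no batching; both assumptions). With td_lambda=1.0 and no episode boundary, the advantages equal the discounted returns from the earlier sketch minus values[:-1]:

import torch

rewards = torch.tensor([0.0, 1.0, 1.0, 1.0])
values = torch.tensor([2.5, 2.0, 1.5, 1.0])
discounts = torch.tensor([0.9, 0.9, 0.9, 0.9])
is_lasts = torch.tensor([0.0, 0.0, 0.0, 0.0])
td_lambda = 1.0

weighted_discounts = discounts[1:] * td_lambda
delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]  # one-step TD errors

advs = torch.zeros_like(values)
for t in reversed(range(rewards.shape[0] - 1)):
    # The accumulated TD error is reset to 0 at is_last steps.
    advs[t] = (1 - is_lasts[t]) * (delta[t] + weighted_discounts[t] * advs[t + 1])
advs = advs[:-1]
# advs[0] == 3.439 - 2.5 == 0.939 (discounted return minus the value estimate)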
Example #6
    def read(self, keys, scale=None):
        r"""Read from memory.

        Read the memory given the keys. For each key in keys we will get one
        result as :math:`r = \sum_i M_i a_i` where :math:`M_i` is the memory content
        at location i and :math:`a_i` is the attention weight for the key at location i.
        :math:`a` is calculated as the softmax of a scaled similarity between the key
        and each memory content: :math:`a_i = \exp(scale*sim_i) / \sum_j \exp(scale*sim_j)`

        Args:
            keys (Tensor): shape[-1] is dim.
              For single key read, the shape is (batch_size, dim).
              For multiple key read, the shape is (batch_size, k, dim), where
              k is the number of keys.
            scale (None|float|Tensor): shape is () or keys.shape[:-1]. The
              cosine similarities are multiplied with ``scale`` before softmax
              is applied. If None, use the scale provided to the constructor.
        Returns:
            result (Tensor): shape is same as keys. result[..., i] is the read
              result for the corresponding key.

        """
        if not self._built:
            self.build(keys.shape[0])
        assert 2 <= keys.ndim <= 3
        assert keys.shape[0] == self._batch_size
        assert keys.shape[-1] == self.dim

        multikey = keys.ndim == 3
        if not multikey:
            keys = keys.unsqueeze(1)

        # B: batch size, K: number of keys, N: memory size, D: dimension of the memory
        sim = torch.bmm(keys, self._memory.transpose(1, 2))  # [B, K, N]
        if self._normalize:
            key_norm = 1 / (1e-30 + keys.norm(dim=2))  # [B, K]
            mem_norm = 1 / (1e-30 + self._memory.norm(dim=2))  # [B, N]
            key_norm = key_norm.unsqueeze(-1)  # [B, K, 1]
            mem_norm = mem_norm.unsqueeze(1)  # [B, 1, N]
            sim = sim * key_norm * mem_norm

        if scale is None:
            scale = self._scale
        else:
            if isinstance(scale, (int, float)):
                pass
            else:  # assuming it's Tensor
                scale = expand_dims_as(scale, sim)

        sim = sim * scale  # [B, K, N]

        attention = F.softmax(sim, dim=2)
        result = torch.bmm(attention, self._memory)  # [B, K, D]

        if multikey:
            usage = attention.sum(1)  # [B, N]
        else:
            usage = attention.squeeze(1)

        if self._snapshot_only:
            self._usage.add_(usage.detach())
        else:
            self._usage = self._usage + usage

        if not multikey:
            result = result.squeeze(1)
        return result
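Stripped of the class plumbing, the batched multi-key core of the method above is just two batched matrix multiplies. A standalone sketch with made-up shapes (B=2, K=3 keys, N=10 slots, D=4), omitting the optional cosine normalization and using a fixed scalar scale (assumptions for brevity):

import torch
import torch.nn.functional as F

B, K, N, D = 2, 3, 10, 4
keys = torch.randn(B, K, D)
memory = torch.randn(B, N, D)
scale = 5.0

sim = torch.bmm(keys, memory.transpose(1, 2)) * scale  # [B, K, N]
attention = F.softmax(sim, dim=2)                      # softmax over memory slots
result = torch.bmm(attention, memory)                  # [B, K, D], same shape as keys
usage = attention.sum(1)                               # [B, N], attention summed over the K keys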
Example #7
def _update(tgt, updt):
    scatter_indices = common.expand_dims_as(gather_indices, updt)
    scatter_indices = scatter_indices.expand_as(updt)
    return tgt.scatter(0, scatter_indices, updt)
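As in Example #3, gather_indices is captured from the enclosing scope. A standalone sketch of the same row-wise scatter, with gather_indices passed explicitly and the dimension expansion done with a plain reshape instead of common.expand_dims_as (assumptions for self-containment):

import torch

def update_rows(tgt, updt, gather_indices):
    # Expand the [B] index vector to updt's full shape so every element of
    # updt knows which row of tgt it overwrites.
    scatter_indices = gather_indices.reshape(-1, *([1] * (updt.ndim - 1)))
    scatter_indices = scatter_indices.expand_as(updt)
    return tgt.scatter(0, scatter_indices, updt)

tgt = torch.zeros(5, 3)
updt = torch.ones(2, 3)
print(update_rows(tgt, updt, torch.tensor([1, 4])))  # rows 1 and 4 become ones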