Example #1
class TxStatus(Enum):
    """Denotes transaction status."""

    Success = enum_auto()
    Error = enum_auto()
    NotCommitted = enum_auto()
    Unknown = enum_auto()
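Every snippet in this listing calls `enum_auto()` without showing its import. A minimal sketch of the aliasing these examples appear to assume; `Direction` and `Permission` are hypothetical illustration classes, not taken from any example:

import enum
from enum import auto as enum_auto  # assumed: enum_auto is an alias of enum.auto


class Direction(enum.Enum):   # hypothetical illustration class
    NORTH = enum_auto()       # -> 1 (plain Enum members get 1, 2, 3, ...)
    SOUTH = enum_auto()       # -> 2


class Permission(enum.Flag):  # hypothetical illustration class
    READ = enum_auto()        # -> 1 (Flag/IntFlag members get powers of two)
    WRITE = enum_auto()       # -> 2
    EXECUTE = enum_auto()     # -> 4

With this alias, plain `Enum` members receive consecutive integers, while `Flag`/`IntFlag` members receive distinct powers of two, which is what the flag-based examples below rely on.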
Example #2
class ProcessExitResult(Enum):
    """Result of the process termination."""

    # Process exited successfully.
    Ok = enum_auto()
    # Process did not exit successfully and was killed.
    Killed = enum_auto()
Example #3
    class ErrorKind(Enum):
        """Kind of the error."""

        EMBEDDED_PATH = enum_auto()
        DUPLICATE_PATH = enum_auto()
        INVALID_ORDERING = enum_auto()
        NON_TERMINAL_NODE = enum_auto()
        MALFORMED_ENTRY = enum_auto()
Example #4
class ActionResult(Enum):
    """Denotes if action was successfull or not."""

    Success = enum_auto()
    Fail = enum_auto()

    def __bool__(self) -> bool:
        return self == ActionResult.Success

    def __str__(self) -> str:
        return "success" if bool(self) else "fail"
Example #5
class MathOperation(enum.Enum):
  multiply_add = enum_auto()
  multiply_add_saturate = enum_auto()
  xor_popc = enum_auto()
  multiply_add_fast_bf16 = enum_auto()
  multiply_add_fast_f16 = enum_auto()
  multiply_add_fast_f32 = enum_auto()
  multiply_add_complex_fast_f32 = enum_auto()
  multiply_add_complex = enum_auto()
  multiply_add_complex_gaussian = enum_auto()
Example #6
class CollideType(Flag):
    """Type of collision."""
    NOTHING = 0
    SOLID = enum_auto()  # Regular solid walls, props etc.
    DECORATION = enum_auto()  # A location where decoration may not be placed.
    GRATING = enum_auto()  # Grating, blocks movement, but does not block energy beams.
    GLASS = enum_auto()   # Only permits lasers through.
    BRIDGE = enum_auto()
    FIZZLER = enum_auto()
    TEMPORARY = enum_auto()  # Collision is only sometimes present here.
    ANTLINES = enum_auto()  # Antlines should not pass here.

    GRATE = GRATING
    DECO = DECORATION
    ANTLINE = ANTLINES
    # Aliases matching editoritems COLLIDE_ definitions.
    PHYSICS = SOLID | TEMPORARY

    # OR all defined members from above.
    EVERYTHING = functools.reduce(
        operator.or_,
        filter(lambda x: isinstance(x, int), vars().values()),
    )

    @classmethod
    def parse(cls, text: str) -> CollideType:
        """Parse from a space-separated string."""
        coll = cls.NOTHING
        for word in text.split():
            try:
                coll |= cls[word.upper()]
            except KeyError:
                raise ValueError(f'Unknown collide type "{word}"!')
        return coll
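A hypothetical call to `parse`; member names are matched case-insensitively, aliases such as `GRATE` resolve to their canonical members, and unknown names raise `ValueError`:

coll = CollideType.parse("solid grating temporary")
assert coll == CollideType.SOLID | CollideType.GRATING | CollideType.TEMPORARY
assert CollideType.GRATE in coll    # GRATE is an alias of GRATING
# CollideType.parse("solid bogus")  # would raise ValueError: Unknown collide type "bogus"!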
Example #7
class EpilogueFunctor(enum.Enum):
    LinearCombination = enum_auto()
    LinearCombinationClamp = enum_auto()
    BiasAddLinearCombination = enum_auto()
    BiasAddLinearCombinationRelu = enum_auto()
    BiasAddLinearCombinationHSwish = enum_auto()
    BiasAddLinearCombinationClamp = enum_auto()
    BiasAddLinearCombinationReluClamp = enum_auto()
    BiasAddLinearCombinationHSwishClamp = enum_auto()
Example #8
class EpilogueFunctor(enum.Enum):
    LinearCombination = enum_auto()
    LinearCombinationRelu = enum_auto()
    LinearCombinationBias = enum_auto()
    LinearCombinationGelu = enum_auto()
    LinearCombinationSigmoid = enum_auto()
    LinearCombinationSilu = enum_auto()
    LinearCombinationHardSwish = enum_auto()
    LinearCombinationResidualBlock = enum_auto()
Example #9
class SwizzlingFunctor(enum.Enum):
  Identity1 = enum_auto()
  Identity2 = enum_auto()
  Identity4 = enum_auto()
  Identity8 = enum_auto()
  Horizontal = enum_auto()
  StridedDgradIdentity1 = enum_auto()
  StridedDgradIdentity4 = enum_auto()
  StridedDgradHorizontal = enum_auto()
Example #10
class SwizzlingFunctor(enum.Enum):
    Identity1 = enum_auto()
    Identity2 = enum_auto()
    Identity4 = enum_auto()
    Identity8 = enum_auto()
    Batched = enum_auto()
    StridedDgradIdentity1 = enum_auto()
    StridedDgradIdentity4 = enum_auto()
Example #11
class GemmKind(enum.Enum):
    Gemm = enum_auto()
    Sparse = enum_auto()
    Universal = enum_auto()
    PlanarComplex = enum_auto()
    PlanarComplexArray = enum_auto()
    SplitKParallel = enum_auto()
    GemvBatchedStrided = enum_auto()
Example #12
class GemmKind(enum.Enum):
  Gemm = enum_auto()
  Batched = enum_auto()
  Array = enum_auto()
  Universal = enum_auto()
  PlanarComplex = enum_auto()
  PlanarComplexArray = enum_auto()
Example #13
    class ErrorKind(Enum):
        """
        Kind of the error. Possible variants:
          - UNEXPECTED_LEAF: Proof contains a hash in the place where a value was expected.
          - UNEXPECTED_BRANCH: Proof contains a hash in the position which is impossible according to the list length.
          - REDUNDANT_HASH: There are redundant hashes in the proof: the hash of the underlying list can be calculated
            without some present hashes.
          - MISSING_HASH: Proof does not contain necessary information to compute the hash of the underlying list.
          - NON_EMPTY_PROOF: Non-empty proof for an empty list.
          - DUPLICATE_KEY: Same key is used more than once in the proof.
        """

        UNEXPECTED_LEAF = enum_auto()
        UNEXPECTED_BRANCH = enum_auto()
        REDUNDANT_HASH = enum_auto()
        MISSING_HASH = enum_auto()
        NON_EMPTY_PROOF = enum_auto()
        PARSE_ERROR = enum_auto()
        DUPLICATE_KEY = enum_auto()
Example #14
class MaildirFlags(IntFlag):
    Unread = enum_auto()
    Flagged = enum_auto()
    Replied = enum_auto()
    Passed = enum_auto()
    Draft = enum_auto()
    Trashed = enum_auto()

    @classmethod
    def tags_to_flags(cls, tags):
        flags = cls(0)

        # these are special notmuch tags: https://notmuchmail.org/special-tags/
        mapping = {member.name.lower(): member for member in cls}
        for tag in tags:
            if tag in mapping:
                flags |= mapping[tag]

        return flags
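A hypothetical call to `tags_to_flags`, mapping notmuch-style tags onto maildir flags; tags without a matching member name (such as "inbox") are simply ignored:

flags = MaildirFlags.tags_to_flags(["unread", "flagged", "inbox"])
assert flags == MaildirFlags.Unread | MaildirFlags.Flagged
assert MaildirFlags.Replied not in flags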
Example #15
class IteratorAlgorithm(enum.Enum):
    Analytic = enum_auto()
    Optimized = enum_auto()
Example #16
class ConvKind(enum.Enum):
    Fprop = enum_auto()
    Dgrad = enum_auto()
    Wgrad = enum_auto()
Example #17
class SwizzlingFunctor(enum.Enum):
    Identity1 = enum_auto()
    Identity2 = enum_auto()
    Identity4 = enum_auto()
    Identity8 = enum_auto()
Example #18
class EpilogueFunctor(enum.Enum):
    LinearCombination = enum_auto()
    LinearCombinationClamp = enum_auto()
Example #19
class DataType(enum.Enum):
    b1 = enum_auto()
    u4 = enum_auto()
    u8 = enum_auto()
    u16 = enum_auto()
    u32 = enum_auto()
    u64 = enum_auto()
    s4 = enum_auto()
    s8 = enum_auto()
    s16 = enum_auto()
    s32 = enum_auto()
    s64 = enum_auto()
    f16 = enum_auto()
    bf16 = enum_auto()
    f32 = enum_auto()
    tf32 = enum_auto()
    f64 = enum_auto()
    cf16 = enum_auto()
    cbf16 = enum_auto()
    cf32 = enum_auto()
    ctf32 = enum_auto()
    cf64 = enum_auto()
    cs4 = enum_auto()
    cs8 = enum_auto()
    cs16 = enum_auto()
    cs32 = enum_auto()
    cs64 = enum_auto()
    cu4 = enum_auto()
    cu8 = enum_auto()
    cu16 = enum_auto()
    cu32 = enum_auto()
    cu64 = enum_auto()
    invalid = enum_auto()
Example #20
class GemmKind(enum.Enum):
    Gemm = enum_auto()
    Sparse = enum_auto()
    Universal = enum_auto()
    PlanarComplex = enum_auto()
    PlanarComplexArray = enum_auto()
Example #21
class Target(enum.Enum):
    library = enum_auto()
Example #22
class OperationKind(enum.Enum):
    Gemm = enum_auto()
    Conv2d = enum_auto()
    Conv3d = enum_auto()
Example #23
class OpcodeClass(enum.Enum):
    Simt = enum_auto()
    TensorOp = enum_auto()
    WmmaTensorOp = enum_auto()
Example #24
class GeneratorTarget(enum.Enum):
    Library = enum_auto()
Example #25
class Align(Enum):
    left = enum_auto()
    center = enum_auto()
Example #26
class StrideSupport(enum.Enum):
    Strided = enum_auto()
    Unity = enum_auto()
Example #27
class DepDirection(Enum):
    GOV = enum_auto()
    DEP = enum_auto()
Example #28
class ComplexMultiplyOp(enum.Enum):
    multiply_add = enum_auto()
    gaussian = enum_auto()
Example #29
class AttentionMechanism(IntFlag):
    r""" The taxomony of all attention
  The meaning of `query`, `value` and `key` depend on the application. In the
  case of text similarity, for example, `query` is the sequence embeddings of
  the first piece of text and `value` is the sequence embeddings of the second
  piece of text. Hence, the attention determines alignment between `query` and
  `value`, `key` is usually the same tensor as value.
  A mapping from `query` to `key` will be learned during the attention.

  To use this method in your attention layer, follow the steps:
    * Use `query` tensor of shape `[batch_size, Tq]` and `key` tensor of shape
      `[batch_size, Tv]` to calculate the attention `scores`.
    * Pass `scores` and `value` tensors to this method. The method applies
      `scores_mask`, calculates `attention_distribution = softmax(scores)`, then
      returns `matmul(attention_distribution, value)`.
    * Apply `query_mask` and return the result.

  The following method call order is recommended:
    * `validate`: make sure no duplicated steps are stored in the `AttentionMechanism`
    * `prepare`: prepare the query, key, value and masks according to the given
      mechanism.
    * `score`: get the attention scores given the query and the key
    * `normalize` (optional): normalize the multi-heads attention scores.
    * `align`: create attention distribution, use this distribution to align
      the query and the value

  The whole family of attention mechanisms is summarized into a hierarchical
  structure, in the following order:

  The input space of the attention mechanism:
    - `Intra` (a.k.a. self-attention):
    - `Inter`:

  The attending positions within the input space:
    - `PosGlobal`: global attention
    - `PosLocalM`: local monotonic positioning
    - `PosLocalP`: local predictive positioning

  The alignment of the position:
    - `AlignSoft`:
    - `AlignRelax`: using Gumbel softmax for "relaxed" hard attention
    - `AlignHard`:

  The score function in which the attention logits are calculated:
    - `ScoreLocation`:
    - `ScoreAdditive`:
    - `ScoreDotProd`:
    - `ScoreCosine`:
    - `ScoreGeneral`:

  Since many studies try to group attention algorithms into categories, we take
  a more flexible approach that allows an arbitrary path through each stage
  to create the final algorithm, e.g.
    - `Intra` to `PosGlobal` to `AlignSoft` to `ScoreLocation`
    - `Inter` to `PosGlobal` to `AlignHard` to `ScoreConcat`
  and so on.

  # TODO:
  * Down sampled multihead attention
  * Sparse attention
  """
    # ====== input space ====== #
    Intra = enum_auto()  # a.k.a. self-attention
    Inter = enum_auto()  # a.k.a. inter-attention
    # ====== attending positions ====== #
    PosGlobal = enum_auto()
    PosLocalM = enum_auto()  # local monotonic
    PosLocalP = enum_auto()  # local predictive
    # ====== alignment function ====== #
    AlignSoft = enum_auto()
    AlignHard = enum_auto()
    AlignRelax = enum_auto()
    # ====== alignment score function ====== #
    ScoreLocation = enum_auto()
    ScoreAdditive = enum_auto()
    ScoreDotProd = enum_auto()
    ScoreCosine = enum_auto()
    ScoreGeneral = enum_auto()

    def __or__(self, other):
        # set the new bit, then clear any duplicated bit from the same group
        att = super().__or__(other)
        for group in _GROUPS:
            if other in group:
                for g in group:
                    if g == other:
                        continue
                    att = att & ~g
                break
        return att

    def __str__(self):
        text = super().__str__()
        text = text.replace(self.__class__.__name__ + '.', '')
        return text

    @property
    def is_self_attention(self):
        r""" self-attention is intra-attention, in contrast to inter-attention
    which determines the alignment between two different sequences. """
        self.validate()
        return Intra in self

    @property
    def is_soft_attention(self):
        return AlignSoft in self

    @property
    def is_hard_attention(self):
        return AlignSoft not in self

    def validate(self):
        def count_and_check(groups):
            duplication = [g for g in groups if g in self]
            c = len(duplication)
            if c == 0:
                raise ValueError(
                    "The created mechanism must contain one of the following: %s"
                    % ', '.join([str(g) for g in groups]))
            elif c > 1:
                raise ValueError(
                    "The created mechanism contain duplicated methods of the same stage: %s"
                    % ', '.join([str(g) for g in duplication]))

        for g in _GROUPS:
            count_and_check(g)
        return self

    def prepare(self, query, key=None, value=None, mask=None):
        r""" Preparing input for attention model

    Returns:
      query: Query (or target sequence) tensor of shape `[batch_size, Tq, dim]`.
      key: Key (or source sequence) tensor of shape `[batch_size, Tv, dim]`.
      value: Value (or source sequence) tensor of shape `[batch_size, Tv, dim]`.
      mask: list of the following
        * query_mask: A boolean mask `Tensor` of shape `[batch_size, Tq]`.
            If given, the output will be zero at the positions where
            `mask==False`.
        * value_mask: A boolean mask `Tensor` of shape `[batch_size, Tv]`.
            If given, will apply the mask such that values at positions where
            `mask==False` do not contribute to the result.
    """
        # by default, if key is not provided, use value
        query = bk.array(query, ignore_none=True)
        key = bk.array(key, ignore_none=True)
        value = bk.array(value, ignore_none=True)
        # ====== check if intra-attention ====== #
        if self.is_self_attention:
            if key is not None or value is not None:
                warnings.warn(
                    "Self-attention (intra-attention) needs only query; "
                    "ignoring provided key and value",
                    category=UserWarning)
            if key is not None:
                key = query
            if value is not None:
                value = query
        ### inter-attention
        else:
            if key is None:
                key = value
            if value is None:  # value must always be provided
                raise RuntimeError(
                    "value must be given for inter-sequence attention.")
        # ====== masks ====== #
        if self.is_self_attention:  # only one mask is needed
            if isinstance(mask, (tuple, list)):
                q_mask = mask[0]
            else:
                q_mask = mask
            v_mask = None
        else:
            q_mask = mask[0] if mask else None
            v_mask = mask[1] if mask else None
            if v_mask is not None:
                if v_mask.shape[1] != value.shape[1]:
                    raise RuntimeError(
                        "Value mask has time dimension %d, but value has time dimension %d"
                        % (v_mask.shape[1], value.shape[1]))
        # ====== return ====== #
        return query, key, value, \
          bk.array(q_mask, ignore_none=True), bk.array(v_mask, ignore_none=True)

    def normalize(self, scores):
        r""" Normalize attention scores using "fro"-norm that encouraging diversity
    among attention heads math::`P = ||A^T*A - I||_2^2` (Kim et al. 2017)

    Arguments:
      scores: Tensor with shape `[batch_size * num_heads, Tq, Tv]`
    """
        # it is easier to assume there is always at least one head
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            return bk.cast(0., scores.dtype)
        # [batch_size, num_heads, Tq * Tv]
        scoresT = bk.swapaxes(bk.reshape(scores, shape=([0], [1], -1)), 0, 1)
        # [batch_size, Tq * Tv, num_heads]
        scores = bk.swapaxes(scoresT, 1, 2)
        # [batch_size, num_heads, num_heads]
        A = bk.matmul(scoresT, scores)
        # [batch_size, num_heads, num_heads]
        I = bk.eye(num_heads, dtype=A.dtype)
        I = bk.expand_dims(I, axis=0)
        I = bk.tile(I, reps=A.shape[0], axis=0)
        # normalized
        P = bk.norm(A - I, p="fro")**2
        return P

    def score(self,
              query,
              key=None,
              scale=1,
              window_width=None,
              q_proj=None,
              target_proj=None):
        r"""
    Arguments:
      query: Query (or target sequence) tensor of shape
        `[batch_size, Tq, dim]` or `[num_heads, batch_size, Tq, dim]` in case
        of multi-heads attention.
      key: Key (or source sequence) tensor of shape
        `[batch_size, Tv, dim]` or `[num_heads, batch_size, Tv, dim]` in case
        of multi-heads attention.
      scale: single `Scalar` or `Tensor` of shape `[dim]` for scaling
        the attention scores, suggested `1/sqrt(dim)` in (Vaswani et al. 2017).
      window_width : `None`, `Integer` or `Float` ([0, 1]). The total number of
        frames for a single window in local attention (i.e. `left + 1 + right`).
        Can be given as a fixed number of frames (`int`) or as a percentage of
        the sequence length (`float`). If `None`, use `Tq`.
      q_proj : `Dense`, instance of dense or fully connected layer
        - for `ScoreLocation`, the number of hidden unit is `1`
        - for `ScoreGeneral`, the number of hidden unit is `dim`
      target_proj : `Dense`, for predictive local attention, applies
        a fully connected network to the target sequence (i.e. the query) to
        predict the position on the source sequence (i.e. the key).
        The layer must have an output dimension equal to 1 and return logit values.

    Returns:
      Tensor of shape `[num_heads, batch_size, Tq, Tv]`, or
       `[num_heads, batch_size, Tq, 1]` if `ScoreLocation`
    """
        ### Check if multi-head attention is used
        num_heads = _get_num_heads(query)
        if num_heads > 0:
            query = bk.reshape(query, [-1] + [i for i in query.shape[2:]])
            if key is not None:
                key = bk.reshape(key, [-1] + [i for i in key.shape[2:]])
        Tq = query.shape[1]
        Tv = Tq if key is None else key.shape[1]
        # scale shape is `[]` or `[dim]`
        scale = bk.array(scale, dtype=query.dtype)
        ### Check the window width
        if window_width is None:
            window_width = Tq
        elif window_width < 1:
            window_width = window_width * Tv
        window_width = int(window_width)
        ### Locative attention
        if AttentionMechanism.ScoreLocation in self:
            if PosLocalM in self or PosLocalP in self:
                raise NotImplementedError(
                    "ScoreLocation only support Global attention, but given: %s"
                    % str(self))
            # [batch_size * num_heads, Tq, dim]
            scores = bk.reduce_mean(scale) * q_proj(query)
            assert scores.shape[-1] == 1, \
              " q_proj must have only 1 hidden unit, but given %d" % scores.shape[-1]
        ### Other score mode need the key tensor
        else:
            if key is None:
                raise ValueError(
                    "key must be provided for attention type: %s" % str(self))
            ### Attention position (local or global)
            if PosLocalM in self:
                key = key[:, -window_width:]
            elif PosLocalP in self:
                pt = bk.sigmoid(target_proj(bk.reshape(query, ([0], -1))))
                assert pt.shape[-1] == 1, \
                  "target_proj must project the query [., Tq * dim] to [., 1], i.e. " + \
                    "predicting the attention position on source sequence using " + \
                      "knowledge from target sequence."
                pt = Tv * pt  # `[batch_size * num_heads, 1]`
                # `[batch_size * num_heads, Tv]`
                # Eq (10) (Luong et al. 2015)
                gauss_est = bk.exp(
                    -bk.square(bk.arange(Tv, dtype=pt.dtype) - pt) /
                    (2 * bk.square(window_width / 2)))
                # `[batch_size * num_heads, 1, Tv]`
                gauss_est = bk.expand_dims(gauss_est, axis=1)
            ### Additive or concat method
            if AttentionMechanism.ScoreAdditive in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv]
                scores = bk.reduce_sum(scale * bk.tanh(q + k), axis=-1)
            ### Dot product or multiplicative scoring
            elif AttentionMechanism.ScoreDotProd in self:
                # this is a trick to make attention_scale broadcastable when
                # scale_tied=False
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            ### cosine scoring
            elif AttentionMechanism.ScoreCosine in self:
                # [batch_size * num_heads, Tq, 1, dim]
                q = bk.expand_dims(query, axis=2)
                # [batch_size * num_heads, 1, Tv, dim]
                k = bk.expand_dims(key, axis=1)
                # [batch_size * num_heads, Tq, Tv, dim]
                scores = (q * k) / (bk.norm(q, p=2) * bk.norm(k, p=2))
                scores = bk.reduce_sum(scale * scores, axis=-1, keepdims=False)
            ### general method with only project on the query
            elif AttentionMechanism.ScoreGeneral in self:
                query = q_proj(query)
                assert query.shape[-1] == key.shape[-1], \
                  " q_proj must have %d hidden units, but given %d units" % \
                    (key.shape[-1], query.shape[-1])
                scores = bk.matmul(scale * query, bk.swapaxes(key, 1, 2))
            else:
                raise NotImplementedError(
                    "No support for attention_type='%s'" % str(self))
            ### applying the local-predictive attention
            if PosLocalP in self:
                scores = scores * gauss_est
        ### get back the multi-heads shape
        if num_heads > 0:
            scores = bk.reshape(scores,
                                shape=[num_heads, -1] +
                                [i for i in scores.shape[1:]])
        return scores

    def align(self,
              scores,
              value,
              query=None,
              q_mask=None,
              v_mask=None,
              causal=False,
              residual=False,
              dropout=0,
              temporal_dropout=False,
              sample_shape=1,
              temperature=0.5,
              training=None):
        r"""Applies attention scores to the given value tensor.

    Arguments:
      scores: Attention Scores float tensor of shape
        `[num_heads, batch_size, Tq, Tv]`.
      value: Value (or source sequence) tensor of shape
        `[num_heads, batch_size, Tv, dim]`.
      query: Query (or target sequence) tensor of shape
        `[num_heads, batch_size, Tq, dim]`.
      q_mask: A boolean query mask `Tensor` of shape `[batch_size, Tq]`.
        If given, the output will be zero at the positions where
        `mask==False`.
      v_mask: A boolean value mask `Tensor` of shape `[batch_size, Tv]`.
        If given, will apply the mask such that values at positions where
        `mask==False` do not contribute to the result.
      dropout : Float. Dropout probability of the attention scores.
      temporal_dropout : Boolean. If `True`, using the same dropout mask along
        temporal axis (i.e. the 1-st dimension)
      sample_shape (`Integer`) : number of mcmc samples for estimating the gradient
        of hard attention
      temperature: An 0-D `Tensor`, representing the temperature
        of a set of RelaxedOneHotCategorical distributions. The temperature
        should be positive.

    Returns:
      attended sequence: Tensor of shape
        * `[sample_shape, num_heads, batch_size, Tq, dim]` for (hard + multi-heads)
        * `[sample_shape, batch_size, Tq, dim]` for (hard + no-head)
        * `[num_heads, batch_size, Tq, dim]` for (soft + multi-heads)
        * `[batch_size, Tq, dim]` for (soft + no-head)
      attention distribution : for soft attention, return Tensor of shape
        * `[num_heads, batch_size, Tq]` for self-attention
        * `[num_heads, batch_size, Tq, Tv]` for inter-attention.
        for hard attention, return one-hot categorical distribution of shape
        * `[sample_shape, num_heads, batch_size, Tq]` for self-attention
        * `[sample_shape, num_heads, batch_size, Tq, Tv]` for inter-attention.
        if multi-heads attention wasn't used, omit the `[num_heads]`.
    """
        num_heads = _get_num_heads(scores)
        if num_heads == 0:
            Tq = scores.shape[1]
            Tv = scores.shape[2]
        else:
            Tq = scores.shape[2]
            Tv = scores.shape[3]
        if value is None:
            if query is None:
                raise ValueError("both query and value are None, "
                                 "at least one of them must be given")
            value = query
        # ====== Causal mask ====== #
        if causal:
            # Creates a lower triangular mask, so position i cannot attend to
            # positions j>i. This prevents the flow of information from the future
            # into the past.
            scores_shape = scores.shape
            # causal_mask_shape = [1, Tq, Tv].
            causal_mask_shape = bk.concatenate(
                [bk.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0)
            causal_mask = bk.tril_mask(causal_mask_shape)
        else:
            causal_mask = None
        if v_mask is not None:
            # LocalM applied
            if PosLocalM in self:
                v_mask = v_mask[:, -Tv:]
            # Mask of shape [batch_size, 1, Tv].
            v_mask = bk.expand_dims(v_mask, axis=-2)
            v_mask = bk.cast(v_mask, 'bool')
            if num_heads > 0:
                v_mask = bk.expand_dims(v_mask, axis=0)
        scores_mask = bk.logical_and(v_mask, causal_mask)
        ### applying the scores mask
        if scores_mask is not None:
            padding_mask = bk.logical_not(scores_mask)
            # Bias so padding positions do not contribute to attention distribution.
            scores -= 1.e9 * bk.cast(padding_mask, dtype=scores.dtype)
        # ====== convert attention score to distribution ====== #
        # if the last dimension is 1, no point for applying softmax, hence,
        # softmax to the second last dimension
        ### soft attention
        if AlignSoft in self:
            attention_distribution = bk.softmax(
                scores, axis=-2 if scores.shape[-1] == 1 else -1)
        ### relaxed hard attention
        elif AlignRelax in self:
            attention_distribution = bay.distributions.RelaxedOneHotCategorical(
                temperature=temperature,
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        ### hard attention
        elif AlignHard in self:
            attention_distribution = bay.distributions.OneHotCategorical(
                logits=bk.squeeze(scores, axis=-1)
                if scores.shape[-1] == 1 else scores,
                dtype=value.dtype)
            fsample = partial(bay.Distribution.sample,
                              sample_shape=sample_shape)
            attention_distribution = bay.coercible_tensor(
                attention_distribution, convert_to_tensor_fn=fsample)
        # ======  dropout the attention scores ====== #
        attention = bk.dropout(attention_distribution,
                               p_drop=dropout,
                               axis=1 if temporal_dropout else None,
                               training=training and dropout > 0)
        # ====== applying the attention ====== #
        if self.is_self_attention and ScoreLocation in self:
            result = bk.expand_dims(bk.array(attention), axis=-1) * value  \
                if attention.shape[-1] != 1 else \
                  attention * value
        else:
            if PosLocalM in self:
                value = value[:, -Tv:] if num_heads == 0 else value[:, :, -Tv:]
            result = bk.matmul(attention, value)
        # ====== applying the Query mask ====== #
        if q_mask is not None:
            assert q_mask.shape[1] == Tq,\
              "Query mask has time dimension %d, but query has time dimension %d" \
                % (q_mask.shape[1], Tq)
            # Mask of shape [batch_size, Tq, 1].
            q_mask = bk.expand_dims(q_mask, axis=-1)
            result *= bk.cast(q_mask, dtype=result.dtype)
        # ====== residual connection ====== #
        if residual:
            if query is None:
                raise ValueError("query must be given for residual connection")
            result += query
        # ====== return ====== #
        return result, attention_distribution

    def compute_mask(self, mask=None):
        if mask:
            q_mask = mask[0] if isinstance(mask, (tuple, list)) else mask
            return bk.array(q_mask)
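The class above references a module-level `_GROUPS` table and bare member aliases (e.g. `Intra`, `AlignSoft`) that are not part of the snippet. A minimal, hypothetical sketch of how they might be defined in the same module and how a mechanism could be composed and validated; the grouping is an assumption inferred from the docstring stages:

# Hypothetical module-level aliases assumed by the methods above (not shown in the snippet).
Intra, Inter = AttentionMechanism.Intra, AttentionMechanism.Inter
PosGlobal, PosLocalM, PosLocalP = (AttentionMechanism.PosGlobal,
                                   AttentionMechanism.PosLocalM,
                                   AttentionMechanism.PosLocalP)
AlignSoft, AlignHard, AlignRelax = (AttentionMechanism.AlignSoft,
                                    AttentionMechanism.AlignHard,
                                    AttentionMechanism.AlignRelax)
ScoreLocation, ScoreAdditive, ScoreDotProd, ScoreCosine, ScoreGeneral = (
    AttentionMechanism.ScoreLocation, AttentionMechanism.ScoreAdditive,
    AttentionMechanism.ScoreDotProd, AttentionMechanism.ScoreCosine,
    AttentionMechanism.ScoreGeneral)

# One tuple per stage; __or__ and validate() use this to enforce
# "exactly one member per stage".
_GROUPS = [
    (Intra, Inter),
    (PosGlobal, PosLocalM, PosLocalP),
    (AlignSoft, AlignHard, AlignRelax),
    (ScoreLocation, ScoreAdditive, ScoreDotProd, ScoreCosine, ScoreGeneral),
]

# Compose a mechanism by picking one member from each stage, then validate it.
mechanism = Intra | PosGlobal | AlignSoft | ScoreDotProd
mechanism.validate()
print(mechanism.is_self_attention)  # True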
Example #30
class LayoutType(enum.Enum):
    ColumnMajor = enum_auto()
    RowMajor = enum_auto()
    ColumnMajorInterleaved2 = enum_auto()
    RowMajorInterleaved2 = enum_auto()
    ColumnMajorInterleaved32 = enum_auto()
    RowMajorInterleaved32 = enum_auto()
    ColumnMajorInterleaved64 = enum_auto()
    RowMajorInterleaved64 = enum_auto()
    TensorNHWC = enum_auto()
    TensorNDHWC = enum_auto()
    TensorNCHW = enum_auto()
    TensorNGHWC = enum_auto()
    TensorNC32HW32 = enum_auto()
    TensorNC64HW64 = enum_auto()
    TensorC32RSK32 = enum_auto()
    TensorC64RSK64 = enum_auto()