Example #1
def _in_projection(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w_q: Tensor,
    w_k: Tensor,
    w_v: Tensor,
    b_q: Optional[Tensor] = None,
    b_k: Optional[Tensor] = None,
    b_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor]:
    Eq, Ek, Ev = q.size(-1), k.size(-1), v.size(-1)
    assert w_q.shape == (Eq, Eq), \
        f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
    assert w_k.shape == (Eq, Ek), \
        f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
    assert w_v.shape == (Eq, Ev), \
        f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
    assert b_q is None or b_q.shape == (Eq,), \
        f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
    assert b_k is None or b_k.shape == (Eq,), \
        f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
    assert b_v is None or b_v.shape == (Eq,), \
        f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
    return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
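
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow`, that `linear` in the snippet above resolves to `flow.nn.functional.linear`, and that the embedding sizes are Eq = Ek = Ev = 8. Shapes are illustrative only.

# hypothetical usage of _in_projection (sketch, not from the original file)
import oneflow as flow

q = flow.randn(2, 4, 8)                 # [batch, len_q, Eq]
k = flow.randn(2, 6, 8)                 # [batch, len_k, Ek]
v = flow.randn(2, 6, 8)                 # [batch, len_v, Ev]
w_q, w_k, w_v = (flow.randn(8, 8) for _ in range(3))
q_p, k_p, v_p = _in_projection(q, k, v, w_q, w_k, w_v)
print(q_p.shape, k_p.shape, v_p.shape)  # (2, 4, 8) (2, 6, 8) (2, 6, 8)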
Example #2
    def forward(
        self,
        query: flow.Tensor,
        key: flow.Tensor,
        value: flow.Tensor,
        mask: Optional[flow.Tensor] = None,
    ) -> Tuple[flow.Tensor, flow.Tensor]:
        batch_size = query.size(0)

        query = self.query(query)
        key = self.key(key)
        value = self.value(value)

        # multi head
        query = query.view(batch_size, -1, self.num_attention_heads,
                           self.dims_per_head).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_attention_heads,
                       self.dims_per_head).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_attention_heads,
                           self.dims_per_head).transpose(1, 2)

        # self attention
        context, attention = self.attention(query, key, value, attn_mask=mask)
        # concat heads
        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, self.hidden_size)
        output = self.dense(context)

        return output, attention
Example #3
def topk_accuracy(output: Tensor, target: Tensor,
                  topk: Sequence[int] = (1, )) -> List[Tensor]:
    """
    https://github.com/pytorch/examples/blob/master/imagenet/main.py#L411

    Args:
        output: [B, C], for C way classification
        target: [B]
    """
    maxk = max(topk)
    batch_size = target.size(0)

    if target.ndim == 2:
        # Possibly one-hot target: convert to class indices before comparing
        target = target.argmax(dim=1)

    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=False)
        res.append(correct_k.mul_(100.0 / batch_size))

    return res
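
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow` and that the tensors passed in are oneflow tensors; the batch size and class count are arbitrary.

# hypothetical usage of topk_accuracy (sketch)
import oneflow as flow

logits = flow.randn(32, 10)              # [B, C]
labels = flow.randint(0, 10, (32,))      # [B]
top1, top5 = topk_accuracy(logits, labels, topk=(1, 5))
print(top1.item(), top5.item())          # accuracies as percentages in [0, 100]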
Example #4
    def forward(self, cosine: flow.Tensor, label):
        # Additive angular margin (ArcFace-style): convert cosines to angles,
        # add margin m at the target class for valid labels, then rescale by s.
        index = flow.where(label != -1)[0]
        m_hot = flow.zeros(index.size()[0],
                           cosine.size()[1],
                           device=cosine.device)
        m_hot.scatter_(1, label[index, None], self.m)
        cosine.acos_()
        cosine[index] += m_hot
        cosine.cos_().mul_(self.s)
        return cosine
Example #5
    def forward(
        self,
        input_ids: flow.Tensor,
        token_type_ids: Optional[flow.Tensor] = None,
        position_ids: Optional[flow.Tensor] = None,
    ) -> flow.Tensor:
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if token_type_ids is None:
            token_type_ids = flow.zeros(input_shape,
                                        dtype=flow.long,
                                        device=input_ids.device)
        if position_ids is None:
            position_ids = flow.arange(seq_length,
                                       dtype=flow.long,
                                       device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)

        input_embeddings = self.token_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = (input_embeddings + position_embeddings +
                      token_type_embeddings)
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings
Example #6
    def get_extended_attention_mask(self, attention_mask: flow.Tensor,
                                    input_shape: Tuple[int, int],
                                    device: flow.device) -> flow.Tensor:

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.is_decoder:
                batch_size, seq_length = input_shape
                seq_ids = flow.arange(seq_length, device=device)
                causal_mask = (seq_ids[None, None, :].repeat(
                    batch_size, seq_length, 1) <= seq_ids[None, :, None])
                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[
                        1] - causal_mask.shape[1]
                    causal_mask = flow.cat(
                        [
                            flow.ones(
                                (batch_size, seq_length, prefix_seq_len),
                                device=device,
                                dtype=causal_mask.dtype,
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )

                extended_attention_mask = (causal_mask[:, None, :, :] *
                                           attention_mask[:, None, None, :])
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        extended_attention_mask = extended_attention_mask.to(dtype=flow.float)
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
        return extended_attention_mask
Example #7
    def numel_in_bucket(tensor: flow.Tensor):
        def align(x: int, unit_size: int):
            return (x + (unit_size - 1)) // unit_size * unit_size

        # tensor memory should be aligned to 512 bytes for cuda operations;
        # 4 is the number of bytes in a float32
        # TODO(jianhao): expose the `kCudaMemAllocAlignSize` from C++ to
        # avoid this hardcoded "512"
        return align(tensor.numel(), 512 // 4)
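
For intuition, a short worked sketch of the alignment arithmetic (an illustration, not from the source): with 4-byte float32 elements, the 512-byte unit corresponds to 128 elements, so a tensor with 1000 elements is counted as 1024 elements in the bucket.

# worked example of the align() arithmetic above (plain Python)
unit = 512 // 4                           # 128 float32 elements per 512 bytes
numel = 1000                              # element count of some tensor
aligned = (numel + (unit - 1)) // unit * unit
print(aligned)                            # 1024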
Example #8
    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.conv5(x)
        x = x.mean([2, 3])  # globalpool
        x = self.fc(x)
        return x
Example #9
    def get_extended_attention_mask(
        self,
        attention_mask: flow.Tensor,
        input_ids: flow.Tensor,
    ):
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError("Wrong shape for input_ids (shape {}) "
                             "or attention_mask (shape {})".format(
                                 input_ids.shape, attention_mask.shape))

        extended_attention_mask = extended_attention_mask.to(
            dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        return extended_attention_mask
Example #10
    def forward(
        self,
        src: Tensor,
        tgt: Tensor,
        src_mask: Optional[Tensor] = None,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:

        if not self.batch_first and src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")
        elif self.batch_first and src.size(0) != tgt.size(0):
            raise RuntimeError("the batch number of src and tgt must be equal")

        if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
            raise RuntimeError(
                "the feature number of src and tgt must be equal to d_model")

        memory = self.encoder(src, src_mask, src_key_padding_mask)
        output = self.decoder(
            tgt,
            memory,
            tgt_mask,
            memory_mask,
            tgt_key_padding_mask,
            memory_key_padding_mask,
        )
        return output
Example #11
    def forward(
        self,
        query: flow.Tensor,
        key: flow.Tensor,
        value: flow.Tensor,
        attn_mask: Optional[flow.Tensor] = None,
    ) -> Tuple[flow.Tensor, flow.Tensor]:
        r"""
        Args:
            query: [batch, num_attention_heads, len_query, dim_query]
            key: [batch, num_attention_heads, len_key, dim_key]
            value: [batch, num_attention_heads, len_value, dim_value]
            attn_mask: [batch, num_attention_heads, len_query, len_key]
        """
        attention = flow.matmul(query, key.transpose(-1, -2))
        attention = attention / math.sqrt(query.size(-1))
        if attn_mask is not None:
            attention = attention + attn_mask
        attention = flow.softmax(attention, dim=-1)
        attention = self.dropout(attention)
        context = flow.matmul(attention, value)
        return context, attention
Example #12
def channel_shuffle(x: Tensor, groups: int) -> Tensor:
    batchsize, num_channels, height, width = x.size()
    channels_per_group = num_channels // groups

    # reshape
    x = flow.reshape(x, [batchsize, groups, channels_per_group, height, width])

    x = flow.transpose(x, 1, 2)

    # flatten
    x = flow.reshape(x, [batchsize, -1, height, width])

    return x
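
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow`; the input sizes are arbitrary, with the channel count divisible by `groups`.

# hypothetical usage of channel_shuffle (sketch)
import oneflow as flow

x = flow.randn(1, 8, 4, 4)                # [N, C, H, W]
y = channel_shuffle(x, groups=2)
print(y.shape)                            # (1, 8, 4, 4); channels interleaved across the 2 groups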
Example #13
def _in_projection_packed(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w: Tensor,
    b: Optional[Tensor] = None,
) -> List[Tensor]:
    E = q.size(-1)
    if k is v:
        if q is k:
            # self-attention
            # workaround: chunk does not work with dim=-1 here, so the
            # last-dim index is computed explicitly
            res = linear(q, w, b)
            chunk_dim = len(res.shape)
            return res.chunk(3, dim=chunk_dim - 1)
        else:
            # encoder-decoder attention
            # w_q, w_kv = w.split([E, E * 2])
            w_q, w_k, w_v = w.chunk(3, dim=0)
            w_kv = flow.cat([w_k, w_v])
            if b is None:
                b_q = b_kv = None
            else:
                # b_q, b_kv = b.split([E, E * 2])
                b_q, b_k, b_v = b.chunk(3, dim=0)
                b_kv = flow.cat([b_k, b_v])
            res = linear(k, w_kv, b_kv)
            chunk_dim = len(res.shape)
            # seemingly equivalent to:
            # return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(k, w_v, b_v)?
            return [linear(q, w_q, b_q)] + list(res.chunk(2, dim=chunk_dim - 1))
    else:
        w_q, w_k, w_v = w.chunk(3, dim=0)
        if b is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = b.chunk(3, dim=0)
        return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
Example #14
def _expand_mask(mask: flow.Tensor,
                 dtype: flow.dtype,
                 tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len,
                                                  src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(flow.bool), -1e9)
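
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow` and uses a toy padding mask where 1 marks real tokens and 0 marks padding.

# hypothetical usage of _expand_mask (sketch)
import oneflow as flow

mask = flow.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # [bsz=2, src_len=4]
bias = _expand_mask(mask, flow.float32, tgt_len=4)
print(bias.shape)                                 # (2, 1, 4, 4); padded positions become -1e9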
Example #15
    def forward(self, x: flow.Tensor):
        """Add positional encoding.
        Args:
            x (flow.Tensor): Input. Its shape is (batch, time, ...)
        Returns:
            flow.Tensor: Encoded tensor. Its shape is (batch, time, ...)
            flow.Tensor: Positional embedding. Its shape is (1, time, emb_dim)
        """
        pos = flow.arange(0, x.size(1), device=x.device).reshape(1, -1)  # [1, t]
        posemb = self._embedding_from_positions(pos)  # [1, t, emb_dim]
        if self.scale_learnable:
            x = x + self.alpha * posemb
        else:
            x = x * self.xscale + posemb
        return self.dropout(x), posemb
Example #16
def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
) -> Tuple[Tensor, Tensor]:
    B, Nt, E = q.shape
    q = q / math.sqrt(E)
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = flow.bmm(q, k.transpose(-2, -1))
    if attn_mask is not None:
        attn += attn_mask
    attn = flow.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = flow.nn.functional.dropout(attn, p=dropout_p)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    output = flow.bmm(attn, v)
    return output, attn
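
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow`; the leading dimension B would typically fold batch and attention heads together.

# hypothetical usage of _scaled_dot_product_attention (sketch)
import oneflow as flow

q = flow.randn(2, 5, 16)                  # [B, Nt, E]
k = flow.randn(2, 7, 16)                  # [B, Ns, E]
v = flow.randn(2, 7, 16)                  # [B, Ns, E]
out, attn = _scaled_dot_product_attention(q, k, v, dropout_p=0.1)
print(out.shape, attn.shape)              # (2, 5, 16) (2, 5, 7)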
Example #17
def prune_linear_layer(layer: nn.Linear, index: flow.Tensor, dim: int = 0) -> nn.Linear:
    # Build a new nn.Linear that keeps only the entries of `index` along `dim`
    # (dim=0 prunes output features, dim=1 prunes input features).
    index = index.to(layer.weight.device)
    W = layer.weight.index_select(dim, index).clone().detach()
    if layer.bias is not None:
        if dim == 1:
            b = layer.bias.clone().detach()
        else:
            b = layer.bias[index].clone().detach()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(
        layer.weight.device
    )
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    if layer.bias is not None:
        new_layer.bias.requires_grad = False
        new_layer.bias.copy_(b.contiguous())
        new_layer.bias.requires_grad = True
    return new_layer
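
A minimal usage sketch, not from the source: it assumes `oneflow` is imported as `flow` with `oneflow.nn` as `nn`, and prunes an arbitrary 8-output linear layer down to 3 output features.

# hypothetical usage of prune_linear_layer (sketch)
import oneflow as flow
import oneflow.nn as nn

layer = nn.Linear(4, 8)
keep = flow.tensor([0, 2, 5])             # row indices to keep along dim=0
pruned = prune_linear_layer(layer, keep, dim=0)
print(pruned.weight.shape, pruned.bias.shape)  # (3, 4) (3,)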