Example #1
    def test_numpy_type(test_case):
        import numpy as np

        with test_case.assertRaises(TypeError) as exp:
            F.pad(np.random.randn(2, 2))
        test_case.assertTrue(
            "pad(): argument 'x' must be tensor, not <class 'numpy.ndarray'>"
            in str(exp.exception))
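For comparison, here is a minimal sketch (assuming OneFlow is importable as oneflow and exposes oneflow.nn.functional.pad) of the accepted call path: convert the NumPy array to a tensor before padding.

    # Usage sketch only, not part of the test suite above.
    import numpy as np
    import oneflow as flow
    import oneflow.nn.functional as F

    x = flow.tensor(np.random.randn(2, 2), dtype=flow.float32)
    # A two-element pad tuple pads the last dimension: (2, 2) -> (2, 4).
    y = F.pad(x, (1, 1))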
Example #2
    def forward(self, x, mask):

        feat_len = x.size(1)
        if (feat_len - self.nframes) % self.stride != 0:
            pad_len = self.stride - (feat_len - self.nframes) % self.stride
            x = F.pad(x, pad=(0, 0, 0, pad_len), value=0.0)
            mask = F.pad(mask.int(), pad=(0, pad_len), value=0) > 0
        else:
            pad_len = 0

        with flow.no_grad():
            x = self.window(x.unsqueeze(1))
            x = x.transpose(1, 2)

        mask = mask[:, self.left_frames::self.stride]
        assert mask.size(1) == x.size(1)

        return x, mask
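Here pad=(0, 0, 0, pad_len) leaves the last (feature) dimension untouched and appends pad_len zero frames to the time axis so that (feat_len - nframes) becomes divisible by stride. A standalone sketch of that arithmetic, with nframes, stride, and the tensor sizes invented for illustration:

    # Illustrative only; all sizes below are assumptions, not the module's real config.
    import oneflow as flow
    import oneflow.nn.functional as F

    nframes, stride = 4, 4
    x = flow.randn(2, 10, 80)                                      # (batch, feat_len, feat_dim)
    pad_len = (stride - (x.size(1) - nframes) % stride) % stride   # 2
    x = F.pad(x, pad=(0, 0, 0, pad_len), value=0.0)
    # feat_len grows from 10 to 12, so (feat_len - nframes) % stride == 0.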
Example #3
    def utt_make_frames(self, x):
        frame_size = self.config["data_loader"]["frame_size"]
        remains = x.size(0) % frame_size
        if remains != 0:
            # Pad the time axis (dim 0) up to a multiple of frame_size so the
            # view below can regroup the utterance into whole frames.
            x = F.pad(x, (0, 0, 0, frame_size - remains))
        out = x.view(1,
                     x.size(0) // frame_size,
                     frame_size * x.size(1)).transpose(1, 2)
        return out
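A shape walk-through of the framing above, with all sizes invented for the sketch:

    # Invented sizes: 35 time steps, 80-dim features, frame_size of 8.
    import oneflow as flow
    import oneflow.nn.functional as F

    frame_size = 8
    x = flow.randn(35, 80)
    remains = x.size(0) % frame_size                      # 3
    if remains != 0:
        x = F.pad(x, (0, 0, 0, frame_size - remains))     # (35, 80) -> (40, 80)
    out = x.view(1, x.size(0) // frame_size,
                 frame_size * x.size(1)).transpose(1, 2)  # (1, 640, 5)
    print(out.shape)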
Example #4
    def forward(self, x1, x2):
        x1 = self.up(x1)
        # input is CHW
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]

        x1 = F.pad(
            x1,
            (diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2))

        x = flow.cat([x2, x1], dim=1)
        return self.conv(x)
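The pad tuple centres the upsampled x1 inside the spatial extent of the skip connection x2 before concatenation. A small sketch with invented shapes:

    # Invented shapes: x1 upsampled to 3x3, skip feature map x2 is 5x6.
    import oneflow as flow
    import oneflow.nn.functional as F

    x1 = flow.randn(1, 8, 3, 3)
    x2 = flow.randn(1, 8, 5, 6)
    diffY = x2.size(2) - x1.size(2)                       # 2
    diffX = x2.size(3) - x1.size(3)                       # 3
    x1 = F.pad(x1, (diffX // 2, diffX - diffX // 2,
                    diffY // 2, diffY - diffY // 2))      # -> (1, 8, 5, 6)
    x = flow.cat([x2, x1], dim=1)                         # -> (1, 16, 5, 6)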
Example #5
    def inference(self, memory, memory_mask):

        if self.apply_look_ahead:
            memory = F.pad(memory, pad=(0, 0, 0, self.lookahead_steps), value=0.0)
            memory = memory.transpose(1, 2)
            memory = self.lookahead_conv(memory)
            memory = memory.transpose(1, 2)

        logits = self.output_layer(memory)
        memory_length = flow.sum(memory_mask.squeeze(1), dim=-1)
        logsoftmax = nn.LogSoftmax(dim=-1)
        return logsoftmax(logits), memory_length
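pad=(0, 0, 0, self.lookahead_steps) appends lookahead_steps zero frames to the time axis so the lookahead convolution can consume future context without shortening the sequence. A hedged sketch of that shape bookkeeping; the Conv1d configuration below is an assumption, not the module's actual lookahead_conv:

    # Invented sizes; a depthwise Conv1d with kernel lookahead_steps + 1 keeps the length.
    import oneflow as flow
    import oneflow.nn as nn
    import oneflow.nn.functional as F

    lookahead_steps, d_model = 3, 16
    lookahead_conv = nn.Conv1d(d_model, d_model,
                               kernel_size=lookahead_steps + 1, groups=d_model)
    memory = flow.randn(2, 50, d_model)                                  # (batch, time, dim)
    memory = F.pad(memory, pad=(0, 0, 0, lookahead_steps), value=0.0)    # time: 50 -> 53
    memory = lookahead_conv(memory.transpose(1, 2)).transpose(1, 2)      # back to (2, 50, 16)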
Example #6
def pad_layer_2d(inp, layer, pad_type="reflect"):
    kernel_size = layer.kernel_size
    if kernel_size[0] % 2 == 0:
        pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2 - 1]
    else:
        pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2]
    if kernel_size[1] % 2 == 0:
        pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2 - 1]
    else:
        pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2]
    pad = tuple(pad_lr + pad_ud)

    inp = F.pad(inp, pad=pad, mode=pad_type)
    out = layer(inp)
    return out
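F.pad consumes the 4-element tuple as (left, right, top, bottom), padding the last (width) axis first. A hypothetical usage sketch with a square kernel, where the Conv2d configuration is invented for illustration:

    # With a square kernel the output keeps the input's spatial size ("same"-style padding).
    import oneflow as flow
    import oneflow.nn as nn

    layer = nn.Conv2d(1, 8, kernel_size=(5, 5))
    inp = flow.randn(2, 1, 32, 32)
    out = pad_layer_2d(inp, layer, pad_type="reflect")    # -> (2, 8, 32, 32)
    print(out.shape)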
Example #7
    def forward(
        self,
        memory,
        memory_length=None,
        targets=None,
        tgt_length=None,
        return_logits=False,
    ):

        if self.apply_look_ahead:
            memory = F.pad(memory, pad=(0, 0, 0, self.lookahead_steps), value=0.0)
            memory = memory.transpose(1, 2)
            memory = self.lookahead_conv(memory)
            memory = memory.transpose(1, 2)

        logits = self.compute_logits(memory)
        if return_logits:
            return logits
        else:
            loss = self.compute_loss(logits, memory_length, targets, tgt_length)
            return loss
Example #8
    def test_numpy_error_msg(test_case):
        import numpy as np

        with test_case.assertRaises(RuntimeError) as exp:
            F.pad(np.random.randn(2, 2))
        test_case.assertTrue("numpy" in str(exp.exception))
Example #9
    def test_torch_error_msg(test_case):
        with test_case.assertRaises(RuntimeError) as exp:
            F.pad(torch.randn(2, 2))
        test_case.assertTrue("torch.Tensor" in str(exp.exception))
Example #10
def multi_head_attention_forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Tensor,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Optional[Tensor]]:
    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    assert (
        embed_dim == embed_dim_to_check
    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads)
    else:
        head_dim = embed_dim // num_heads
    assert (head_dim * num_heads == embed_dim
            ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert (
            key.shape[:2] == value.shape[:2]
        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert (
            key.shape == value.shape
        ), f"key shape {key.shape} does not match value shape {value.shape}"

    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight,
                                        in_proj_bias)
    else:
        assert (q_proj_weight is not None
                ), "use_separate_proj_weight is True but q_proj_weight is None"
        assert (k_proj_weight is not None
                ), "use_separate_proj_weight is True but k_proj_weight is None"
        assert (v_proj_weight is not None
                ), "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        q, k, v = _in_projection(
            query,
            key,
            value,
            q_proj_weight,
            k_proj_weight,
            v_proj_weight,
            b_q,
            b_k,
            b_v,
        )

    # prep attention mask
    if attn_mask is not None:
        assert (
            not attn_mask.dtype.is_floating_point
        ), f"Only integer types are supported for attn_mask, not {attn_mask.dtype}"
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(
                    f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
                )
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(
                    f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
                )
        else:
            raise RuntimeError(
                f"attn_mask's dimension {attn_mask.dim()} is not supported")

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = flow.cat([k, bias_k.repeat((1, bsz, 1))])
        v = flow.cat([v, bias_v.repeat((1, bsz, 1))])
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))
    else:
        assert bias_k is None
        assert bias_v is None

    #
    # reshape q, k, v for multihead attention and make em batch first
    #
    # replace torch.contiguous with reshape
    q = q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_k.size(0) == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
        assert (
            static_k.size(2) == head_dim
        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_v.size(0) == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert (
            static_v.size(2) == head_dim
        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = flow.cat(
            [k, flow.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)],
            dim=1)
        v = flow.cat(
            [v, flow.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)],
            dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (
            bsz,
            src_len,
        ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        key_padding_mask = (
            key_padding_mask.reshape(bsz, 1, 1, src_len)
            .expand(-1, num_heads, tgt_len, -1)
            .reshape(bsz * num_heads, tgt_len, src_len)
        )
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = attn_mask.expand(bsz * num_heads, -1, -1)
            attn_mask = flow.logical_or(attn_mask, key_padding_mask)

    # convert mask to float
    if attn_mask is not None and not attn_mask.dtype.is_floating_point:
        new_attn_mask = flow.zeros_like(attn_mask).to(flow.float)
        new_attn_mask = new_attn_mask.masked_fill(attn_mask, float("-inf"))
        attn_mask = new_attn_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    #
    # (deep breath) calculate attention and out projection
    #
    attn_output, attn_output_weights = _scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p)
    attn_output = attn_output.transpose(0, 1).reshape(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.reshape(
            bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
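When bias_k/bias_v or add_zero_attn extend the key sequence by one position, pad(attn_mask, (0, 1, 0, 0)) appends one column so the mask matches the new src_len. A small sketch of that step with invented sizes (using F.pad, which the bare pad above is assumed to alias):

    import oneflow as flow
    import oneflow.nn.functional as F

    attn_mask = flow.zeros(2, 4, 6)               # (bsz * num_heads, tgt_len, src_len)
    attn_mask = F.pad(attn_mask, (0, 1, 0, 0))    # src_len: 6 -> 7; new column is 0
    print(attn_mask.shape)                        # (2, 4, 7)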
Example #11
    def test_torch_type(test_case):
        with test_case.assertRaises(TypeError) as exp:
            F.pad(torch.randn(2, 2))
        test_case.assertTrue(
            "pad(): argument 'x' must be tensor, not <class 'torch.Tensor'>"
            in str(exp.exception))