def test_numpy_type(test_case):
    import numpy as np

    with test_case.assertRaises(TypeError) as exp:
        F.pad(np.random.randn(2, 2))
    test_case.assertTrue(
        "pad(): argument 'x' must be tensor, not <class 'numpy.ndarray'>"
        in str(exp.exception)
    )
def forward(self, x, mask):
    feat_len = x.size(1)
    if (feat_len - self.nframes) % self.stride != 0:
        # zero-pad the time axis so the sliding window below covers the full sequence
        pad_len = self.stride - (feat_len - self.nframes) % self.stride
        x = F.pad(x, pad=(0, 0, 0, pad_len), value=0.0)
        mask = F.pad(mask.int(), pad=(0, pad_len), value=0) > 0
    else:
        pad_len = 0
    with flow.no_grad():
        x = self.window(x.unsqueeze(1))
    x = x.transpose(1, 2)
    # subsample the mask to match the windowed frame rate
    mask = mask[:, self.left_frames::self.stride]
    assert mask.size(1) == x.size(1)
    return x, mask
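# Hedged usage sketch of the padding step above (values and shapes are assumed,
# not taken from the original source): with nframes=3, stride=2 and a feature
# length of 10, (10 - 3) % 2 == 1, so one zero frame is appended to the time
# axis and the mask is extended to match. Assumes `flow` is oneflow and `F` is
# oneflow.nn.functional.
x = flow.randn(4, 10, 80)                               # (batch, time, feat)
mask = flow.ones(4, 10) > 0
nframes, stride = 3, 2
pad_len = stride - (x.size(1) - nframes) % stride       # == 1
x = F.pad(x, pad=(0, 0, 0, pad_len), value=0.0)         # pads dim 1 -> (4, 11, 80)
mask = F.pad(mask.int(), pad=(0, pad_len), value=0) > 0 # (4, 11), padded steps masked out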
def utt_make_frames(self, x):
    frame_size = self.config["data_loader"]["frame_size"]
    remains = x.size(0) % frame_size
    if remains != 0:
        x = F.pad(x, (0, remains))
    out = x.view(1, x.size(0) // frame_size, frame_size * x.size(1)).transpose(1, 2)
    return out
def forward(self, x1, x2):
    x1 = self.up(x1)
    # input is CHW
    diffY = x2.size()[2] - x1.size()[2]
    diffX = x2.size()[3] - x1.size()[3]
    # pad x1 so its spatial size matches x2 before concatenation
    x1 = F.pad(
        x1, (diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2)
    )
    x = flow.cat([x2, x1], dim=1)
    return self.conv(x)
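# Hedged, standalone illustration of the centering pad above (shapes are assumed):
# x1 is grown to x2's spatial size, splitting the size difference between the two
# sides of each axis, so the feature maps can be concatenated along the channel axis.
x1 = flow.randn(1, 8, 2, 2)
x2 = flow.randn(1, 8, 3, 3)
diffY = x2.size(2) - x1.size(2)   # 1
diffX = x2.size(3) - x1.size(3)   # 1
x1 = F.pad(x1, (diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2))
assert tuple(flow.cat([x2, x1], dim=1).shape) == (1, 16, 3, 3)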
def inference(self, memory, memory_mask):
    if self.apply_look_ahead:
        # append lookahead_steps zero frames, then let the look-ahead conv consume them
        memory = F.pad(memory, pad=(0, 0, 0, self.lookahead_steps), value=0.0)
        memory = memory.transpose(1, 2)
        memory = self.lookahead_conv(memory)
        memory = memory.transpose(1, 2)
    logits = self.output_layer(memory)
    memory_length = flow.sum(memory_mask.squeeze(1), dim=-1)
    logsoftmax = nn.LogSoftmax(dim=-1)
    return logsoftmax(logits), memory_length
def pad_layer_2d(inp, layer, pad_type="reflect"):
    # choose asymmetric padding for even kernels and symmetric padding for odd ones,
    # so a stride-1 convolution preserves the spatial size
    kernel_size = layer.kernel_size
    if kernel_size[0] % 2 == 0:
        pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2 - 1]
    else:
        pad_lr = [kernel_size[0] // 2, kernel_size[0] // 2]
    if kernel_size[1] % 2 == 0:
        pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2 - 1]
    else:
        pad_ud = [kernel_size[1] // 2, kernel_size[1] // 2]
    pad = tuple(pad_lr + pad_ud)
    inp = F.pad(inp, pad=pad, mode=pad_type)
    out = layer(inp)
    return out
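# Hedged usage sketch for pad_layer_2d (the layer and shapes are assumptions, not
# from the source): with an even 4x4 kernel the asymmetric reflect padding
# (2, 1, 2, 1) keeps the spatial size unchanged after the convolution.
# Assumes `nn` is oneflow.nn.
conv = nn.Conv2d(1, 8, kernel_size=(4, 4))
x = flow.randn(2, 1, 32, 32)
y = pad_layer_2d(x, conv)
assert tuple(y.shape) == (2, 8, 32, 32)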
def forward(
    self,
    memory,
    memory_length=None,
    targets=None,
    tgt_length=None,
    return_logits=False,
):
    if self.apply_look_ahead:
        # append lookahead_steps zero frames, then let the look-ahead conv consume them
        memory = F.pad(memory, pad=(0, 0, 0, self.lookahead_steps), value=0.0)
        memory = memory.transpose(1, 2)
        memory = self.lookahead_conv(memory)
        memory = memory.transpose(1, 2)
    logits = self.compute_logits(memory)
    if return_logits:
        return logits
    else:
        loss = self.compute_loss(logits, memory_length, targets, tgt_length)
        return loss
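# Hedged sketch of the look-ahead padding used in the two methods above
# (lookahead_steps, the conv configuration, and the shapes are assumptions):
# zeros are appended along the time axis so a kernel of size lookahead_steps + 1
# can look ahead without shrinking the sequence. Assumes `nn` is oneflow.nn.
lookahead_steps = 2
memory = flow.randn(4, 50, 256)                                      # (batch, time, dim)
memory = F.pad(memory, pad=(0, 0, 0, lookahead_steps), value=0.0)    # -> (4, 52, 256)
lookahead_conv = nn.Conv1d(256, 256, kernel_size=lookahead_steps + 1)
memory = lookahead_conv(memory.transpose(1, 2)).transpose(1, 2)      # back to (4, 50, 256)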
def test_numpy_error_msg(test_case):
    import numpy as np

    with test_case.assertRaises(RuntimeError) as exp:
        F.pad(np.random.randn(2, 2))
    test_case.assertTrue("numpy" in str(exp.exception))
def test_torch_error_msg(test_case):
    with test_case.assertRaises(RuntimeError) as exp:
        F.pad(torch.randn(2, 2))
    test_case.assertTrue("torch.Tensor" in str(exp.exception))
def multi_head_attention_forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Tensor,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Optional[Tensor]]:
    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    assert (
        embed_dim == embed_dim_to_check
    ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads)
    else:
        head_dim = embed_dim // num_heads
    assert (
        head_dim * num_heads == embed_dim
    ), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert (
            key.shape[:2] == value.shape[:2]
        ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert (
            key.shape == value.shape
        ), f"key shape {key.shape} does not match value shape {value.shape}"

    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
    else:
        assert (
            q_proj_weight is not None
        ), "use_separate_proj_weight is True but q_proj_weight is None"
        assert (
            k_proj_weight is not None
        ), "use_separate_proj_weight is True but k_proj_weight is None"
        assert (
            v_proj_weight is not None
        ), "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        q, k, v = _in_projection(
            query,
            key,
            value,
            q_proj_weight,
            k_proj_weight,
            v_proj_weight,
            b_q,
            b_k,
            b_v,
        )

    # prep attention mask
    if attn_mask is not None:
        assert (
            attn_mask.dtype.is_floating_point == False
        ), f"Only integer type are supported for attn_mask, not {attn_mask.dtype}"
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(
                    f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
                )
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(
                    f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
                )
        else:
            raise RuntimeError(
                f"attn_mask's dimension {attn_mask.dim()} is not supported"
            )

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = flow.cat([k, bias_k.repeat((1, bsz, 1))])
        v = flow.cat([v, bias_v.repeat((1, bsz, 1))])
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))
    else:
        assert bias_k is None
        assert bias_v is None

    #
    # reshape q, k, v for multihead attention and make em batch first
    #
    # replace torch.contiguous with reshape
    q = q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_k.size(0) == bsz * num_heads
        ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
        assert (
            static_k.size(2) == head_dim
        ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.reshape(-1, bsz * num_heads, head_dim).transpose(0, 1)
    else:
        assert (
            static_v.size(0) == bsz * num_heads
        ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert (
            static_v.size(2) == head_dim
        ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = flow.cat(
            [k, flow.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1
        )
        v = flow.cat(
            [v, flow.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1
        )
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1, 0, 0))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1, 0, 0))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (
            bsz,
            src_len,
        ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        key_padding_mask = (
            key_padding_mask.reshape(bsz, 1, 1, src_len)
            .expand(-1, num_heads, tgt_len, -1)
            .reshape(bsz * num_heads, tgt_len, src_len)
        )
        if attn_mask is not None:
            attn_mask = attn_mask.expand(bsz * num_heads, -1, -1)
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = flow.logical_or(attn_mask, key_padding_mask)

    # convert mask to float
    if attn_mask is not None and attn_mask.dtype.is_floating_point == False:
        new_attn_mask = flow.zeros_like(attn_mask).to(flow.float)
        new_attn_mask = new_attn_mask.masked_fill(attn_mask, float("-inf"))
        attn_mask = new_attn_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    #
    # (deep breath) calculate attention and out projection
    #
    attn_output, attn_output_weights = _scaled_dot_product_attention(
        q, k, v, attn_mask, dropout_p
    )
    attn_output = attn_output.transpose(0, 1).reshape(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.reshape(
            bsz, num_heads, tgt_len, src_len
        )
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
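# Hedged, standalone sketch of the mask padding used twice above (shapes are
# assumed): pad(mask, (0, 1, 0, 0)) appends one column along the key/source axis
# so the masks stay aligned after bias_k/bias_v or the zero-attention row extend
# k and v by one position. Assumes `F.pad` is the same functional `pad` used above.
attn_mask = flow.zeros(8, 5, 5)                      # (bsz * num_heads, tgt_len, src_len)
attn_mask = F.pad(attn_mask, (0, 1, 0, 0))           # -> (8, 5, 6)
key_padding_mask = flow.zeros(2, 5)                  # (bsz, src_len)
key_padding_mask = F.pad(key_padding_mask, (0, 1, 0, 0))   # -> (2, 6); (0, 0) leaves the batch dim untouched
assert tuple(attn_mask.shape) == (8, 5, 6)
assert tuple(key_padding_mask.shape) == (2, 6)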
def test_torch_type(test_case):
    with test_case.assertRaises(TypeError) as exp:
        F.pad(torch.randn(2, 2))
    test_case.assertTrue(
        "pad(): argument 'x' must be tensor, not <class 'torch.Tensor'>"
        in str(exp.exception)
    )