def forward(
    self,
    xs_pad: torch.Tensor,
    xs_lens: torch.Tensor,
    decoding_chunk_size: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Embed positions in tensor.

    Args:
        xs_pad: padded input tensor (B, L, D)
        xs_lens: input length (B)
        decoding_chunk_size: decoding chunk size for dynamic chunk
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
    Returns:
        encoder output tensor and subsampled mask
    """
    masks = ~make_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)
    xs, pos_emb, masks = self.embed(xs_pad, masks)
    chunk_masks = add_optional_chunk_mask(xs, masks,
                                          self.use_dynamic_chunk,
                                          decoding_chunk_size,
                                          self.static_chunk_size)
    for layer in self.encoders:
        xs, chunk_masks = layer(xs, chunk_masks, pos_emb)
    if self.normalize_before:
        xs = self.after_norm(xs)
    # Here we assume the mask is not changed in encoder layers, so just
    # return the masks before encoder layers, and the masks will be used
    # for cross attention with decoder later
    return xs, masks
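# A minimal, self-contained sketch of what `~make_pad_mask(...)` yields,
# assuming WeNet-style semantics where make_pad_mask marks padded positions
# as True; this reimplementation is for illustration only.
import torch

def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Return a (B, max_len) bool mask with True at padded positions."""
    max_len = max_len if max_len > 0 else int(lengths.max().item())
    seq = torch.arange(max_len, device=lengths.device)
    return seq.unsqueeze(0) >= lengths.unsqueeze(1)

xs_lens = torch.tensor([4, 2, 3])
masks = ~make_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L), True = valid frame
print(masks.squeeze(1).int())
# tensor([[1, 1, 1, 1],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0]], dtype=torch.int32)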
def forward(
    self,
    xs: torch.Tensor,
    ilens: torch.Tensor,
    prev_states: Optional[torch.Tensor] = None,
    decoding_chunk_size: int = 0,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Encode input sequence.

    Args:
        xs (torch.Tensor): Input tensor (#batch, time, idim).
        ilens (torch.Tensor): Input length tensor (#batch,).
        prev_states (Optional[torch.Tensor]): Not used in this forward pass.
        decoding_chunk_size (int): decoding chunk size for dynamic chunk
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
    Returns:
        torch.Tensor: Output tensor (#batch, time, attention_dim).
        torch.Tensor: Mask tensor (#batch, time).
    """
    masks = ~make_pad_mask(ilens).unsqueeze(1)
    # self.embed returns a tuple of (x, position_encoding)
    xs, masks = self.embed(xs, masks)
    chunk_masks = add_optional_chunk_mask(xs[0], masks,
                                          self.use_dynamic_chunk,
                                          decoding_chunk_size,
                                          self.static_chunk_size)
    for layer in self.encoders:
        xs, chunk_masks = layer(xs, chunk_masks)
    if isinstance(xs, tuple):
        xs = xs[0]
    if self.normalize_before:
        xs = self.after_norm(xs)
    return xs, masks
def forward(
    self,
    xs: torch.Tensor,
    xs_lens: torch.Tensor,
    decoding_chunk_size: int = 0,
    num_decoding_left_chunks: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Embed positions in tensor.

    Args:
        xs: padded input tensor (B, T, D)
        xs_lens: input length (B)
        decoding_chunk_size: decoding chunk size for dynamic chunk
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
        num_decoding_left_chunks: number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
            >=0: use num_decoding_left_chunks
            <0: use all left chunks
    Returns:
        encoder output tensor xs, and subsampled masks
        xs: padded output tensor (B, T' ~= T/subsample_rate, D)
        masks: torch.Tensor batch padding mask after subsample
            (B, 1, T' ~= T/subsample_rate)
    """
    T = xs.size(1)
    masks = ~make_pad_mask(xs_lens, T).unsqueeze(1)  # (B, 1, T)
    if self.global_cmvn is not None:
        xs = self.global_cmvn(xs)
    xs, pos_emb, masks = self.embed(xs, masks)
    mask_pad = masks  # (B, 1, T/subsample_rate)
    chunk_masks = add_optional_chunk_mask(xs, masks,
                                          self.use_dynamic_chunk,
                                          self.use_dynamic_left_chunk,
                                          decoding_chunk_size,
                                          self.static_chunk_size,
                                          num_decoding_left_chunks)
    for layer in self.encoders:
        xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
    if self.normalize_before:
        xs = self.after_norm(xs)
    # Here we assume the mask is not changed in encoder layers, so just
    # return the masks before encoder layers, and the masks will be used
    # for cross attention with decoder later
    return xs, masks
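# A hypothetical usage sketch for this chunk-aware forward. It assumes
# WeNet's ConformerEncoder and its default constructor arguments; the
# import path and defaults may differ between versions.
import torch
from wenet.transformer.encoder import ConformerEncoder

encoder = ConformerEncoder(input_size=80, use_dynamic_chunk=True)
xs = torch.randn(2, 200, 80)        # (B, T, D) padded fbank features
xs_lens = torch.tensor([200, 160])  # valid frames per utterance

# Training-style call: decoding_chunk_size=0 -> random dynamic chunks.
ys, masks = encoder(xs, xs_lens)

# Streaming-style call: fixed 16-frame chunks, all left chunks visible.
ys, masks = encoder(xs, xs_lens, decoding_chunk_size=16,
                    num_decoding_left_chunks=-1)
# ys: (B, T', output_size) with T' ~= T / subsample_rate; masks: (B, 1, T')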
def forward(
    self,
    xs: torch.Tensor,
    xs_lens: torch.Tensor,
    decoding_chunk_size: int = 0,
    num_decoding_left_chunks: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Embed positions in tensor.

    Args:
        xs: padded input tensor (B, L, D)
        xs_lens: input length (B)
        decoding_chunk_size: decoding chunk size for dynamic chunk
            0: default for training, use random dynamic chunk.
            <0: for decoding, use full chunk.
            >0: for decoding, use fixed chunk size as set.
        num_decoding_left_chunks: number of left chunks, this is for decoding,
            the chunk size is decoding_chunk_size.
            >=0: use num_decoding_left_chunks
            <0: use all left chunks
    Returns:
        encoder output tensor and subsampled mask
    """
    # Build the pad mask from the batch padding, since valid lengths
    # differ across utterances within a batch.
    masks = ~make_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)
    if self.global_cmvn is not None:
        xs = self.global_cmvn(xs)
    # Conv2dSubsampling4 plus RelPositionalEncoding: subsampling happens here.
    xs, pos_emb, masks = self.embed(xs, masks)
    mask_pad = masks
    chunk_masks = add_optional_chunk_mask(xs, masks,
                                          self.use_dynamic_chunk,
                                          self.use_dynamic_left_chunk,
                                          decoding_chunk_size,
                                          self.static_chunk_size,
                                          num_decoding_left_chunks)
    for layer in self.encoders:
        xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
    if self.normalize_before:
        xs = self.after_norm(xs)
    # Here we assume the mask is not changed in encoder layers, so just
    # return the masks before encoder layers, and the masks will be used
    # for cross attention with decoder later
    return xs, masks
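# A minimal sketch of the chunk visibility rule that add_optional_chunk_mask
# builds on (modeled on WeNet's subsequent_chunk_mask helper): each frame
# attends to its own chunk plus num_left_chunks preceding chunks, or all
# preceding chunks if num_left_chunks < 0.
import torch

def subsequent_chunk_mask(size: int, chunk_size: int,
                          num_left_chunks: int = -1) -> torch.Tensor:
    ret = torch.zeros(size, size, dtype=torch.bool)
    for i in range(size):
        if num_left_chunks < 0:
            start = 0
        else:
            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
        end = min((i // chunk_size + 1) * chunk_size, size)
        ret[i, start:end] = True
    return ret

print(subsequent_chunk_mask(6, 2, num_left_chunks=1).int())
# tensor([[1, 1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1]], dtype=torch.int32)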