def forward(self, enc_states, enc_len, dec_states):
    """Returns the output of the attention module.

    Arguments
    ---------
    enc_states : torch.Tensor
        The tensor to be attended.
    enc_len : torch.Tensor
        The real length (without padding) of enc_states for each sentence.
    dec_states : torch.Tensor
        The query tensor.
    """
    # Keys, values, and the padding mask depend only on the encoder states,
    # so they are computed once and cached across decoding steps.
    if self.keys is None:
        self.keys = self.key_linear(enc_states)
        self.values = self.value_linear(enc_states)
        self.mask = length_to_mask(
            enc_len, max_len=enc_states.size(1), device=enc_states.device
        ).unsqueeze(2)

    query = self.query_linear(dec_states).unsqueeze(2)
    scores = torch.matmul(self.keys, query) / self.scaling

    # Give padded frames -inf so softmax assigns them zero weight
    scores = scores.masked_fill(self.mask == 0, -np.inf)
    normalized_scores = scores.softmax(1).transpose(1, 2)
    out = torch.matmul(normalized_scores, self.values).squeeze(1)
    return out, normalized_scores
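# All of the snippets in this section assume module-level imports of torch,
# numpy (as np), and torch.nn.functional (as F), plus a `length_to_mask`
# helper. The helper below is only a minimal sketch written for reference;
# the actual implementation in the surrounding codebase may differ (e.g. in
# how it validates its inputs).
import torch


def length_to_mask(length, max_len=None, dtype=None, device=None):
    """Create a mask of shape [batch, max_len] that is 1 up to each length.

    `length` is a 1-D tensor of (possibly float) sequence lengths. `dtype`
    defaults to the dtype of `length`, so that expressions like
    `1 - length_to_mask(abs_len)` in these snippets remain valid.
    """
    if max_len is None:
        max_len = int(length.max().item())
    if dtype is None:
        dtype = length.dtype
    if device is None:
        device = length.device
    mask = (
        torch.arange(max_len, device=length.device).unsqueeze(0)
        < length.unsqueeze(1)
    )
    return mask.to(dtype=dtype, device=device)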
def forward(self, enc_states, enc_len, dec_states):
    """Returns the output of the attention module.

    Arguments
    ---------
    enc_states : torch.Tensor
        The tensor to be attended.
    enc_len : torch.Tensor
        The real length (without padding) of enc_states for each sentence.
    dec_states : torch.Tensor
        The query tensor.
    """
    if self.precomputed_enc_h is None:
        self.precomputed_enc_h = self.mlp_enc(enc_states)
        self.mask = length_to_mask(
            enc_len, max_len=enc_states.size(1), device=enc_states.device
        )

    dec_h = self.mlp_dec(dec_states.unsqueeze(1))
    attn = self.mlp_attn(
        torch.tanh(self.precomputed_enc_h + dec_h)
    ).squeeze(-1)

    # mask the padded frames
    attn = attn.masked_fill(self.mask == 0, -np.inf)
    attn = self.softmax(attn * self.scaling)

    # compute context vectors
    # [B, 1, L] X [B, L, F]
    context = torch.bmm(attn.unsqueeze(1), enc_states).squeeze(1)
    context = self.mlp_out(context)

    return context, attn
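# The masking idiom shared by the attention modules here, shown standalone
# (values and shapes are illustrative only): frames beyond each sentence's
# real length get a score of -inf, so softmax assigns them zero weight.
import torch

scores = torch.rand(2, 4)                    # [B, L] raw attention scores
enc_len = torch.tensor([4, 2])               # real lengths; L = 4 is padded
mask = length_to_mask(enc_len, max_len=4)
scores = scores.masked_fill(mask == 0, float("-inf"))
weights = torch.softmax(scores, dim=-1)      # rows still sum to 1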
def __init__(self, x, enc_lens, batch_size, beam_size, blank_index, eos_index):
    self.blank_index = blank_index
    self.eos_index = eos_index
    self.max_enc_len = x.size(1)
    self.batch_size = batch_size
    self.beam_size = beam_size
    self.vocab_size = x.size(-1)
    self.device = x.device
    self.minus_inf = -1e20
    self.last_frame_index = enc_lens - 1

    assert (
        self.eos_index != self.blank_index
    ), "Please set these two tokens to different indexes"

    # mask frames > enc_lens
    mask = 1 - length_to_mask(enc_lens)
    mask = mask.unsqueeze(-1).expand(-1, -1, x.size(-1)).eq(1)
    x.masked_fill_(mask, self.minus_inf)
    x[:, :, 0] = x[:, :, 0].masked_fill_(mask[:, :, 0], 0)

    # dim=0: xnb, nonblank posteriors, dim=1: xb, blank posteriors
    xnb = x.transpose(0, 1)
    xb = (
        xnb[:, :, self.blank_index]
        .unsqueeze(2)
        .expand(-1, -1, self.vocab_size)
    )

    # (2, L, batch_size * beam_size, vocab_size)
    self.x = torch.stack([xnb, xb])

    # The first index of each sentence.
    self.beam_offset = (
        torch.arange(batch_size, device=self.device) * self.beam_size
    )
    # The first index of each candidate.
    self.cand_offset = (
        torch.arange(batch_size, device=self.device) * self.vocab_size
    )
def encode(
    self,
    src,
    wav_len=None,
):
    """Forward the encoder with the source input.

    Arguments
    ---------
    src : torch.Tensor
        The sequence to the encoder (required).
    wav_len : torch.Tensor, optional
        Torch Tensor of shape (batch,) containing the relative length to
        padded length for each example.
    """
    # reshape the src vector to [Batch, Time, Fea] if a 4d vector is given
    if src.dim() == 4:
        bz, t, ch1, ch2 = src.shape
        src = src.reshape(bz, t, ch1 * ch2)

    src_key_padding_mask = None
    if wav_len is not None and self.training:
        abs_len = torch.round(wav_len * src.shape[1])
        src_key_padding_mask = (1 - length_to_mask(abs_len)).bool()

    src = self.custom_src_module(src)
    src = src + self.positional_encoding(src)
    encoder_out, _ = self.encoder(
        src=src, src_key_padding_mask=src_key_padding_mask
    )
    return encoder_out
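# How the relative wav_len becomes a key padding mask above, standalone
# (values are illustrative). True marks frames the attention should ignore,
# matching the PyTorch key_padding_mask convention.
import torch

wav_len = torch.tensor([1.0, 0.5])           # relative (unpadded) lengths
T = 6                                        # padded time dimension
abs_len = torch.round(wav_len * T)           # tensor([6., 3.])
src_key_padding_mask = (1 - length_to_mask(abs_len)).bool()
# row 1: all False; row 2: the last three (padded) frames are True.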
def nll_loss_kd(
    probabilities,
    targets,
    rel_lab_lengths,
):
    """Knowledge distillation for negative log likelihood loss.

    Reference
    ---------
    Distilling Knowledge from Ensembles of Acoustic Models for Joint
    CTC-Attention End-to-End Speech Recognition.
    https://arxiv.org/abs/2005.09310

    Arguments
    ---------
    probabilities : torch.Tensor
        The predicted probabilities from the student model.
        Format is [batch, frames, p].
    targets : torch.Tensor
        The target probabilities from the teacher model.
        Format is [batch, frames, p].
    rel_lab_lengths : torch.Tensor
        Length of each utterance, if frame-level loss is desired.

    Example
    -------
    >>> probabilities = torch.tensor([[[0.8, 0.2], [0.2, 0.8]]])
    >>> targets = torch.tensor([[[0.9, 0.1], [0.1, 0.9]]])
    >>> rel_lab_lengths = torch.tensor([1.])
    >>> nll_loss_kd(probabilities, targets, rel_lab_lengths)
    tensor(-0.7400)
    """
    # Getting the number of sentences in the minibatch
    N_snt = probabilities.shape[0]

    # Getting the maximum length of the label sequence
    max_len = probabilities.shape[1]

    # Getting the label lengths
    lab_lengths = torch.round(rel_lab_lengths * targets.shape[1]).int()

    # Reshape to [batch_size * length, feature]
    prob_curr = probabilities.reshape(N_snt * max_len, probabilities.shape[-1])

    # Generating mask
    mask = length_to_mask(
        lab_lengths, max_len=max_len, dtype=torch.float, device=prob_curr.device
    )

    # Reshape to [batch_size * length, feature]
    lab_curr = targets.reshape(N_snt * max_len, targets.shape[-1])

    loss = ce_kd(prob_curr, lab_curr)

    # Loss averaging
    loss = torch.sum(loss.reshape(N_snt, max_len) * mask) / torch.sum(mask)
    return loss
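# nll_loss_kd calls a ce_kd helper that is not defined in this snippet. A
# minimal sketch consistent with its usage (and with the docstring example,
# where each frame contributes -(0.9*0.8 + 0.1*0.2) = -0.74) is a per-frame
# soft-target cross entropy with no reduction:
def ce_kd(inp, target):
    """inp: student predictions, target: teacher soft targets; both [N, p]."""
    return (-target * inp).sum(1)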
def forward(self, x, lengths=None):
    """Computes per-channel gates from the (masked) temporal mean and
    rescales the input with them.

    Arguments
    ---------
    x : torch.Tensor
        Tensor of shape [N, C, L].
    lengths : torch.Tensor, optional
        The relative length of each sequence in the batch.
    """
    L = x.shape[-1]

    if lengths is not None:
        # Average only over the unpadded frames
        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
        mask = mask.unsqueeze(1)
        total = mask.sum(dim=2, keepdim=True)
        s = (x * mask).sum(dim=2, keepdim=True) / total
    else:
        s = x.mean(dim=2, keepdim=True)

    # Squeeze-and-excitation style gating
    s = self.relu(self.conv1(s))
    s = self.sigmoid(self.conv2(s))

    return s * x
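# The squeeze step above in isolation: a masked mean over time computed
# from relative lengths (illustrative values).
import torch

x = torch.rand(2, 3, 6)                      # [N, C, L]
lengths = torch.tensor([1.0, 0.5])           # relative lengths per example
mask = length_to_mask(lengths * 6, max_len=6).unsqueeze(1)  # [N, 1, L]
s = (x * mask).sum(dim=2, keepdim=True) / mask.sum(dim=2, keepdim=True)
# Row 1 matches x.mean(dim=2, keepdim=True); row 2 averages only the first
# three (unpadded) frames.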
def forward(self, x, lengths=None):
    """Calculates mean and std for a batch (input tensor).

    Arguments
    ---------
    x : torch.Tensor
        Tensor of shape [N, C, L].
    """
    L = x.shape[-1]

    def _compute_statistics(x, m, dim=2, eps=self.eps):
        mean = (m * x).sum(dim)
        std = torch.sqrt(
            (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
        )
        return mean, std

    if lengths is None:
        lengths = torch.ones(x.shape[0], device=x.device)

    # Make binary mask of shape [N, 1, L]
    mask = length_to_mask(lengths * L, max_len=L, device=x.device)
    mask = mask.unsqueeze(1)

    # Expand the temporal context of the pooling layer by allowing the
    # self-attention to look at global properties of the utterance.
    if self.global_context:
        # torch.std is unstable for backward computation
        # https://github.com/pytorch/pytorch/issues/4320
        total = mask.sum(dim=2, keepdim=True).float()
        mean, std = _compute_statistics(x, mask / total)
        mean = mean.unsqueeze(2).repeat(1, 1, L)
        std = std.unsqueeze(2).repeat(1, 1, L)
        attn = torch.cat([x, mean, std], dim=1)
    else:
        attn = x

    # Apply layers
    attn = self.conv(self.tanh(self.tdnn(attn)))

    # Filter out zero-paddings
    attn = attn.masked_fill(mask == 0, float("-inf"))

    attn = F.softmax(attn, dim=2)
    mean, std = _compute_statistics(x, attn)

    # Append mean and std of the batch
    pooled_stats = torch.cat((mean, std), dim=1)
    pooled_stats = pooled_stats.unsqueeze(2)

    return pooled_stats
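# Sanity check for _compute_statistics above (illustrative): with uniform
# attention weights, the weighted mean and std reduce to the plain biased
# (1/L) per-channel statistics.
import torch

x = torch.rand(2, 3, 5)                      # [N, C, L]
w = torch.full((2, 1, 5), 1 / 5)             # uniform attention over L
mean = (w * x).sum(2)                        # equals x.mean(dim=2)
std = torch.sqrt((w * (x - mean.unsqueeze(2)).pow(2)).sum(2))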
def forward(self, enc_states, enc_len, dec_states):
    """Returns the output of the attention module.

    Arguments
    ---------
    enc_states : torch.Tensor
        The tensor to be attended.
    enc_len : torch.Tensor
        The real length (without padding) of enc_states for each sentence.
    dec_states : torch.Tensor
        The query tensor.
    """
    if self.precomputed_enc_h is None:
        self.precomputed_enc_h = self.mlp_enc(enc_states)
        self.mask = length_to_mask(
            enc_len, max_len=enc_states.size(1), device=enc_states.device
        )

        # multiply mask by 1/Ln for each row
        self.prev_attn = self.mask * (1 / enc_len.float()).unsqueeze(1)

    # compute location-aware features
    # [B, 1, L] -> [B, C, L]
    attn_conv = self.conv_loc(self.prev_attn.unsqueeze(1))
    # [B, C, L] -> [B, L, C] -> [B, L, F]
    attn_conv = self.mlp_loc(attn_conv.transpose(1, 2))

    dec_h = self.mlp_dec(dec_states.unsqueeze(1))
    attn = self.mlp_attn(
        torch.tanh(self.precomputed_enc_h + dec_h + attn_conv)
    ).squeeze(-1)

    # mask the padded frames
    attn = attn.masked_fill(self.mask == 0, -np.inf)
    attn = self.softmax(attn * self.scaling)

    # set prev_attn to current attn for the next timestep
    self.prev_attn = attn.detach()

    # compute context vectors
    # [B, 1, L] X [B, L, F]
    context = torch.bmm(attn.unsqueeze(1), enc_states).squeeze(1)
    context = self.mlp_out(context)

    return context, attn
def make_masks(self, src, tgt, wav_len=None, pad_idx=0):
    """This method generates the masks for training the transformer model.

    Arguments
    ---------
    src : torch.Tensor
        The sequence to the encoder (required).
    tgt : torch.Tensor
        The sequence to the decoder (required).
    wav_len : torch.Tensor, optional
        Torch Tensor of shape (batch,) containing the relative length to
        padded length for each example.
    pad_idx : int
        The index for <pad> token (default=0).
    """
    src_key_padding_mask = None
    if wav_len is not None and self.training:
        abs_len = torch.round(wav_len * src.shape[1])
        src_key_padding_mask = (1 - length_to_mask(abs_len)).bool()

    tgt_key_padding_mask = get_key_padding_mask(tgt, pad_idx=pad_idx)

    src_mask = None
    tgt_mask = get_lookahead_mask(tgt)
    return src_key_padding_mask, tgt_key_padding_mask, src_mask, tgt_mask
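# get_lookahead_mask is not defined in this snippet. A minimal sketch of a
# causal (lookahead) mask under the usual additive convention -- 0 on and
# below the diagonal, -inf strictly above -- could look like this; the real
# helper may differ in dtype or device handling.
import torch


def lookahead_mask(padded_input):
    """Return a [T, T] mask blocking attention to future positions."""
    T = padded_input.shape[1]
    return torch.triu(
        torch.full((T, T), float("-inf"), device=padded_input.device),
        diagonal=1,
    )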
def encode(
    self,
    src,
    wav_len=None,
):
    """Encoder forward pass.

    Arguments
    ---------
    src : torch.Tensor
        The sequence to the encoder.
    wav_len : torch.Tensor, optional
        Torch Tensor of shape (batch,) containing the relative length to
        padded length for each example.
    """
    # reshape the src vector to [Batch, Time, Fea] if a 4d vector is given
    if src.dim() == 4:
        bz, t, ch1, ch2 = src.shape
        src = src.reshape(bz, t, ch1 * ch2)

    src_key_padding_mask = None
    if wav_len is not None and self.training:
        abs_len = torch.round(wav_len * src.shape[1])
        src_key_padding_mask = (1 - length_to_mask(abs_len)).bool()

    src = self.custom_src_module(src)
    if self.attention_type == "RelPosMHAXL":
        pos_embs_source = self.positional_encoding(src)
    elif self.positional_encoding_type == "fixed_abs_sine":
        src = src + self.positional_encoding(src)
        pos_embs_source = None

    encoder_out, _ = self.encoder(
        src=src,
        src_key_padding_mask=src_key_padding_mask,
        pos_embs=pos_embs_source,
    )
    return encoder_out
def Accuracy(log_probabilities, targets, length=None):
    """Calculates the accuracy for predicted log probabilities and targets
    in a batch.

    Arguments
    ---------
    log_probabilities : torch.Tensor
        Predicted log probabilities (batch_size, time, feature).
    targets : torch.Tensor
        Target (batch_size, time).
    length : torch.Tensor
        Length of target (batch_size,).

    Example
    -------
    >>> probs = torch.tensor([[0.9, 0.1], [0.1, 0.9], [0.8, 0.2]]).unsqueeze(0)
    >>> acc = Accuracy(torch.log(probs), torch.tensor([1, 1, 0]).unsqueeze(0), torch.tensor([2/3]))
    >>> print(acc)
    (1.0, 2.0)
    """
    if length is not None:
        mask = length_to_mask(
            length * targets.shape[1], max_len=targets.shape[1],
        ).bool()
        if len(targets.shape) == 3:
            mask = mask.unsqueeze(2).repeat(1, 1, targets.shape[2])

    padded_pred = log_probabilities.argmax(-1)

    if length is not None:
        numerator = torch.sum(
            padded_pred.masked_select(mask) == targets.masked_select(mask)
        )
        denominator = torch.sum(mask)
    else:
        numerator = torch.sum(padded_pred == targets)
        denominator = targets.shape[1]
    return float(numerator), float(denominator)
def compute_masked_loss(
    loss_fn,
    predictions,
    targets,
    length=None,
    label_smoothing=0.0,
    reduction="mean",
):
    """Compute the true average loss of a set of waveforms of unequal length.

    Arguments
    ---------
    loss_fn : function
        A function for computing the loss taking just predictions and targets.
        Should return all the losses, not a reduction (e.g. reduction="none").
    predictions : torch.Tensor
        First argument to loss function.
    targets : torch.Tensor
        Second argument to loss function.
    length : torch.Tensor
        Length of each utterance to compute mask. If None, global average is
        computed and returned.
    label_smoothing : float
        The proportion of label smoothing. Should only be used for NLL loss.
        Ref: Regularizing Neural Networks by Penalizing Confident Output
        Distributions. https://arxiv.org/abs/1701.06548
    reduction : str
        One of 'mean', 'batch', 'batchmean', or 'none', where 'mean' returns
        a single value, 'batch' returns one value per item in the batch,
        'batchmean' is sum / batch_size, and 'none' returns all.
    """
    mask = torch.ones_like(targets)
    if length is not None:
        length_mask = length_to_mask(
            length * targets.shape[1], max_len=targets.shape[1],
        )

        # Handle any dimensionality of input
        while len(length_mask.shape) < len(mask.shape):
            length_mask = length_mask.unsqueeze(-1)
        length_mask = length_mask.type(mask.dtype)
        mask *= length_mask

    # Compute, then reduce loss
    loss = loss_fn(predictions, targets) * mask
    N = loss.size(0)
    if reduction == "mean":
        loss = loss.sum() / torch.sum(mask)
    elif reduction == "batchmean":
        loss = loss.sum() / N
    elif reduction == "batch":
        loss = loss.reshape(N, -1).sum(1) / mask.reshape(N, -1).sum(1)

    if label_smoothing == 0:
        return loss
    else:
        loss_reg = torch.mean(predictions, dim=1) * mask
        if reduction == "mean":
            loss_reg = torch.sum(loss_reg) / torch.sum(mask)
        elif reduction == "batchmean":
            loss_reg = torch.sum(loss_reg) / targets.shape[0]
        elif reduction == "batch":
            loss_reg = loss_reg.sum(1) / mask.sum(1)

        return -label_smoothing * loss_reg + (1 - label_smoothing) * loss
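# An illustrative call of compute_masked_loss; the lambda merely adapts
# l1_loss to the "no reduction" contract required for loss_fn.
import torch
import torch.nn.functional as F

predictions = torch.rand(2, 5)
targets = torch.rand(2, 5)
length = torch.tensor([1.0, 0.6])            # relative lengths

loss = compute_masked_loss(
    lambda p, t: F.l1_loss(p, t, reduction="none"),
    predictions,
    targets,
    length=length,
)
# The padded positions (last two frames of example 2) do not contribute.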