def apply_mask(self, x, padding_mask):
    B, T, C = x.shape

    if self.mask_prob > 0:
        mask_indices = compute_mask_indices(
            (B, T),
            padding_mask,
            self.mask_prob,
            self.mask_length,
            self.mask_selection,
            self.mask_other,
            min_masks=2,
            no_overlap=self.no_mask_overlap,
            min_space=self.mask_min_space,
        )
        mask_indices = torch.from_numpy(mask_indices).to(x.device)
        x = index_put(x, mask_indices, 0)

    if self.mask_channel_prob > 0:
        mask_channel_indices = compute_mask_indices(
            (B, C),
            None,
            self.mask_channel_prob,
            self.mask_channel_length,
            self.mask_channel_selection,
            self.mask_channel_other,
            no_overlap=self.no_mask_channel_overlap,
            min_space=self.mask_channel_min_space,
        )
        mask_channel_indices = (
            torch.from_numpy(mask_channel_indices)
            .to(x.device)
            .unsqueeze(1)
            .expand(-1, T, -1)
        )
        x = index_put(x, mask_channel_indices, 0)

    return x
def apply_mask_teacher(
    self,
    x,
    padding_mask,
    mask_indices=None,
    mask_channel_indices=None,
):
    # Same masking scheme as apply_mask, but masked time steps are filled with
    # the learned teacher mask embedding, and the time-mask indices are returned
    # so that a student/teacher pair can share them.
    B, T, C = x.shape

    if self.mask_channel_prob > 0 and self.mask_channel_before:
        mask_channel_indices = compute_mask_indices(
            (B, C),
            None,
            self.mask_channel_prob,
            self.mask_channel_length,
            self.mask_channel_selection,
            self.mask_channel_other,
            no_overlap=self.no_mask_channel_overlap,
            min_space=self.mask_channel_min_space,
        )
        mask_channel_indices = (
            torch.from_numpy(mask_channel_indices)
            .to(x.device)
            .unsqueeze(1)
            .expand(-1, T, -1)
        )
        x[mask_channel_indices] = 0

    if self.mask_prob > 0:
        if mask_indices is None:
            mask_indices = compute_mask_indices(
                (B, T),
                padding_mask,
                self.mask_prob,
                self.mask_length,
                self.mask_selection,
                self.mask_other,
                min_masks=2,
                no_overlap=self.no_mask_overlap,
                min_space=self.mask_min_space,
            )
            mask_indices = torch.from_numpy(mask_indices).to(x.device)
        x = index_put(x, mask_indices, self.mask_emb_teacher)
    else:
        mask_indices = None

    if self.mask_channel_prob > 0 and not self.mask_channel_before:
        if mask_channel_indices is None:
            mask_channel_indices = compute_mask_indices(
                (B, C),
                None,
                self.mask_channel_prob,
                self.mask_channel_length,
                self.mask_channel_selection,
                self.mask_channel_other,
                no_overlap=self.no_mask_channel_overlap,
                min_space=self.mask_channel_min_space,
            )
            mask_channel_indices = (
                torch.from_numpy(mask_channel_indices)
                .to(x.device)
                .unsqueeze(1)
                .expand(-1, T, -1)
            )
        x = index_put(x, mask_channel_indices, 0)

    return x, mask_indices
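# --- Hedged illustration (added for clarity; not part of the original model code).
# Both masking methods above expand a per-utterance channel mask of shape (B, C)
# across the time axis before applying it; the helper below mirrors that
# unsqueeze(1).expand(-1, T, -1) pattern on its own so the resulting (B, T, C)
# shape is explicit. The helper name is an assumption made for this sketch.
import torch


def expand_channel_mask_sketch(channel_mask: torch.Tensor, num_frames: int) -> torch.Tensor:
    # (B, C) bool -> (B, T, C) bool: a masked channel is selected at every frame.
    return channel_mask.unsqueeze(1).expand(-1, num_frames, -1)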
def forward(self, x, padding_mask: Optional[torch.Tensor]):
    if self.layernorm is not None:
        x = self.layernorm(x)

    if self.proj is not None:
        x = x + 0.5 * self.proj(x)
        x = self.proj_ln(x)

    if padding_mask is not None:
        x = utils.index_put(x, padding_mask.T, 0)

    # T x B x C -> B x C x T
    x = x.transpose(0, 1).transpose(1, 2)
    out_lens = None
    if padding_mask is not None:
        out_lens = (~padding_mask).sum(1).float()

    for layer in self.layers:
        layerdrop_prob = np.random.random()
        if not self.training or (layerdrop_prob > self.layerdrop):
            x = nn.functional.glu(layer(x), dim=1)
            if padding_mask is not None:
                out_lens = ((out_lens - 1) / self.stride + 1).floor()

    # B x C x T -> T x B x C
    x = x.transpose(1, 2).transpose(0, 1)

    if self.post_proj is not None:
        x = x + 0.5 * self.post_proj(x)
        x = self.post_proj_ln(x)

    out_padding_mask = None
    if padding_mask is not None:
        out_padding_mask = lengths_to_padding_mask(out_lens.long())
        x = utils.index_put(x, out_padding_mask.T, 0)
    return x, out_padding_mask
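# --- Hedged worked example (illustrative; assumes every conv block in the loop
# above uses the same self.stride, as the shared length update suggests). Each
# block shortens the unpadded length with out_len = floor((in_len - 1) / stride + 1),
# so e.g. 37 frames passed through two stride-2 blocks become 37 -> 19 -> 10.
# The function name below is made up for this sketch.
import math


def subsampled_length_sketch(in_len: int, stride: int, num_layers: int) -> int:
    out_len = in_len
    for _ in range(num_layers):
        out_len = math.floor((out_len - 1) / stride + 1)
    return out_len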
def extract_features(self, x, padding_mask=None):
    if padding_mask is not None:
        x = index_put(x, padding_mask, 0)

    x_conv = self.pos_conv(x.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = x + x_conv

    if not self.layer_norm_first:
        x = self.layer_norm(x)

    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    layer_results = []
    for i, layer in enumerate(self.layers):
        dropout_probability = np.random.random()
        if not self.training or (dropout_probability > self.layerdrop):
            x, z = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
            # Collected per-layer outputs (not returned in this variant).
            layer_results.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x
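# --- Hedged note (added for clarity): the layerdrop check above skips each
# transformer block during training with probability self.layerdrop and always
# runs every block at inference, since `not self.training` short-circuits the
# comparison. The helper below only states the resulting expectation; its name
# is an assumption for this sketch.
def expected_blocks_run_sketch(num_layers: int, layerdrop: float) -> float:
    # Each block runs independently with probability (1 - layerdrop) in training.
    return num_layers * (1.0 - layerdrop)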
def compute_preds(self, x, y, negatives):
    # Cosine similarities between the predictions x and the positive target y
    # plus the sampled negatives; the positive candidate is placed at index 0.
    neg_is_pos = (y == negatives).all(-1)
    y = y.unsqueeze(0)
    targets = torch.cat([y, negatives], dim=0)

    logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1)
    logits = logits / self.logit_temp
    logits = logits.type_as(x)

    if is_xla_tensor(logits) or neg_is_pos.any():
        if not hasattr(self, "_inftensor"):
            fillval = -float(2 ** 30)
            self._inftensor = (
                torch.tensor(fillval).to(x.device)
                if is_xla_tensor(logits)
                else float("-inf")
            )
        # Negatives that coincide with the positive are masked out of the logits.
        logits[1:] = index_put(logits[1:], neg_is_pos, self._inftensor)

    return logits
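# --- Hedged usage sketch (assumption, not taken from this file): compute_preds
# returns logits of shape (1 + num_negatives, B, T) with the positive candidate
# in row 0, so a wav2vec 2.0-style contrastive objective can treat every time
# step as a classification whose correct class is always index 0. The exact
# reshaping, padding handling, and reduction used by this codebase may differ;
# the helper name below is invented for this sketch.
import torch
import torch.nn.functional as F


def contrastive_loss_sketch(logits: torch.Tensor) -> torch.Tensor:
    # (1 + K, B, T) -> (B * T, 1 + K): one row per time step, positive first.
    flat = logits.permute(1, 2, 0).reshape(-1, logits.size(0))
    target = flat.new_zeros(flat.size(0), dtype=torch.long)
    return F.cross_entropy(flat, target)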
def extract_features(self, x, padding_mask=None, tgt_layer=None, fix_n=0):
    if padding_mask is not None:
        x = index_put(x, padding_mask, 0)

    x_conv = self.pos_conv(x.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = x + x_conv

    if not self.layer_norm_first:
        x = self.layer_norm(x)

    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    layer_results = []
    r = None
    for i, layer in enumerate(self.layers):
        dropout_probability = np.random.random()
        if not self.training or (dropout_probability > self.layerdrop):
            if i < fix_n:
                # The first fix_n blocks run without gradient tracking,
                # i.e. they are kept frozen.
                with torch.no_grad():
                    x, z = layer(
                        x, self_attn_padding_mask=padding_mask, need_weights=False
                    )
            else:
                x, z = layer(
                    x, self_attn_padding_mask=padding_mask, need_weights=False
                )
            if tgt_layer is not None:
                layer_results.append((x, z))
            if i == tgt_layer:
                r = x
                break

    if r is not None:
        x = r

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x, layer_results
def extract_features(self, x, padding_mask=None, tgt_layer=None):
    if padding_mask is not None:
        x = index_put(x, padding_mask, 0)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # x is T x B x C here
    position_emb = None
    if self.pos_enc_type == "rel_pos":
        position_emb = self.embed_positions(x)

    if not self.layer_norm_first:
        x = self.layer_norm(x)

    x = F.dropout(x, p=self.dropout, training=self.training)

    layer_results = []
    r = None
    for i, layer in enumerate(self.layers):
        dropout_probability = np.random.random()
        if not self.training or (dropout_probability > self.layerdrop):
            x, z = layer(
                x,
                self_attn_padding_mask=padding_mask,
                need_weights=False,
                position_emb=position_emb,
            )
            if tgt_layer is not None:
                layer_results.append((x, z))
            if i == tgt_layer:
                r = x
                break

    if r is not None:
        x = r

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x, layer_results
def extract_features(
    self,
    x,
    padding_mask=None,
    tgt_layer=None,
    min_layer=0,
):
    if padding_mask is not None:
        x = index_put(x, padding_mask, 0)

    x_conv = self.pos_conv(x.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = x + x_conv

    if not self.layer_norm_first:
        x = self.layer_norm(x)

    # pad the sequence length dimension to a multiple of required_seq_len_multiple
    x, pad_length = pad_to_multiple(
        x, self.required_seq_len_multiple, dim=-2, value=0
    )
    if pad_length > 0 and padding_mask is None:
        padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
        padding_mask[:, -pad_length:] = True
    else:
        padding_mask, _ = pad_to_multiple(
            padding_mask, self.required_seq_len_multiple, dim=-1, value=True
        )

    x = F.dropout(x, p=self.dropout, training=self.training)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    layer_results = []
    r = None
    for i, layer in enumerate(self.layers):
        dropout_probability = np.random.random() if self.layerdrop > 0 else 1
        if not self.training or (dropout_probability > self.layerdrop):
            x, (z, lr) = layer(
                x, self_attn_padding_mask=padding_mask, need_weights=False
            )
            if i >= min_layer:
                layer_results.append((x, z, lr))
            if i == tgt_layer:
                r = x
                break

    if r is not None:
        x = r

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    # undo padding
    if pad_length > 0:
        x = x[:, :-pad_length]

        def undo_pad(a, b, c):
            return (
                a[:-pad_length],
                b[:-pad_length] if b is not None else b,
                c[:-pad_length],
            )

        layer_results = [undo_pad(*u) for u in layer_results]

    return x, layer_results
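# --- Hedged illustration (the semantics of pad_to_multiple are inferred from its
# use above rather than restated from its definition): the sequence axis is
# right-padded so its length becomes a multiple of required_seq_len_multiple, the
# padded frames are marked True in the padding mask, and the padding is stripped
# again after the encoder stack. The helper below just computes how many frames
# such a scheme would append; its name is made up for this sketch.
def pad_length_sketch(seq_len: int, multiple: int) -> int:
    remainder = seq_len % multiple
    return 0 if remainder == 0 else multiple - remainder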