class PositionalEmbeddingAudio(nn.Module):
    """This module learns audio positional embeddings up to a fixed maximum size.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
        super().__init__()
        if learned:
            self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        else:
            self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
        self.padding_idx = padding_idx

    def forward(self, input, lengths, incremental_state=None):
        """Input is expected to be of size [bsz x seqlen x feature_dim]."""
        max_length = max(lengths)
        pos_tensor = lengths.new(input.size(0), max_length).fill_(self.padding_idx)
        for i, l in enumerate(lengths):
            pos_tensor[i, :l] = self.padding_idx + 1
        return self.embeddings(pos_tensor)

    def max_positions(self):
        """Maximum number of supported positions."""
        return self.embeddings.max_positions()

    @property
    def weight(self):
        return self.embeddings.weight
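# Illustrative sketch (not part of the original module): how the position
# tensor built in PositionalEmbeddingAudio.forward marks valid audio frames
# with padding_idx + 1 and leaves padded frames at padding_idx, so the wrapped
# positional embedding can infer per-frame positions from the padding mask
# alone. Only torch is assumed; the lengths used here are made up.
def _example_audio_position_tensor():
    import torch
    padding_idx = 1
    lengths = torch.tensor([3, 5, 2])                 # frames per utterance
    bsz, max_len = lengths.size(0), int(lengths.max())
    pos = lengths.new_full((bsz, max_len), padding_idx)
    for i, l in enumerate(lengths):
        pos[i, :l] = padding_idx + 1                  # non-pad marker for real frames
    return pos  # e.g. [[2, 2, 2, 1, 1], [2, 2, 2, 2, 2], [2, 2, 1, 1, 1]]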
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024,
                 pos="learned", num_layers=2, num_heads=8,
                 filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 rank_scale=0.0):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads, rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks,
                self.norm1_blocks, self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)
        return x

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
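# Hedged sketch of the pre-norm residual pattern this encoder relies on. The
# `residual` helper is defined elsewhere in this file and is assumed to apply
# dropout to the sub-layer output before adding it back to the input, i.e. the
# usual "norm -> sub-layer -> dropout -> add" arrangement. Function and
# argument names below are illustrative only.
def _example_prenorm_residual_step(x, sublayer, norm, dropout_p=0.1, training=False):
    import torch.nn.functional as F
    y = sublayer(norm(x))                                     # pre-normalized sub-layer
    return x + F.dropout(y, p=dropout_p, training=training)   # assumed residual() behavior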
class DualPathEncoder(FairseqEncoder):
    """Dual-path encoder with parallel self-attention and convolutional paths."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024,
                 pos="learned", num_layers=2, num_heads=8,
                 filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 convolutions=((256, 3),) * 4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2  # integer padding required by ConvTBC
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        z = x

        # attention path
        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks,
                self.norm1_blocks, self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)

        # convolutional path
        z = self.fc1(z)
        z = z.transpose(0, 1)  # B x T x C -> T x B x C for ConvTBC
        for proj, conv in zip(self.projections, self.convolutions):
            r = z if proj is None else proj(z)  # residual comes from the conv-path stream
            z = F.dropout(z, p=self.dropout, training=self.training)
            z = conv(z)
            z = F.glu(z, dim=2)
            z = (z + r) * math.sqrt(0.5)
        z = z.transpose(1, 0)
        z = self.fc2(z)

        return (x, z)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
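# Hedged sketch of one block of the convolutional path above: project the
# residual stream if the channel count changes, run a padded convolution with
# doubled output channels, gate with GLU, and rescale the residual sum by
# sqrt(0.5), mirroring fairseq's fconv encoder. A plain nn.Conv1d
# (batch x channels x time) stands in for ConvTBC so the sketch stays
# self-contained; names here are illustrative only.
def _example_glu_conv_block(z, conv, proj=None, dropout_p=0.1, training=False):
    import math
    import torch.nn.functional as F
    # z: bsz x seqlen x in_channels
    r = z if proj is None else proj(z)               # residual input
    z = F.dropout(z, p=dropout_p, training=training)
    z = conv(z.transpose(1, 2)).transpose(1, 2)      # bsz x seqlen x 2*out_channels
    z = F.glu(z, dim=2)                              # gate halves the channels
    return (z + r) * math.sqrt(0.5)

# e.g. _example_glu_conv_block(z, nn.Conv1d(256, 512, kernel_size=3, padding=1))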
class TransformerDecoder(FairseqIncrementalDecoder):
    """Transformer decoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024,
                 pos="learned", num_layers=2, num_heads=8,
                 filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 share_embed=False, rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size, hidden_size, hidden_size,
                                          num_heads, rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads, rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)

        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights implies same dimensions " \
                " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size, num_embeddings, dropout=dropout)

    def forward(self, input_tokens, encoder_out, incremental_state=None):
        # compute decoder self-attention biases (padding mask + causal mask)
        input_to_padding = attention_bias_ignore_padding(
            input_tokens, self.dictionary.pad())
        decoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        decoder_self_attention_bias += attention_bias_lower_triangle(
            input_tokens)

        # embed positions (guard mirrors the encoder's handling of pos == "nopos")
        if self.pos != "nopos":
            positions = self.embed_positions(input_tokens, incremental_state)
        else:
            positions = 0

        if incremental_state is not None:
            input_tokens = input_tokens[:, -1:]
            decoder_self_attention_bias = decoder_self_attention_bias[:, -1:, :]

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        avg_attn_scores = None
        num_attn_layers = len(self.encdec_attention_blocks)
        for self_attention, encdec_attention, ffn, norm1, norm2, norm3 in zip(
                self.self_attention_blocks, self.encdec_attention_blocks,
                self.ffn_blocks, self.norm1_blocks, self.norm2_blocks,
                self.norm3_blocks):
            y = self_attention(norm1(x), None, decoder_self_attention_bias,
                               incremental_state)
            x = residual(x, y, self.dropout, self.training)
            if incremental_state is not None:
                y, attn_scores = encdec_attention(norm2(x), encoder_out, None, True)
                attn_scores = attn_scores / self.layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)
            else:
                y = encdec_attention(norm2(x), encoder_out, None)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm3(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_embed(self.out_norm(x))
        return x, avg_attn_scores

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict
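# Hedged sketch of the causal bias added to the decoder's self-attention
# logits. attention_bias_lower_triangle is defined elsewhere in this codebase;
# it is assumed to return zeros at and below the diagonal and a large negative
# value above it, so no position can attend to future positions. Names and the
# -1e9 constant are illustrative only.
def _example_lower_triangle_bias(seq_len, neg_inf=-1e9):
    import torch
    upper = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)  # 1s strictly above diag
    return (upper * neg_inf).unsqueeze(0)                         # 1 x seqlen x seqlen bias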
class DPEncoder(FairseqEncoder):
    """Dual-path encoder that wraps separate attention-path and CNN-path sub-encoders."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024,
                 pos="learned", num_layers=2, num_heads=8,
                 filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 convolutions=4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.attnpath = AttnPathEncoder(self.layers, num_heads=num_heads,
                                        filter_size=filter_size,
                                        hidden_size=hidden_size,
                                        dropout=dropout,
                                        attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers, hidden_size=hidden_size,
                                      dropout=dropout, in_embed=hidden_size,
                                      out_embed=hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)

        attn_x = self.attnpath(x)
        cnn_x = self.cnnpath(x)

        return (attn_x, cnn_x)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024