def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
    super().__init__()
    if learned:
        self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
    else:
        self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
    self.padding_idx = padding_idx
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1, rank_scale=0.0):
    super().__init__(dictionary)
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size,
                               num_heads, rank_scale=rank_scale))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
    self.out_norm = LayerNormalization(hidden_size)
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1, convolutions=4):
    super().__init__(dictionary)
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    self.layers = num_layers
    self.attnpath = AttnPathEncoder(self.layers, num_heads=num_heads,
                                    filter_size=filter_size, hidden_size=hidden_size,
                                    dropout=dropout, attention_dropout=attention_dropout,
                                    relu_dropout=relu_dropout)
    self.cnnpath = CNNPathEncoder(self.layers, hidden_size=hidden_size, dropout=dropout,
                                  in_embed=hidden_size, out_embed=hidden_size)
class PositionalEmbeddingAudio(nn.Module):
    """This module learns audio positional embeddings up to a fixed maximum size.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
        super().__init__()
        if learned:
            self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        else:
            self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
        self.padding_idx = padding_idx

    def forward(self, input, lengths, incremental_state=None):
        """Input is expected to be of size [bsz x seqlen x feature_dim]."""
        max_length = max(lengths)
        # Build a dummy token tensor whose padding mask matches `lengths`: the first
        # `l` steps of each row are marked as non-padding, and the wrapped embedding
        # module derives the actual positions from that mask.
        pos_tensor = lengths.new(input.size(0), max_length).fill_(self.padding_idx)
        for i, l in enumerate(lengths):
            pos_tensor[i, :l] = self.padding_idx + 1
        return self.embeddings(pos_tensor)

    def max_positions(self):
        """Maximum number of supported positions."""
        return self.embeddings.max_positions()

    @property
    def weight(self):
        return self.embeddings.weight
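A minimal usage sketch for PositionalEmbeddingAudio, not taken from the original source: it assumes the fairseq-style LearnedPositionalEmbedding class used above is importable and that its forward accepts a token tensor; the shapes and values are illustrative only.

# Hedged usage sketch: feed a dummy feature batch plus per-utterance lengths.
import torch

bsz, seqlen, feat_dim, pad_idx = 2, 7, 80, 1
features = torch.randn(bsz, seqlen, feat_dim)   # [bsz x seqlen x feature_dim]
lengths = torch.tensor([7, 5])                  # true length of each utterance
pos_module = PositionalEmbeddingAudio(
    num_embeddings=1024, embedding_dim=feat_dim, padding_idx=pad_idx, learned=True)
pos = pos_module(features, lengths)             # positional embeddings; padded steps are ignored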
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False):
    if learned:
        m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim,
                                       padding_idx, left_pad)
        nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad,
                                          num_embeddings + padding_idx + 1)
    return m
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=False):
    if learned:
        m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx)
        nn.init.normal_(m.weight, mean=0, std=0.02)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        m = SinusoidalPositionalEmbedding(
            embedding_dim,
            padding_idx,
            init_size=num_embeddings + padding_idx + 1,
        )
    return m
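For reference, a self-contained sketch of the standard sinusoidal encoding from Vaswani et al. (2017) that the non-learned branch of these factories relies on. This is illustrative only and may differ in detail (padding handling, channel interleaving) from the SinusoidalPositionalEmbedding class used above; it assumes an even embedding dimension.

import math
import torch

def sinusoidal_positions(num_positions, dim):
    """Return a [num_positions x dim] table: sin on even channels, cos on odd."""
    positions = torch.arange(num_positions, dtype=torch.float).unsqueeze(1)
    inv_freq = torch.exp(torch.arange(0, dim, 2, dtype=torch.float)
                         * (-math.log(10000.0) / dim))
    table = torch.zeros(num_positions, dim)
    table[:, 0::2] = torch.sin(positions * inv_freq)
    table[:, 1::2] = torch.cos(positions * inv_freq)
    return table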
def __init__(self, args):
    """Transformer model, following Vaswani et al., 2017."""
    scale_factor = args.encoder_embed_dim // 256
    super(RotomerTransformerModel, self).__init__(scale_factor=scale_factor)
    self.layers = nn.ModuleList([
        TransformerEncoderLayer(args) for i in range(args.encoder_layers)
    ])
    self.fc1 = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    self.fc2 = nn.Linear(args.encoder_embed_dim, 1)
    self.embed_positions = SinusoidalPositionalEmbedding(
        args.encoder_embed_dim,
        -1,
        left_pad=False,
    )
def PositionalEmbeddingCreator(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False):
    if learned:
        m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx)
        nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        m = SinusoidalPositionalEmbedding(
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
            init_size=num_embeddings + padding_idx + 1,
        )
    return m
def __init__(self, n_layers, n_heads, d_model, attn_dropout, relu_dropout, emb_dropout,
             res_dropout, attn_mask, scale_embedding=True):
    super(CrossmodalTransformer, self).__init__()
    self.attn_mask = attn_mask
    self.emb_scale = math.sqrt(d_model) if scale_embedding else 1.0
    self.pos_emb = SinusoidalPositionalEmbedding(d_model, 0, init_size=128)
    self.dropout = nn.Dropout(emb_dropout)
    layer = TransformerEncoderBlock(d_model=d_model, n_heads=n_heads,
                                    d_feedforward=d_model * 4,
                                    attn_dropout=attn_dropout,
                                    res_dropout=res_dropout,
                                    relu_dropout=relu_dropout)
    self.layers = _get_clones(layer, n_layers)
def __init__(
    self,
    d_model,
    nhead,
    emb_dropout,
    attn_dropout,
    res_dropout,
    relu_dropout,
    n_layer,
    attn_mask,
    scale_embedding=True,
):
    super(CrossmodalTransformer, self).__init__()
    self.attn_mask = attn_mask
    self.emb_scale = math.sqrt(d_model) if scale_embedding else 1.0
    self.pos = SinusoidalPositionalEmbedding(d_model, 0, init_size=128)
    self.emb_dropout = emb_dropout
    self.layers = nn.ModuleList([])
    for layer in range(n_layer):
        new_layer = TransformerEncoderBlock(d_model, nhead, d_model * 4,
                                            attn_dropout, res_dropout, relu_dropout)
        self.layers.append(new_layer)
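A hedged construction example for this CrossmodalTransformer variant; the hyperparameter values are illustrative and only the constructor signature shown above is assumed.

xmodal = CrossmodalTransformer(
    d_model=256, nhead=8, emb_dropout=0.1, attn_dropout=0.1,
    res_dropout=0.1, relu_dropout=0.1, n_layer=4, attn_mask=True)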
class TransformerDecoder(FairseqIncrementalDecoder):
    """Transformer decoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
                 num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 share_embed=False, rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        self.layers = num_layers
        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size, hidden_size, hidden_size,
                                          num_heads, rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads, rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)
        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights implies same dimensions " \
                " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size, num_embeddings, dropout=dropout)

    def forward(self, input_tokens, encoder_out, incremental_state=None):
        # split and transpose encoder outputs
        input_to_padding = attention_bias_ignore_padding(
            input_tokens, self.dictionary.pad())
        decoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        decoder_self_attention_bias += attention_bias_lower_triangle(input_tokens)
        # embed positions
        positions = self.embed_positions(input_tokens, incremental_state)
        if incremental_state is not None:
            input_tokens = input_tokens[:, -1:]
            decoder_self_attention_bias = decoder_self_attention_bias[:, -1:, :]
        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)
        avg_attn_scores = None
        num_attn_layers = len(self.encdec_attention_blocks)
        for self_attention, encdec_attention, ffn, norm1, norm2, norm3 in zip(
                self.self_attention_blocks, self.encdec_attention_blocks,
                self.ffn_blocks, self.norm1_blocks, self.norm2_blocks,
                self.norm3_blocks):
            y = self_attention(norm1(x), None, decoder_self_attention_bias,
                               incremental_state)
            x = residual(x, y, self.dropout, self.training)
            if incremental_state is not None:
                y, attn_scores = encdec_attention(norm2(x), encoder_out, None, True)
                attn_scores = attn_scores / self.layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)
            else:
                y = encdec_attention(norm2(x), encoder_out, None)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm3(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_embed(self.out_norm(x))
        return x, avg_attn_scores

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict
class DualPathEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
                 num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
                 convolutions=((256, 3),) * 4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        self.layers = num_layers
        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size, num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)
        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            # padding must be an integer, so use floor division
            pad = (kernel_size - 1) // 2
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)
        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        z = x
        # self-attention path
        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks,
                self.norm1_blocks, self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)
        # convolutional path
        z = self.fc1(z)
        z = z.transpose(0, 1)
        for proj, conv in zip(self.projections, self.convolutions):
            # project the conv-path input (not the attention output) for the residual
            r = z if proj is None else proj(z)
            z = F.dropout(z, p=self.dropout, training=self.training)
            z = conv(z)
            z = F.glu(z, dim=2)
            z = (z + r) * math.sqrt(0.5)
        z = z.transpose(1, 0)
        z = self.fc2(z)
        return (x, z)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
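The conv path above applies a GLU-gated convolution with a scaled residual per block. Below is a small, self-contained sketch of that step written with plain nn.Conv1d over [batch x channels x time] instead of ConvTBC; the shapes and module choice are assumptions made for illustration.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv1d(256, 512, kernel_size=3, padding=1)  # 2x channels so GLU can gate
z = torch.randn(4, 256, 30)                           # [bsz x channels x time]
r = z                                                  # residual (no projection needed here)
out = F.glu(conv(z), dim=1)                            # gate halves channels back to 256
out = (out + r) * math.sqrt(0.5)                       # scaled residual, as in the encoder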
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
             convolutions=((256, 3),) * 4):
    super().__init__(dictionary)
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size, num_heads))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
    self.out_norm = LayerNormalization(hidden_size)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        # padding must be an integer, so use floor division
        pad = (kernel_size - 1) // 2
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    padding=pad, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
    m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad,
                                      num_embeddings + padding_idx + 1)
    return m
def __init__(self, args, dictionary, embed_tokens, left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.padding_idx = embed_tokens.padding_idx
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_input_output_embed
    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.embed_dim
    output_embed_dim = args.output_dim
    padding_idx = embed_tokens.padding_idx
    self.embed_tokens = embed_tokens
    # self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim
    self.max_positions = args.max_positions + 1
    self.embed_segment = nn.Embedding(
        args.num_segment, embed_dim, self.padding_idx,
    ) if args.num_segment > 0 else None
    self.project_in_dim = nn.Linear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None
    self.prediction_word_embedding = nn.Parameter(
        torch.Tensor(1, 1, embed_dim).zero_())
    self.embed_positions = PositionalEmbedding(
        self.max_positions, embed_dim, padding_idx, left_pad=left_pad,
    ) if not args.no_token_positional_embeddings else None

    def make_layers(args, layers, needs_key_values):
        if args.universal:
            # share one layer's parameters across the whole stack
            layers = [
                ShuffleTransformerDecoderLayer(args, needs_key_values=needs_key_values)
            ] * layers
        else:
            layers = [
                ShuffleTransformerDecoderLayer(args, needs_key_values=needs_key_values)
                for _ in range(layers)
            ]
        return nn.ModuleList(layers)

    self.stacked_decoder = args.stacked_decoder
    self.encoder_layers = make_layers(args, args.encoder_layers, needs_key_values=True)
    self.decoder_layers = make_layers(
        args, args.decoder_layers,
        needs_key_values=False) if args.asymmetric else self.encoder_layers
    if not args.stacked_decoder and args.encoder_layers != args.decoder_layers:
        raise ValueError(
            "If not using stacked-decoder, encoder and decoder must have the same number of layers"
        )
    if not args.asymmetric and args.encoder_layers != args.decoder_layers:
        raise ValueError(
            "If not using asymmetric, encoder and decoder must have the same number of layers"
        )
    if args.relative_position == 'sinusoidal':
        num_positions = self.max_positions
        sinusoidal_positions = SinusoidalPositionalEmbedding.get_embedding(
            num_positions, args.embed_dim // args.attention_heads)
        sinusoidal_relative_positions = []
        for i in range(num_positions):
            sinusoidal_relative_positions.append(
                torch.cat([
                    sinusoidal_positions[num_positions - i:],
                    sinusoidal_positions[:num_positions - i]
                ], 0))
        # Make sentinel token have same relative position to everything
        sinusoidal_relative_positions[-1][0] = 0
        assert sinusoidal_relative_positions[-1].size() == sinusoidal_positions.size()
        sinusoidal_relative_positions = torch.stack(sinusoidal_relative_positions, 0)
        self.sinusoidal_relative_positions = nn.Parameter(sinusoidal_relative_positions)
        assert sinusoidal_relative_positions.size() == (
            num_positions, num_positions, args.embed_dim // args.attention_heads)
        # assert (sinusoidal_relative_positions[0] == sinusoidal_positions).all()
        assert (sinusoidal_relative_positions[7, 7] ==
                sinusoidal_relative_positions[11, 11]).all()
        assert (sinusoidal_relative_positions[5, 11] ==
                sinusoidal_relative_positions[6, 12]).all()
    else:
        self.sinusoidal_relative_positions = None
    self.adaptive_softmax = None
    self.project_out_dim = nn.Linear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None
    self.load_softmax = not getattr(args, 'remove_head', False)
    if self.load_softmax:
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(
                torch.Tensor(len(dictionary), output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    # if args.sentence_class_num > 0:
    #     self.sentence_projection_layer = Linear(embed_dim, args.sentence_class_num, bias=False)
    self.normalize = args.normalize_before and final_norm
    if self.normalize:
        self.layer_norm = BertLayerNorm(embed_dim)
    self.apply(self.init_bert_weights)
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
                 num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1, rank_scale=0.0):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        self.layers = num_layers
        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads, rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)
        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks,
                self.norm1_blocks, self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)
        return x

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
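The per-layer loop in the forward above follows the pre-norm residual pattern. The sketch below restates that pattern as a standalone module, assuming `residual(x, y, p, training)` amounts to x + dropout(y); it uses stock PyTorch modules (nn.MultiheadAttention with batch_first, which needs PyTorch >= 1.9) as stand-ins for the project's MultiheadAttention/FeedForwardNetwork and is not the original implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

class PreNormBlock(nn.Module):
    """One pre-norm encoder layer: attention and FFN, each with a dropout residual."""

    def __init__(self, hidden_size, num_heads, filter_size, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(hidden_size, filter_size), nn.ReLU(),
                                 nn.Linear(filter_size, hidden_size))
        self.dropout = dropout

    def forward(self, x, key_padding_mask=None):
        h = self.norm1(x)
        y, _ = self.attn(h, h, h, key_padding_mask=key_padding_mask)
        x = x + F.dropout(y, p=self.dropout, training=self.training)
        y = self.ffn(self.norm2(x))
        x = x + F.dropout(y, p=self.dropout, training=self.training)
        return x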
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
             share_embed=False, rank_scale=0.0):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([2]))
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.encdec_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    self.norm3_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttentionDecoder(hidden_size, hidden_size, hidden_size,
                                      num_heads, rank_scale=rank_scale))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.norm3_blocks.append(LayerNormalization(hidden_size))
        self.encdec_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size,
                               num_heads, rank_scale=rank_scale))
    self.out_norm = LayerNormalization(hidden_size)
    out_embed_dim = hidden_size
    if share_embed:
        assert out_embed_dim == embed_dim, \
            "Shared embed weights implies same dimensions " \
            " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
        self.out_embed = nn.Linear(hidden_size, num_embeddings)
        self.out_embed.weight = self.embed_tokens.weight
    else:
        self.out_embed = Linear(hidden_size, num_embeddings, dropout=dropout)
class DPEncoder(FairseqEncoder):
    """Transformer encoder."""

    def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
                 num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
                 dropout=0.1, attention_dropout=0.1, relu_dropout=0.1, convolutions=4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions, embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim, padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        self.layers = num_layers
        self.attnpath = AttnPathEncoder(self.layers, num_heads=num_heads,
                                        filter_size=filter_size, hidden_size=hidden_size,
                                        dropout=dropout, attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers, hidden_size=hidden_size, dropout=dropout,
                                      in_embed=hidden_size, out_embed=hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)
        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        attn_x = self.attnpath(x)
        cnn_x = self.cnnpath(x)
        return (attn_x, cnn_x)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024