def __init__(self, num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1):
    super(AttnPathEncoder, self).__init__()
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size, num_heads))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
    self.out_norm = LayerNormalization(hidden_size)
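# --- Illustrative sketch (not part of this repo) ---------------------------
# The constructor above only registers the sub-modules; the forward pass is
# not shown in this excerpt. The paired norm1/self-attention and norm2/FFN
# blocks suggest a pre-norm residual layout. The sketch below shows that
# pattern with stock torch.nn modules; the function name and the signatures
# of MultiheadAttention/FeedForwardNetwork/LayerNormalization here are
# assumptions for illustration only and may differ from this repo's API.
import torch
import torch.nn as nn
import torch.nn.functional as F

def prenorm_encoder_layer_sketch(x, attn, ffn, norm1, norm2, dropout=0.1):
    # x: (seq_len, batch, hidden_size)
    residual = x
    y = norm1(x)
    y, _ = attn(y, y, y)                      # self-attention on the normalized input
    x = residual + F.dropout(y, p=dropout)
    residual = x
    y = norm2(x)
    x = residual + F.dropout(ffn(y), p=dropout)
    return x

if __name__ == "__main__":
    hidden, heads = 256, 8
    attn = nn.MultiheadAttention(hidden, heads)
    ffn = nn.Sequential(nn.Linear(hidden, 256), nn.ReLU(), nn.Linear(256, hidden))
    out = prenorm_encoder_layer_sketch(
        torch.randn(10, 2, hidden), attn, ffn,
        nn.LayerNorm(hidden), nn.LayerNorm(hidden))
    print(out.shape)  # torch.Size([10, 2, 256])
# ---------------------------------------------------------------------------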
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
             share_embed=False, rank_scale=0.0):
    super().__init__(dictionary)
    self.register_buffer('version', torch.Tensor([2]))
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.encdec_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    self.norm3_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttentionDecoder(hidden_size, hidden_size, hidden_size,
                                      num_heads, rank_scale=rank_scale))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.norm3_blocks.append(LayerNormalization(hidden_size))
        self.encdec_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size,
                               num_heads, rank_scale=rank_scale))
    self.out_norm = LayerNormalization(hidden_size)
    out_embed_dim = hidden_size
    if share_embed:
        assert out_embed_dim == embed_dim, \
            "Shared embed weights require matching dimensions: " \
            "out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
        self.out_embed = nn.Linear(hidden_size, num_embeddings)
        self.out_embed.weight = self.embed_tokens.weight
    else:
        self.out_embed = Linear(hidden_size, num_embeddings, dropout=dropout)
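# --- Illustrative sketch (not part of this repo) ---------------------------
# Why the assert in the share_embed branch is needed: nn.Embedding.weight has
# shape (num_embeddings, embed_dim) while nn.Linear(hidden_size,
# num_embeddings).weight has shape (num_embeddings, hidden_size), so tying the
# two parameters only works when hidden_size == embed_dim. The standalone
# example below demonstrates the tie with stock torch.nn modules; the names
# are illustrative assumptions, not this repo's code.
import torch
import torch.nn as nn

vocab, dim = 1000, 256
embed = nn.Embedding(vocab, dim, padding_idx=0)
proj = nn.Linear(dim, vocab, bias=False)
proj.weight = embed.weight                    # one shared (vocab, dim) parameter
logits = proj(torch.randn(2, 5, dim))         # (batch, seq, vocab)
print(logits.shape, proj.weight is embed.weight)
# ---------------------------------------------------------------------------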
def __init__(self, dictionary, embed_dim=256, max_positions=1024, pos="learned",
             num_layers=2, num_heads=8, filter_size=256, hidden_size=256,
             dropout=0.1, attention_dropout=0.1, relu_dropout=0.1,
             convolutions=((256, 3),) * 4):
    super().__init__(dictionary)
    assert pos == "learned" or pos == "timing" or pos == "nopos"
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.relu_dropout = relu_dropout
    self.pos = pos
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    if self.pos == "learned":
        self.embed_positions = PositionalEmbedding(
            max_positions, embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    if self.pos == "timing":
        self.embed_positions = SinusoidalPositionalEmbedding(
            embed_dim, padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
    self.layers = num_layers
    self.self_attention_blocks = nn.ModuleList()
    self.ffn_blocks = nn.ModuleList()
    self.norm1_blocks = nn.ModuleList()
    self.norm2_blocks = nn.ModuleList()
    for i in range(num_layers):
        self.self_attention_blocks.append(
            MultiheadAttention(hidden_size, hidden_size, hidden_size, num_heads))
        self.ffn_blocks.append(
            FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
        self.norm1_blocks.append(LayerNormalization(hidden_size))
        self.norm2_blocks.append(LayerNormalization(hidden_size))
    self.out_norm = LayerNormalization(hidden_size)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        # integer division so the padding passed to ConvTBC is an int, not a float
        pad = (kernel_size - 1) // 2
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    padding=pad, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)
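# --- Illustrative sketch (not part of this repo) ---------------------------
# padding = (kernel_size - 1) // 2 keeps the sequence length unchanged for odd
# kernel sizes, and the doubled output channels (out_channels * 2) are
# consistent with a GLU gate that splits the conv output back in half, as in
# fairseq-style convolutional encoders. That gating is an assumption here,
# since the forward pass is not shown. nn.Conv1d stands in for ConvTBC purely
# for illustration (ConvTBC expects a time-batch-channel layout).
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, seq_len, in_ch, out_ch, k = 2, 7, 256, 256, 3
conv = nn.Conv1d(in_ch, out_ch * 2, k, padding=(k - 1) // 2)
x = torch.randn(batch, in_ch, seq_len)        # (batch, channels, time)
y = F.glu(conv(x), dim=1)                     # gate halves channels back to out_ch
print(y.shape)                                # torch.Size([2, 256, 7]): length preserved
# ---------------------------------------------------------------------------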