def __init__(self, params, name="cached_decoder"): super().__init__(name=name) self.normalization = params.normalization self.enable_cache = params.enable_decoder_cache self.enable_relative_positional_embedding = params.enable_relative_positional_embedding self.query_method = params.tgt_query_method self.dropout = params.residual_dropout with utils.scope(name): self.cache = Cache(params, name="decoder_cache") if self.query_method == "single_linear": self.query_transform = nn.Sequential(nn.Linear(params.hidden_size, self.cache_dk), nn.Tanh()) self.layers = nn.ModuleList([CachedTransformerDecoderLayer(params, name="layer_%d" % i) for i in range(params.num_decoder_layers)]) if params.enable_relative_positional_embedding: self.pos_emb = PositionalEmbedding(params.hidden_size) self.pos_bias_u = nn.Parameter(torch.Tensor(params.num_heads, params.hidden_size // params.num_heads)) self.pos_bias_v = nn.Parameter(torch.Tensor(params.num_heads, params.hidden_size // params.num_heads)) self.add_name(self.pos_bias_u, "pos_bias_u") self.add_name(self.pos_bias_v, "pos_bias_v") else: self.pos_bias_u, self.pos_bias_v = None, None if self.normalization == "before": self.layer_norm = modules.LayerNorm(params.hidden_size) else: self.layer_norm = None self.reset_parameters()
def __init__(self, params, name="attention"): super(AttentionSubLayer, self).__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization with utils.scope(name): self.attention = modules.MultiHeadAttention( params.hidden_size, params.num_heads, params.attention_dropout) self.layer_norm = modules.LayerNorm(params.hidden_size)
def __init__(self, params, dtype=None, name="ffn_layer"): super(FFNSubLayer, self).__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization with utils.scope(name): self.ffn_layer = modules.FeedForward(params.hidden_size, params.filter_size, dropout=params.relu_dropout) self.layer_norm = modules.LayerNorm(params.hidden_size)
def __init__(self, params, name="encoder"): super(TransformerEncoder, self).__init__(name=name) self.normalization = params.normalization with utils.scope(name): self.layers = nn.ModuleList([ TransformerEncoderLayer(params, name="layer_%d" % i) for i in range(params.num_encoder_layers)]) if self.normalization == "before": self.layer_norm = modules.LayerNorm(params.hidden_size) else: self.layer_norm = None
def __init__(self, params, name="learnableselfattention"): super().__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization self.gated = params.enable_residual_gate if self.gated: hidden_size = params.hidden_size self.W_x = Affine(hidden_size, hidden_size, name="W_x") self.W_y = Affine(hidden_size, hidden_size, name="W_y") with utils.scope(name): self.attention = modules.LearnableMultiHeadSelfAttention(params.hidden_size, params.num_heads, params.attention_dropout, params.enable_relative_positional_embedding, params.enable_sentence_embedding) self.layer_norm = modules.LayerNorm(params.hidden_size)