def __init__(self, params, name="cached_decoder"): super().__init__(name=name) self.normalization = params.normalization self.enable_cache = params.enable_decoder_cache self.enable_relative_positional_embedding = params.enable_relative_positional_embedding self.query_method = params.tgt_query_method self.dropout = params.residual_dropout with utils.scope(name): self.cache = Cache(params, name="decoder_cache") if self.query_method == "single_linear": self.query_transform = nn.Sequential(nn.Linear(params.hidden_size, self.cache_dk), nn.Tanh()) self.layers = nn.ModuleList([CachedTransformerDecoderLayer(params, name="layer_%d" % i) for i in range(params.num_decoder_layers)]) if params.enable_relative_positional_embedding: self.pos_emb = PositionalEmbedding(params.hidden_size) self.pos_bias_u = nn.Parameter(torch.Tensor(params.num_heads, params.hidden_size // params.num_heads)) self.pos_bias_v = nn.Parameter(torch.Tensor(params.num_heads, params.hidden_size // params.num_heads)) self.add_name(self.pos_bias_u, "pos_bias_u") self.add_name(self.pos_bias_v, "pos_bias_v") else: self.pos_bias_u, self.pos_bias_v = None, None if self.normalization == "before": self.layer_norm = modules.LayerNorm(params.hidden_size) else: self.layer_norm = None self.reset_parameters()
def __init__(self, hidden_size, num_heads, dropout=0.0, enable_rel_emb=True,
             enable_sent_emb=False, gated=False,
             name="learnable_multihead_selfattention"):
    super().__init__(name=name)
    self.num_heads = num_heads
    self.hidden_size = hidden_size
    self.dropout = dropout
    self.enable_rel_emb = enable_rel_emb
    self.enable_sent_emb = enable_sent_emb
    self.gated = gated

    if enable_sent_emb:
        # Optional sentence-level positional embedding.
        self.sent_emb = PositionalEmbedding(hidden_size)

    if gated:
        # Projections used only when gating is enabled.
        self.W_c = Affine(hidden_size, hidden_size, name="W_c")
        self.W_i = Affine(hidden_size, hidden_size, name="W_i")

    with utils.scope(name):
        self.q_transform = Affine(hidden_size, hidden_size, name="q_transform")
        self.k_transform = Affine(hidden_size, hidden_size, name="k_transform")
        self.v_transform = Affine(hidden_size, hidden_size, name="v_transform")
        self.o_transform = Affine(hidden_size, hidden_size, name="o_transform")

        if self.enable_rel_emb:
            # Projection applied to the relative position embeddings.
            self.r_transform = Affine(hidden_size, hidden_size, name="r_transform")

    self.reset_parameters()
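# Hedged sketch (not this repository's forward pass): given the q/k/r
# projections above and the pos_bias_u / pos_bias_v parameters created by the
# decoder, relative-position attention scores are typically combined in the
# Transformer-XL style shown below. All tensor names and shapes here are
# illustrative assumptions; the usual relative-shift step is omitted.
import torch

def rel_attention_scores(q, k, r, pos_bias_u, pos_bias_v):
    # q, k: [batch, heads, length, depth]
    # r:    [heads, length, depth] relative position embeddings (after r_transform)
    # pos_bias_u, pos_bias_v: [heads, depth]
    content = torch.matmul(q + pos_bias_u.unsqueeze(1), k.transpose(-2, -1))
    position = torch.matmul(q + pos_bias_v.unsqueeze(1), r.transpose(-2, -1))
    # Scaled sum of content-based and position-based scores:
    # [batch, heads, length, length]
    return (content + position) / (q.shape[-1] ** 0.5)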
def __init__(self, params, name="layer"): super(TransformerDecoderLayer, self).__init__(name=name) with utils.scope(name): self.self_attention = AttentionSubLayer(params, name="self_attention") self.encdec_attention = AttentionSubLayer(params, name="encdec_attention") self.feed_forward = FFNSubLayer(params)
def __init__(self, params, name="layer"): super().__init__(name=name) with utils.scope(name): self.self_attention = LearnableSelfAttentionSubLayer(params, name="self_attention") self.encdec_attention = AttentionSubLayer(params, name="encdec_attention") self.feed_forward = FFNSubLayer(params)
def __init__(self, params, name="attention"): super(AttentionSubLayer, self).__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization with utils.scope(name): self.attention = modules.MultiHeadAttention( params.hidden_size, params.num_heads, params.attention_dropout) self.layer_norm = modules.LayerNorm(params.hidden_size)
def __init__(self, params, dtype=None, name="ffn_layer"): super(FFNSubLayer, self).__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization with utils.scope(name): self.ffn_layer = modules.FeedForward(params.hidden_size, params.filter_size, dropout=params.relu_dropout) self.layer_norm = modules.LayerNorm(params.hidden_size)
def __init__(self, params, name="encoder"): super(TransformerEncoder, self).__init__(name=name) self.normalization = params.normalization with utils.scope(name): self.layers = nn.ModuleList([ TransformerEncoderLayer(params, name="layer_%d" % i) for i in range(params.num_encoder_layers)]) if self.normalization == "before": self.layer_norm = modules.LayerNorm(params.hidden_size) else: self.layer_norm = None
def __init__(self, q_size, k_size, hidden_size, name="attention"): super(Attention, self).__init__(name) self._q_size = q_size self._k_size = k_size self._hidden_size = hidden_size with utils.scope(name): self.q_transform = Affine(q_size, hidden_size, name="q_transform") self.k_transform = Affine(k_size, hidden_size, name="k_transform") self.v_transform = Affine(hidden_size, 1, name="v_transform") self.reset_parameters()
def __init__(self, in_features, out_features, bias=True, name="affine"):
    super(Affine, self).__init__(name=name)
    self.in_features = in_features
    self.out_features = out_features

    with utils.scope(name):
        self.weight = nn.Parameter(torch.Tensor(out_features, in_features))
        self.add_name(self.weight, "weight")

        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_features))
            self.add_name(self.bias, "bias")
        else:
            self.register_parameter("bias", None)

    self.reset_parameters()
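# Hedged sketch: Affine stores its weight as [out_features, in_features] plus
# an optional bias, i.e. the same layout as torch.nn.Linear, so its forward
# pass presumably reduces to F.linear over the last dimension. This helper is
# an illustrative equivalent, not the module's actual forward().
import torch
import torch.nn.functional as F

def affine_forward(x, weight, bias=None):
    # x: [..., in_features] -> [..., out_features]
    return F.linear(x, weight, bias)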
def __init__(self, input_size, hidden_size, output_size=None, dropout=0.0,
             name="feed_forward"):
    super(FeedForward, self).__init__(name=name)
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size or input_size
    self.dropout = dropout

    with utils.scope(name):
        self.input_transform = Affine(input_size, hidden_size,
                                      name="input_transform")
        self.output_transform = Affine(hidden_size, self.output_size,
                                       name="output_transform")

    self.reset_parameters()
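# Hedged sketch of the position-wise feed-forward computation implied by the
# two projections above: expand to hidden_size, apply ReLU and dropout
# (params.relu_dropout), then project back to output_size. This is the
# standard Transformer FFN; the actual forward() may differ in detail.
import torch
import torch.nn.functional as F

def ffn_forward(x, input_transform, output_transform, dropout, training=True):
    h = F.relu(input_transform(x))                  # [..., hidden_size]
    h = F.dropout(h, p=dropout, training=training)
    return output_transform(h)                      # [..., output_size]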
def __init__(self, params, name="transformer"): super(Transformer, self).__init__(name=name) self.params = params with utils.scope(name): self.build_embedding(params) self.encoding = modules.PositionalEmbedding() self.encoder = TransformerEncoder(params) self.decoder = TransformerDecoder(params) self.criterion = modules.SmoothedCrossEntropyLoss( params.label_smoothing) self.dropout = params.residual_dropout self.hidden_size = params.hidden_size self.num_encoder_layers = params.num_encoder_layers self.num_decoder_layers = params.num_decoder_layers self.reset_parameters()
def __init__(self, params, name="learnableselfattention"): super().__init__(name=name) self.dropout = params.residual_dropout self.normalization = params.normalization self.gated = params.enable_residual_gate if self.gated: hidden_size = params.hidden_size self.W_x = Affine(hidden_size, hidden_size, name="W_x") self.W_y = Affine(hidden_size, hidden_size, name="W_y") with utils.scope(name): self.attention = modules.LearnableMultiHeadSelfAttention(params.hidden_size, params.num_heads, params.attention_dropout, params.enable_relative_positional_embedding, params.enable_sentence_embedding) self.layer_norm = modules.LayerNorm(params.hidden_size)
def __init__(self, q_size, k_size, hidden_size, num_heads, dropout=0.0,
             name="multihead_attention"):
    super(MultiHeadAdditiveAttention, self).__init__(name=name)
    self.num_heads = num_heads
    self.hidden_size = hidden_size
    self.dropout = dropout

    with utils.scope(name):
        self.q_transform = Affine(q_size, hidden_size, name="q_transform")
        self.k_transform = Affine(k_size, hidden_size, name="k_transform")
        # v_transform maps the additive hidden state to one score per head.
        self.v_transform = Affine(hidden_size, num_heads, name="v_transform")
        self.o_transform = Affine(k_size, k_size, name="o_transform")

    self.reset_parameters()
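# Hedged sketch of the multi-head *additive* (Bahdanau-style) scoring implied
# by the projections above: queries and keys are mapped into a shared hidden
# space, combined with tanh, and v_transform (hidden_size -> num_heads) turns
# each query/key pair into one logit per head. Illustrative only; the real
# forward() may handle masking and broadcasting differently.
import torch

def additive_scores(q_hidden, k_hidden, v_transform):
    # q_hidden: [batch, q_len, hidden], k_hidden: [batch, k_len, hidden]
    combined = torch.tanh(q_hidden.unsqueeze(2) + k_hidden.unsqueeze(1))
    # combined: [batch, q_len, k_len, hidden] -> logits: [batch, heads, q_len, k_len]
    return v_transform(combined).permute(0, 3, 1, 2)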
def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True,
             name="layer_norm"):
    super(LayerNorm, self).__init__(name=name)

    if isinstance(normalized_shape, numbers.Integral):
        normalized_shape = (normalized_shape,)

    self.normalized_shape = tuple(normalized_shape)
    self.eps = eps
    self.elementwise_affine = elementwise_affine

    with utils.scope(name):
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.Tensor(*normalized_shape))
            self.bias = nn.Parameter(torch.Tensor(*normalized_shape))
            self.add_name(self.weight, "weight")
            self.add_name(self.bias, "bias")
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    self.reset_parameters()
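# Hedged usage sketch: like torch.nn.LayerNorm, this constructor accepts
# either a single integer or a shape tuple, and elementwise_affine controls
# whether the learned weight/bias parameters are created. Illustrative only.
norm = LayerNorm(512)                                     # normalize over a final dim of 512
no_affine = LayerNorm((512,), elementwise_affine=False)   # no learned scale/shift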