def __init__(self, args):
    super(TransformerDecoderLayer, self).__init__()

    self.layernorm_positioning = args.layernorm_positioning

    if hasattr(args, "attention_head_size"):
        attention_head_size = args.attention_head_size
    else:
        attention_head_size = args.hidden_size // args.heads_num

    has_bias = bool(1 - args.remove_transformer_bias)
    with_scale = bool(1 - args.remove_attention_scale)

    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias, with_scale=with_scale
    )
    self.dropout_1 = nn.Dropout(args.dropout)

    # Multi-headed context-attention (attends over the encoder output).
    self.context_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias, with_scale=with_scale
    )
    self.dropout_2 = nn.Dropout(args.dropout)

    # Feed forward layer.
    if args.feed_forward == "gated":
        self.feed_forward = GatedFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    else:
        self.feed_forward = PositionwiseFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    self.dropout_3 = nn.Dropout(args.dropout)

    # Layer normalization (the T5 variant omits the centering/bias term).
    if args.layernorm == "t5":
        self.layer_norm_1 = T5LayerNorm(args.hidden_size)
        self.layer_norm_2 = T5LayerNorm(args.hidden_size)
        self.layer_norm_3 = T5LayerNorm(args.hidden_size)
    else:
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
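# Illustrative sketch (not part of the original source): the hyperparameters this
# layer reads from `args`, collected into an argparse.Namespace. attention_head_size
# is deliberately omitted so the hidden_size // heads_num fallback above is exercised.
# Attribute names mirror the lookups in __init__; the concrete values are assumptions
# for demonstration only.
from argparse import Namespace

decoder_layer_args = Namespace(
    hidden_size=768, heads_num=12, dropout=0.1,
    feedforward_size=3072, hidden_act="gelu", feed_forward="dense",
    layernorm="normal", layernorm_positioning="post",
    remove_transformer_bias=False, remove_attention_scale=False,
)
layer = TransformerDecoderLayer(decoder_layer_args)  # hypothetical instantiation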
def __init__(self, args):
    super(TransformerDecoder, self).__init__()
    self.layers_num = args.layers_num
    self.layernorm_positioning = args.layernorm_positioning
    self.relative_position_embedding = args.relative_position_embedding

    # Stack of decoder layers.
    self.transformer_decoder = nn.ModuleList(
        [TransformerDecoderLayer(args) for _ in range(self.layers_num)]
    )

    # Optional DeepSpeed activation checkpointing.
    if "deepspeed_checkpoint_activations" in args:
        self.deepspeed_checkpoint_activations = args.deepspeed_checkpoint_activations
        self.deepspeed_checkpoint_layers_num = args.deepspeed_checkpoint_layers_num
    else:
        self.deepspeed_checkpoint_activations = False

    has_bias = bool(1 - args.remove_transformer_bias)

    # With pre-layernorm, a final layer normalization is applied after the last layer.
    if self.layernorm_positioning == "pre":
        if args.layernorm == "t5":
            self.layer_norm = T5LayerNorm(args.hidden_size)
        else:
            self.layer_norm = LayerNorm(args.hidden_size)

    # Relative position embedding (T5-style buckets) for decoder self-attention;
    # unidirectional because the decoder only attends to previous positions.
    if self.relative_position_embedding:
        self.self_pos_emb = RelativePositionEmbedding(
            bidirectional=False,
            heads_num=args.heads_num,
            num_buckets=args.relative_attention_buckets_num
        )
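# Illustrative sketch (not part of the original source): a Namespace carrying the
# stack-level options read above (pre-layernorm, T5-style relative positions,
# DeepSpeed checkpointing flags) plus the per-layer options consumed by
# TransformerDecoderLayer. All values are assumptions for demonstration;
# argparse.Namespace supports the `"key" in args` membership test used above.
from argparse import Namespace

decoder_stack_args = Namespace(
    # Per-layer options.
    hidden_size=768, heads_num=12, dropout=0.1,
    feedforward_size=3072, hidden_act="gelu", feed_forward="dense",
    remove_transformer_bias=False, remove_attention_scale=False,
    # Stack-level options.
    layers_num=6, layernorm="t5", layernorm_positioning="pre",
    relative_position_embedding=True, relative_attention_buckets_num=32,
    deepspeed_checkpoint_activations=False, deepspeed_checkpoint_layers_num=1,
)
decoder = TransformerDecoder(decoder_stack_args)  # hypothetical instantiation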
def __init__(self, args):
    super(TransformerEncoder, self).__init__()
    self.mask = args.mask
    self.layers_num = args.layers_num
    self.parameter_sharing = args.parameter_sharing
    self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
    self.layernorm_positioning = args.layernorm_positioning
    self.relative_position_embedding = args.relative_position_embedding
    self.has_residual_attention = args.has_residual_attention

    # Optional DeepSpeed activation checkpointing.
    if "deepspeed_checkpoint_activations" in args:
        self.deepspeed_checkpoint_activations = args.deepspeed_checkpoint_activations
        self.deepspeed_checkpoint_layers_num = args.deepspeed_checkpoint_layers_num
    else:
        self.deepspeed_checkpoint_activations = False

    has_bias = bool(1 - args.remove_transformer_bias)

    # Factorized embedding parameterization: project emb_size up to hidden_size.
    if self.factorized_embedding_parameterization:
        self.linear = nn.Linear(args.emb_size, args.hidden_size)

    # Parameter sharing: one layer reused across the stack,
    # otherwise an independent layer per position.
    if self.parameter_sharing:
        self.transformer = TransformerLayer(args)
    else:
        self.transformer = nn.ModuleList(
            [TransformerLayer(args) for _ in range(self.layers_num)]
        )

    # With pre-layernorm, a final layer normalization is applied after the last layer.
    if self.layernorm_positioning == "pre":
        if args.layernorm == "t5":
            self.layer_norm = T5LayerNorm(args.hidden_size)
        else:
            self.layer_norm = LayerNorm(args.hidden_size)

    # Relative position embedding (T5-style buckets); bidirectional for the encoder.
    if self.relative_position_embedding:
        self.relative_pos_emb = RelativePositionEmbedding(
            bidirectional=True,
            heads_num=args.heads_num,
            num_buckets=args.relative_attention_buckets_num
        )
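# Illustrative sketch (not part of the original source): an encoder configured with
# parameter sharing and factorized embedding parameterization, the two options
# handled above. Attribute names mirror the lookups in __init__; the values, and the
# per-layer options assumed to be read by TransformerLayer (not shown here), are
# assumptions for demonstration only.
from argparse import Namespace

encoder_args = Namespace(
    # Options read by TransformerEncoder itself.
    mask="fully_visible", layers_num=12,
    parameter_sharing=True, factorized_embedding_parameterization=True,
    emb_size=128, hidden_size=768,
    layernorm="normal", layernorm_positioning="post",
    relative_position_embedding=False, relative_attention_buckets_num=32,
    has_residual_attention=False, remove_transformer_bias=False,
    # Per-layer options assumed to be read by TransformerLayer.
    heads_num=12, dropout=0.1, feedforward_size=3072,
    hidden_act="gelu", feed_forward="dense", remove_attention_scale=False,
)
encoder = TransformerEncoder(encoder_args)  # hypothetical instantiation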