def __init__(self, d_model, kernel_size, num_heads, dropout, weight_softmax=True):
    super(LConvBlock, self).__init__()
    self.embed_dim = d_model
    padding_l = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )
    self.act_linear = LinearNorm(self.embed_dim, 2 * self.embed_dim, bias=True)
    self.act = nn.GLU()
    self.conv_layer = LightweightConv(
        self.embed_dim,
        kernel_size,
        padding_l=padding_l,
        weight_softmax=weight_softmax,
        num_heads=num_heads,
        weight_dropout=dropout,
    )
    self.fc1 = LinearNorm(self.embed_dim, 4 * self.embed_dim, bias=True)
    self.fc2 = LinearNorm(4 * self.embed_dim, self.embed_dim, bias=True)
    self.layer_norm = nn.LayerNorm(self.embed_dim)
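# Hedged sketch of the GLU projection used by LConvBlock above (not taken from
# the original source): act_linear doubles the channel dimension and nn.GLU()
# gates it back down, so the lightweight conv still sees embed_dim channels.
import torch
import torch.nn as nn

embed_dim = 8
glu = nn.GLU()                                   # splits the last dim in half and gates
x = torch.randn(10, 2, 2 * embed_dim)            # (time, batch, 2 * embed_dim)
assert glu(x).shape == (10, 2, embed_dim)        # back to embed_dim for the conv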
def get_layer(self, args, index, out_dim, num_heads, layer_type):
    kernel_size = layer_type.split(':')[1]
    if kernel_size == 'default':
        kernel_size = args.encoder_kernel_size_list[index]
    else:
        kernel_size = int(kernel_size)
    padding_l = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )
    if 'lightweight' in layer_type:
        layer = LightweightConv(
            out_dim,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
        )
    elif 'dynamic' in layer_type:
        layer = DynamicConv(
            out_dim,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
        )
    elif 'attn' in layer_type:
        layer = MultiheadAttention(
            out_dim,
            num_heads,
            dropout=args.attention_dropout,
            self_attention=True,
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )
    else:
        raise NotImplementedError
    return layer
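# Hedged illustration of the '<type>:<kernel>' strings the get_layer helpers
# parse, e.g. 'lightweight:default', 'dynamic:7', 'attn:default' (example
# values only, not taken from the original configs). 'default' defers to the
# per-layer kernel-size list in args.
layer_type = 'dynamic:7'
name, kernel = layer_type.split(':')
kernel_size = None if kernel == 'default' else int(kernel)
print(name, kernel_size)                         # dynamic 7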
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )
    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
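# Hedged note on the encoder-side padding above: an odd kernel gets the usual
# symmetric padding k // 2, while an even kernel gets the asymmetric pair
# ((k - 1) // 2, k // 2) so the convolved sequence keeps its length.
for k in (3, 4):
    padding_l = k // 2 if k % 2 == 1 else ((k - 1) // 2, k // 2)
    print(k, padding_l)                          # 3 -> 1, 4 -> (1, 2)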
def get_layer(self, args, index, out_dim, num_heads, layer_type, add_bias_kv, add_zero_attn):
    kernel_size = layer_type.split(':')[1]
    if kernel_size == 'default':
        kernel_size = args.decoder_kernel_size_list[index]
    else:
        kernel_size = int(kernel_size)
    layer_type = layer_type.split(':')[0]
    if layer_type == 'lightweight':
        layer = LightweightConv(
            out_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
            with_linear=args.conv_linear,
        )
    elif layer_type == 'dynamic':
        layer = DynamicConv(
            out_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
            with_linear=args.conv_linear,
            glu=args.decoder_glu,
        )
    elif layer_type == 'attn':
        layer = MultiheadAttention(
            embed_dim=out_dim,
            num_heads=num_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=True,
        )
    else:
        raise NotImplementedError
    return layer
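# Hedged illustration of why the decoder-side layers use padding_l = kernel_size - 1:
# padding only on the left makes the convolution causal, so position t never
# mixes in inputs from positions > t. Shown with a plain conv1d and a box
# filter as a stand-in for the fairseq lightweight/dynamic kernels.
import torch
import torch.nn.functional as F

k = 3
x = torch.arange(1., 6.).view(1, 1, 5)           # one channel, length 5: [1, 2, 3, 4, 5]
w = torch.ones(1, 1, k)                          # box filter
y = F.conv1d(F.pad(x, (k - 1, 0)), w)            # left-pad only -> causal output
print(y)                                         # tensor([[[ 1.,  3.,  6.,  9., 12.]]])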
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.relu_dropout_module = FairseqDropout(args.relu_dropout, module_name=self.__class__.__name__)
    self.input_dropout_module = FairseqDropout(args.input_dropout, module_name=self.__class__.__name__)
    self.normalize_before = args.decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(self, args, no_encoder_attn=False, kernel_size=0, use_linear_se=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        if use_linear_se:
            self.linear1 = LinearSE(self.embed_dim, 2 * self.conv_dim)
        else:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        if use_linear_se:
            self.linear1 = LinearSE(self.embed_dim, self.conv_dim)
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == 'lightweight':
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == 'dynamic':
        self.conv = DynamicConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    if use_linear_se:
        self.linear2 = LinearSE(self.conv_dim, self.embed_dim)
    else:
        self.linear2 = Linear(self.conv_dim, self.embed_dim)
    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
    if use_linear_se:
        self.fc1 = LinearSE(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = LinearSE(args.decoder_ffn_embed_dim, self.embed_dim)
    else:
        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def get_layer(self, args, index, out_dim, num_heads, layer_type, add_bias_kv, add_zero_attn):
    kernel_size = layer_type.split(':')[1]
    if kernel_size == 'default':
        kernel_size = args.decoder_kernel_size_list[index]
    else:
        kernel_size = int(kernel_size)
    layer_type = layer_type.split(':')[0]
    if layer_type == 'lightweight':
        layer = LightweightConv(
            out_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
        )
    elif layer_type == 'dynamic':
        layer = DynamicConv(
            out_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=num_heads,
            weight_dropout=args.weight_dropout,
        )
    elif layer_type == 'attn':
        layer = MultiheadAttention(
            out_dim,
            num_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not getattr(args, "cross_self_attention", False),
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )
    else:
        raise NotImplementedError
    return layer
class LightConvDecoderLayer(nn.Module):
    def __init__(self, embed_dim, conv_dim, num_heads, kernel_size, weight_dropout=0.1,
                 dropout=0.3, input_dropout=0.0, weight_softmax=True, encoder_glu=False,
                 normalize_before=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.conv_dim = conv_dim
        if encoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim)
            self.act = None
        self.conv = LightweightConv(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=weight_softmax,
            num_heads=num_heads,
            weight_dropout=weight_dropout,
        )
        self.linear2 = Linear(self.conv_dim, self.embed_dim)
        self.dropout = dropout
        self.input_dropout = input_dropout
        self.normalize_before = normalize_before
        self.conv_layer_norm = LayerNorm(self.embed_dim)

    def forward(self, x, incremental_state=None, prev_conv_state=None, **unused):
        residual = x
        x = self.maybe_layer_norm(x, before=True)
        if prev_conv_state is not None:
            if incremental_state is None:
                incremental_state = {}
            self.conv._set_input_buffer(incremental_state, prev_conv_state)
        x = F.dropout(x, p=self.input_dropout, training=self.training)
        x = self.linear1(x)
        if self.act is not None:
            x = self.act(x)
        x = self.conv(x, incremental_state=incremental_state)
        x = self.linear2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(x, after=True)
        return x, None

    def maybe_layer_norm(self, x, before=False, after=False):
        assert before ^ after
        if after ^ self.normalize_before:
            return self.conv_layer_norm(x)
        else:
            return x

    def extra_repr(self):
        return 'dropout={}, input_dropout={}, normalize_before={}'.format(
            self.dropout, self.input_dropout, self.normalize_before)
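# Minimal usage sketch for LightConvDecoderLayer above, assuming LightweightConv
# comes from fairseq.modules and Linear/LayerNorm are the usual thin wrappers
# around nn.Linear/nn.LayerNorm; sizes are illustrative, and inputs follow the
# fairseq (time, batch, channels) convention.
import torch

layer = LightConvDecoderLayer(embed_dim=256, conv_dim=256, num_heads=4,
                              kernel_size=3, encoder_glu=True)
x = torch.randn(20, 8, 256)                      # (T, B, C)
out, _ = layer(x)                                # no incremental decoding state
assert out.shape == x.shape                      # residual block preserves shape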