def __init__(self, layer_id, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention820(
        self.embed_dim, args.encoder_attention_heads,
        layer_id=layer_id, args=args,
        dropout=args.attention_dropout, cur_attn_type='es',
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, args=args)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim,
                      layer_id=layer_id, cur_linear='fc1', args=args)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim,
                      layer_id=layer_id, cur_linear='fc2', args=args)
    # optional sub-modules, left unset here
    self.info_linear = None
    self.se = None
    self.input_dropout = args.input_dropout if 'input_dropout' in args else 0.0
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = kernel_size // 2 if kernel_size % 2 == 1 \
        else ((kernel_size - 1) // 2, kernel_size // 2)

    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim, args=args)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim, args=args)
        self.act = None
    if args.encoder_conv_type == 'lightweight':
        self.conv = LightweightConv1dTBC(
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
            args=args, cur_attn_type='es')
    elif args.encoder_conv_type == 'dynamic':
        self.conv = DynamicConv1dTBC(
            self.conv_dim, kernel_size=kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
            args=args)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim, args=args)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim, args=args)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim, args=args)
    self.layer_norms = nn.ModuleList(
        [LayerNorm(self.embed_dim, args=args) for _ in range(2)])
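# A minimal, hedged illustration of the symmetric padding computed above for the
# encoder convolution (not part of the original model code): odd kernels use a
# single integer pad, even kernels an asymmetric (left, right) pair, so the
# convolution output length matches the input length. The helper name below is
# an assumption for illustration only.
def _symmetric_padding(kernel_size):
    if kernel_size % 2 == 1:
        return kernel_size // 2                         # e.g. k=3 -> 1
    return ((kernel_size - 1) // 2, kernel_size // 2)   # e.g. k=4 -> (1, 2)

assert _symmetric_padding(3) == 1
assert _symmetric_padding(4) == (1, 2)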
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.output_embed_dim = args.decoder_output_dim

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = Linear(input_embed_dim, embed_dim, layer_id=0, args=args,
                                 cur_linear='in', bias=False) \
        if embed_dim != input_embed_dim else None

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerCombineDecoderLayer(layer_id=i, args=args, no_encoder_attn=no_encoder_attn)
        for i in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, self.output_embed_dim,
                                  layer_id=args.decoder_layers - 1, args=args,
                                  cur_linear='out', bias=False) \
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim, args=args)
def __init__(self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.self_attn = MultiheadAttention(
        embed_dim=self.embed_dim,
        num_heads=args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.decoder_normalize_before

    # use LayerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, 'char_inputs', False)
    self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
    self.need_attn = True

    self.onnx_trace = False
def __init__(self, args):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.self_attn = MultiheadAttention(
        self.embed_dim, args.encoder_attention_heads,
        dropout=args.attention_dropout,
    )
    self.self_attn_layer_norm = LayerNorm(self.embed_dim)
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, 'activation_fn', 'relu'))
    self.activation_dropout = getattr(args, 'activation_dropout', 0)
    if self.activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        self.activation_dropout = getattr(args, 'relu_dropout', 0)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.final_layer_norm = LayerNorm(self.embed_dim)
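# Hedged sketch (not part of the original source): how the `normalize_before`
# flag stored above is conventionally applied in fairseq-style layers. LayerNorm
# is applied either before the sublayer (pre-norm) or after the residual sum
# (post-norm). `maybe_layer_norm` is an illustrative helper, not necessarily
# this class's actual API.
def maybe_layer_norm(layer_norm, x, before=False, after=False, normalize_before=True):
    assert before ^ after, 'call with exactly one of before/after'
    if after ^ normalize_before:
        return layer_norm(x)
    return x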
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim, args=args)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim, args=args)
        self.act = None
    if args.decoder_conv_type == 'lightweight':
        self.conv = LightweightConv1dTBC(
            self.conv_dim, kernel_size, padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
            args=args, cur_attn_type='ds')
    elif args.decoder_conv_type == 'dynamic':
        self.conv = DynamicConv1dTBC(
            self.conv_dim, kernel_size=kernel_size, padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
            args=args)
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim, args=args)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.decoder_normalize_before

    self.conv_layer_norm = LayerNorm(self.embed_dim, args=args)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim, args.decoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, args=args)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim, args=args)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim, args=args)

    self.final_layer_norm = LayerNorm(self.embed_dim, args=args)
    self.need_attn = True
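# Hedged sketch (illustration only, not part of the model code): the decoder
# convolution above uses padding_l = kernel_size - 1, i.e. all padding on the
# left, which makes the convolution causal: the output at time t depends only
# on positions <= t, as required for autoregressive decoding. The toy example
# below demonstrates this with a plain conv1d and a sum-over-window kernel.
import torch
import torch.nn.functional as F

k = 3
x = torch.arange(1., 6.).view(1, 1, 5)   # (batch, channels, time)
w = torch.ones(1, 1, k)                  # kernel that sums its window
y = F.conv1d(F.pad(x, (k - 1, 0)), w)    # left-only padding -> causal
# y == [[[1., 3., 6., 9., 12.]]]: each output sums the current and previous inputs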