Example No. 1
    def __init__(self, layer_id, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention820(self.embed_dim,
                                               args.encoder_attention_heads,
                                               layer_id=layer_id,
                                               args=args,
                                               dropout=args.attention_dropout,
                                               cur_attn_type='es')

        self.self_attn_layer_norm = LayerNorm(self.embed_dim, args=args)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim,
                          args.encoder_ffn_embed_dim,
                          layer_id=layer_id,
                          cur_linear='fc1',
                          args=args)
        self.fc2 = Linear(args.encoder_ffn_embed_dim,
                          self.embed_dim,
                          layer_id=layer_id,
                          cur_linear='fc2',
                          args=args)
        self.info_linear = None
        self.se = None
        self.input_dropout = args.input_dropout if 'input_dropout' in args else 0.0
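For reference, a minimal sketch of the hyperparameters this constructor reads from args. The values below are illustrative placeholders, not defaults taken from the source, and the class name in the commented line is hypothetical; MultiheadAttention820, Linear and LayerNorm are assumed to be the repository's own wrappers.

    from argparse import Namespace

    # Illustrative values only; real values come from the model's architecture config.
    args = Namespace(
        encoder_embed_dim=512,
        encoder_attention_heads=8,
        attention_dropout=0.1,
        dropout=0.1,
        activation_fn='relu',
        activation_dropout=0.0,
        relu_dropout=0.0,
        encoder_normalize_before=False,
        encoder_ffn_embed_dim=2048,
        input_dropout=0.0,
    )
    # layer = EncoderLayer(layer_id=0, args=args)  # hypothetical class name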
Example No. 2
    def __init__(self, args, kernel_size=0):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.conv_dim = args.encoder_conv_dim
        padding_l = kernel_size // 2 if kernel_size % 2 == 1 else (
            (kernel_size - 1) // 2, kernel_size // 2)

        if args.encoder_glu:
            self.linear1 = Linear(
                self.embed_dim,
                2 * self.conv_dim,
                args=args,
            )
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim, args=args)
            self.act = None
        if args.encoder_conv_type == 'lightweight':
            self.conv = LightweightConv1dTBC(
                self.conv_dim,
                kernel_size,
                padding_l=padding_l,
                weight_softmax=args.weight_softmax,
                num_heads=args.encoder_attention_heads,
                weight_dropout=args.weight_dropout,
                args=args,
                cur_attn_type='es')
        elif args.encoder_conv_type == 'dynamic':
            self.conv = DynamicConv1dTBC(
                self.conv_dim,
                kernel_size=kernel_size,
                padding_l=padding_l,
                weight_softmax=args.weight_softmax,
                num_heads=args.encoder_attention_heads,
                weight_dropout=args.weight_dropout,
                args=args)
        else:
            raise NotImplementedError
        self.linear2 = Linear(self.conv_dim, self.embed_dim, args=args)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim,
                          args.encoder_ffn_embed_dim,
                          args=args)
        self.fc2 = Linear(args.encoder_ffn_embed_dim,
                          self.embed_dim,
                          args=args)
        self.layer_norms = nn.ModuleList(
            [LayerNorm(self.embed_dim, args=args) for _ in range(2)])
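The padding_l expression above keeps the non-causal encoder convolution centered: odd kernels get a single symmetric padding value, even kernels a (left, right) pair. A small stand-alone check of that expression:

    for kernel_size in (3, 4, 5, 8):
        padding_l = kernel_size // 2 if kernel_size % 2 == 1 else (
            (kernel_size - 1) // 2, kernel_size // 2)
        print(kernel_size, padding_l)  # 3 -> 1, 4 -> (1, 2), 5 -> 2, 8 -> (3, 4)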
Example No. 3
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.project_in_dim = Linear(
            input_embed_dim, embed_dim,
            layer_id=0, args=args, cur_linear='in', bias=False,
        ) if embed_dim != input_embed_dim else None

        self.embed_positions = PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerCombineDecoderLayer(layer_id=i, args=args, no_encoder_attn=no_encoder_attn)
            for i in range(args.decoder_layers)
        ])

        self.adaptive_softmax = None

        self.project_out_dim = Linear(
            embed_dim, self.output_embed_dim,
            layer_id=args.decoder_layers - 1, args=args, cur_linear='out', bias=False,
        ) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None

        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim))
            nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim, args=args)
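This constructor sets up one of three output heads. A sketch of that decision, using the same names as the snippet (this is not the repository's forward code, just a summary of the branches above):

    def output_head(args, share_input_output_embed):
        # Branch order mirrors the constructor above.
        if args.adaptive_softmax_cutoff is not None:
            return 'adaptive_softmax'             # AdaptiveSoftmax over the vocabulary
        elif share_input_output_embed:
            return 'tied_input_output_embedding'  # reuse embed_tokens at the output
        else:
            return 'separate_embed_out'           # the embed_out Parameter initialized above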
Example No. 4
    def __init__(self,
                 args,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.self_attn = MultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.decoder_normalize_before

        # use LayerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
        # TODO: remove this once we update apex with the fix
        export = getattr(args, 'char_inputs', False)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim,
                                                     export=export)

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False
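normalize_before decides whether each LayerNorm created here runs before or after its sub-layer. A generic sketch of that residual pattern, assuming the usual pre-/post-norm convention (the module's actual forward() is not shown in this example):

    def residual_sublayer(x, layer_norm, sublayer, normalize_before):
        # Pre-norm: normalize the input; post-norm: normalize the residual sum.
        residual = x
        if normalize_before:
            x = layer_norm(x)
        x = sublayer(x)
        x = residual + x
        if not normalize_before:
            x = layer_norm(x)
        return x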
Example No. 5
    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.self_attn = MultiheadAttention(
            self.embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
        )
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args.dropout
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, 'activation_fn', 'relu'))
        self.activation_dropout = getattr(args, 'activation_dropout', 0)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = getattr(args, 'relu_dropout', 0)
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
Example No. 6
    def __init__(self, args, no_encoder_attn=False, kernel_size=0):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim
        if args.decoder_glu:
            self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim, args=args)
            self.act = nn.GLU()
        else:
            self.linear1 = Linear(self.embed_dim, self.conv_dim, args=args)
            self.act = None
        if args.decoder_conv_type == 'lightweight':
            self.conv = LightweightConv1dTBC(
                self.conv_dim,
                kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
                args=args,
                cur_attn_type='ds')
        elif args.decoder_conv_type == 'dynamic':
            self.conv = DynamicConv1dTBC(
                self.conv_dim,
                kernel_size=kernel_size,
                padding_l=kernel_size - 1,
                weight_softmax=args.weight_softmax,
                num_heads=args.decoder_attention_heads,
                weight_dropout=args.weight_dropout,
                args=args)
        else:
            raise NotImplementedError
        self.linear2 = Linear(self.conv_dim, self.embed_dim, args=args)

        self.dropout = args.dropout
        self.relu_dropout = args.relu_dropout
        self.input_dropout = args.input_dropout
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim, args=args)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, args=args)

        self.fc1 = Linear(
            self.embed_dim,
            args.decoder_ffn_embed_dim,
            args=args,
        )
        self.fc2 = Linear(
            args.decoder_ffn_embed_dim,
            self.embed_dim,
            args=args,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, args=args)
        self.need_attn = True
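When decoder_glu is set, linear1 doubles the channel dimension so that nn.GLU can halve it back to conv_dim. A quick shape check with plain nn.Linear standing in for the repository's Linear wrapper (sizes are illustrative):

    import torch
    import torch.nn as nn

    embed_dim, conv_dim = 512, 512
    x = torch.randn(10, 2, embed_dim)             # (time, batch, channels)
    linear1 = nn.Linear(embed_dim, 2 * conv_dim)
    act = nn.GLU(dim=-1)                          # splits the last dimension in half
    y = act(linear1(x))
    print(y.shape)                                # torch.Size([10, 2, 512])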