def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Build the input front-end, the decoder layer stack and the output head.

    Args:
        vocab_size: Input size of the embedding/linear front-end and output
            size of the final projection.
        encoder_output_size: Used as the model (attention) dimension.
        attention_heads: Forwarded to both ``MultiHeadedAttention`` modules.
        linear_units: Forwarded to ``PositionwiseFeedForward``.
        num_blocks: First argument of ``repeat`` (presumably the number of
            decoder layers; ``repeat`` is defined elsewhere).
        dropout_rate: Used by the "linear" front-end, the feed-forward module
            and ``DecoderLayer``.
        positional_dropout_rate: Forwarded to ``pos_enc_class``.
        self_attention_dropout_rate: Forwarded to the first attention module.
        src_attention_dropout_rate: Forwarded to the second attention module.
        input_layer: Either "embed" or "linear".
        use_output_layer: When false, ``self.output_layer`` is ``None``.
        pos_enc_class: Callable invoked as
            ``pos_enc_class(dim, positional_dropout_rate)``.
        normalize_before: Forwarded to ``DecoderLayer``; when true an extra
            ``after_norm`` LayerNorm is created.
        concat_after: Forwarded to ``DecoderLayer``.

    Raises:
        ValueError: If ``input_layer`` is neither "embed" nor "linear".
    """
    assert check_argument_types()
    super().__init__()
    model_dim = encoder_output_size

    # Collect the front-end modules first; the positional encoding is always
    # appended last, whichever input layer was selected.
    if input_layer == "embed":
        front_end = [torch.nn.Embedding(vocab_size, model_dim)]
    elif input_layer == "linear":
        front_end = [
            torch.nn.Linear(vocab_size, model_dim),
            torch.nn.LayerNorm(model_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    else:
        raise ValueError(
            f"only 'embed' or 'linear' is supported: {input_layer}")
    self.embed = torch.nn.Sequential(
        *front_end,
        pos_enc_class(model_dim, positional_dropout_rate),
    )

    self.normalize_before = normalize_before

    def _make_layer():
        # Sub-modules are created in the same order as the original call
        # arguments so parameter initialisation order is unchanged.
        self_attn = MultiHeadedAttention(
            attention_heads, model_dim, self_attention_dropout_rate
        )
        src_attn = MultiHeadedAttention(
            attention_heads, model_dim, src_attention_dropout_rate
        )
        feed_forward = PositionwiseFeedForward(
            model_dim, linear_units, dropout_rate
        )
        return DecoderLayer(
            model_dim,
            self_attn,
            src_attn,
            feed_forward,
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _make_layer)

    if normalize_before:
        self.after_norm = LayerNorm(model_dim)
    self.output_layer = (
        torch.nn.Linear(model_dim, vocab_size) if use_output_layer else None
    )
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
    conv_wshare: int = 4,
    conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
    conv_usebias: int = False,
):
    """Set up a decoder whose self-attention is ``DynamicConvolution2D``.

    The front-end/output arguments are delegated to the parent constructor;
    only the decoder layer stack is built here.

    Args:
        vocab_size: Forwarded to the parent constructor.
        encoder_output_size: Forwarded to the parent constructor and used as
            the model dimension of every layer.
        attention_heads: Forwarded to the source ``MultiHeadedAttention``.
        linear_units: Forwarded to ``PositionwiseFeedForward``.
        num_blocks: First argument of ``repeat``; must equal
            ``len(conv_kernel_length)``.
        dropout_rate: Forwarded to the parent constructor, the feed-forward
            module and ``DecoderLayer``.
        positional_dropout_rate: Forwarded to the parent constructor.
        self_attention_dropout_rate: ``dropout_rate`` of the convolution.
        src_attention_dropout_rate: Forwarded to the source attention.
        input_layer: Forwarded to the parent constructor.
        use_output_layer: Forwarded to the parent constructor.
        pos_enc_class: Forwarded to the parent constructor.
        normalize_before: Forwarded to the parent constructor and to
            ``DecoderLayer``.
        concat_after: Forwarded to ``DecoderLayer``.
        conv_wshare: ``wshare`` of the convolution.
        conv_kernel_length: Per-layer ``kernel_size``; indexed by the layer
            number handed to the factory.
        conv_usebias: ``use_bias`` of the convolution (annotated ``int`` but
            defaulting to ``False``).

    Raises:
        ValueError: If ``conv_kernel_length`` and ``num_blocks`` disagree in
            length.
    """
    assert check_argument_types()

    kernel_count = len(conv_kernel_length)
    if kernel_count != num_blocks:
        raise ValueError(
            "conv_kernel_length must have equal number of values to num_blocks: "
            f"{len(conv_kernel_length)} != {num_blocks}")

    super().__init__(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )

    model_dim = encoder_output_size

    def _make_layer(lnum):
        # ``lnum`` is supplied by ``repeat`` and selects this layer's kernel.
        conv_self_attn = DynamicConvolution2D(
            wshare=conv_wshare,
            n_feat=model_dim,
            dropout_rate=self_attention_dropout_rate,
            kernel_size=conv_kernel_length[lnum],
            use_kernel_mask=True,
            use_bias=conv_usebias,
        )
        src_attn = MultiHeadedAttention(
            attention_heads, model_dim, src_attention_dropout_rate
        )
        feed_forward = PositionwiseFeedForward(
            model_dim, linear_units, dropout_rate
        )
        return DecoderLayer(
            model_dim,
            conv_self_attn,
            src_attn,
            feed_forward,
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _make_layer)
def __init__(self, odim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0, input_layer="embed",
             use_output_layer=True, pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False):
    """Construct a Decoder object.

    :param odim: input size of the embedding/linear front-end and output
        size of the final projection
    :param attention_dim: model dimension shared by all sub-modules
    :param attention_heads: forwarded to both MultiHeadedAttention modules
    :param linear_units: forwarded to PositionwiseFeedForward
    :param num_blocks: first argument of ``repeat``
    :param dropout_rate: used by the "linear" front-end, the feed-forward
        module and DecoderLayer
    :param positional_dropout_rate: forwarded to ``pos_enc_class``
    :param self_attention_dropout_rate: forwarded to the first attention
    :param src_attention_dropout_rate: forwarded to the second attention
    :param input_layer: "embed", "linear", or a ``torch.nn.Module`` that is
        used directly as the front-end
    :param use_output_layer: when false, ``self.output_layer`` is ``None``
    :param pos_enc_class: called as ``pos_enc_class(dim, dropout)``
    :param normalize_before: forwarded to DecoderLayer; when true an extra
        ``after_norm`` LayerNorm is created
    :param concat_after: forwarded to DecoderLayer
    :raises NotImplementedError: for any other ``input_layer`` value
    """
    torch.nn.Module.__init__(self)

    # Gather the front-end modules; the positional encoding always goes last.
    if input_layer == "embed":
        embed_modules = [torch.nn.Embedding(odim, attention_dim)]
    elif input_layer == "linear":
        embed_modules = [
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    elif isinstance(input_layer, torch.nn.Module):
        embed_modules = [input_layer]
    else:
        raise NotImplementedError("only `embed` or torch.nn.Module is supported.")
    embed_modules.append(pos_enc_class(attention_dim, positional_dropout_rate))
    self.embed = torch.nn.Sequential(*embed_modules)

    self.normalize_before = normalize_before

    def _make_layer():
        # Same construction order as the original positional arguments.
        self_attn = MultiHeadedAttention(
            attention_heads, attention_dim, self_attention_dropout_rate)
        src_attn = MultiHeadedAttention(
            attention_heads, attention_dim, src_attention_dropout_rate)
        feed_forward = PositionwiseFeedForward(
            attention_dim, linear_units, dropout_rate)
        return DecoderLayer(
            attention_dim, self_attn, src_attn, feed_forward,
            dropout_rate, normalize_before, concat_after)

    self.decoders = repeat(num_blocks, _make_layer)

    if normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    self.output_layer = (
        torch.nn.Linear(attention_dim, odim) if use_output_layer else None
    )
def __init__(
    self,
    vocab_size: int,
    encoder_output_size: int,
    attention_heads: int = 4,
    linear_units: int = 2048,
    num_blocks: int = 6,
    dropout_rate: float = 0.1,
    positional_dropout_rate: float = 0.1,
    self_attention_dropout_rate: float = 0.0,
    src_attention_dropout_rate: float = 0.0,
    input_layer: str = "embed",
    use_output_layer: bool = True,
    pos_enc_class=PositionalEncoding,
    normalize_before: bool = True,
    concat_after: bool = False,
):
    """Set up a decoder with multi-head self- and source-attention layers.

    The front-end/output arguments are delegated to the parent constructor;
    only the decoder layer stack is built here.

    Args:
        vocab_size: Forwarded to the parent constructor.
        encoder_output_size: Forwarded to the parent constructor and used as
            the model dimension of every layer.
        attention_heads: Forwarded to both ``MultiHeadedAttention`` modules.
        linear_units: Forwarded to ``PositionwiseFeedForward``.
        num_blocks: First argument of ``repeat``.
        dropout_rate: Forwarded to the parent constructor, the feed-forward
            module and ``DecoderLayer``.
        positional_dropout_rate: Forwarded to the parent constructor.
        self_attention_dropout_rate: Forwarded to the first attention module.
        src_attention_dropout_rate: Forwarded to the second attention module.
        input_layer: Forwarded to the parent constructor.
        use_output_layer: Forwarded to the parent constructor.
        pos_enc_class: Forwarded to the parent constructor.
        normalize_before: Forwarded to the parent constructor and to
            ``DecoderLayer``.
        concat_after: Forwarded to ``DecoderLayer``.
    """
    assert check_argument_types()

    parent_kwargs = dict(
        vocab_size=vocab_size,
        encoder_output_size=encoder_output_size,
        dropout_rate=dropout_rate,
        positional_dropout_rate=positional_dropout_rate,
        input_layer=input_layer,
        use_output_layer=use_output_layer,
        pos_enc_class=pos_enc_class,
        normalize_before=normalize_before,
    )
    super().__init__(**parent_kwargs)

    model_dim = encoder_output_size

    def _make_layer(_layer_index):
        # The index handed over by ``repeat`` is not needed: all layers share
        # the same configuration.
        self_attn = MultiHeadedAttention(
            attention_heads, model_dim, self_attention_dropout_rate
        )
        src_attn = MultiHeadedAttention(
            attention_heads, model_dim, src_attention_dropout_rate
        )
        feed_forward = PositionwiseFeedForward(
            model_dim, linear_units, dropout_rate
        )
        return DecoderLayer(
            model_dim,
            self_attn,
            src_attn,
            feed_forward,
            dropout_rate,
            normalize_before,
            concat_after,
        )

    self.decoders = repeat(num_blocks, _make_layer)
def __init__(self, odim, args):
    """Build embedding, decoder layers, output norm and output projection.

    :param odim: size of the embedding table and of the output projection
    :param args: object providing ``adim``, ``aheads``, ``dlayers``,
        ``dunits``, ``dropout_rate`` and ``transformer_attn_dropout_rate``
    """
    super(Decoder, self).__init__()
    model_dim = args.adim
    dropout = args.dropout_rate

    token_embedding = torch.nn.Embedding(odim, model_dim)
    position_encoding = PositionalEncoding(model_dim, dropout)
    self.embed = torch.nn.Sequential(token_embedding, position_encoding)

    def _make_layer():
        # Both attention modules share the same head count and dropout rate.
        attn_dropout = args.transformer_attn_dropout_rate
        self_attn = MultiHeadedAttention(args.aheads, model_dim, attn_dropout)
        src_attn = MultiHeadedAttention(args.aheads, model_dim, attn_dropout)
        feed_forward = PositionwiseFeedForward(model_dim, args.dunits, dropout)
        return DecoderLayer(model_dim, self_attn, src_attn, feed_forward, dropout)

    self.decoders = repeat(args.dlayers, _make_layer)
    self.output_norm = LayerNorm(model_dim)
    self.output_layer = torch.nn.Linear(model_dim, odim)
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
):
    """Construct a Decoder object.

    :param odim: input size of the embedding/linear front-end and output
        size of the final projection
    :param selfattention_layer_type: one of "selfattn", "lightconv",
        "lightconv2d", "dynamicconv", "dynamicconv2d"
    :param attention_dim: model dimension shared by all sub-modules
    :param attention_heads: forwarded to the MultiHeadedAttention modules
    :param conv_wshare: first argument of the convolution modules
    :param conv_kernel_length: per-layer kernel sizes as a "_"-separated
        string (e.g. ``"11_11_11"``) with at least ``num_blocks`` entries,
        or a single int applied to every layer; only read for the
        convolution layer types
    :param conv_usebias: last argument of the convolution modules
    :param linear_units: forwarded to PositionwiseFeedForward
    :param num_blocks: first argument of ``repeat``
    :param dropout_rate: used by the "linear" front-end, the feed-forward
        module and DecoderLayer
    :param positional_dropout_rate: forwarded to ``pos_enc_class``
    :param self_attention_dropout_rate: forwarded to the self-attention or
        convolution module
    :param src_attention_dropout_rate: forwarded to the source attention
    :param input_layer: "embed", "linear", or a ``torch.nn.Module`` that is
        used directly as the front-end
    :param use_output_layer: when false, ``self.output_layer`` is ``None``
    :param pos_enc_class: called as ``pos_enc_class(dim, dropout)``
    :param normalize_before: forwarded to DecoderLayer; when true an extra
        ``after_norm`` LayerNorm is created
    :param concat_after: forwarded to DecoderLayer
    :raises NotImplementedError: for an unsupported ``input_layer`` or
        ``selfattention_layer_type``
    :raises ValueError: if a string ``conv_kernel_length`` has fewer than
        ``num_blocks`` entries
    """
    torch.nn.Module.__init__(self)
    self._register_load_state_dict_pre_hook(_pre_hook)

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")

    self.normalize_before = normalize_before

    # self-attention module definition
    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")
        decoder_selfattn_layer = MultiHeadedAttention
        decoder_selfattn_layer_args = [
            (attention_heads, attention_dim, self_attention_dropout_rate)
        ] * num_blocks
    else:
        # All convolution variants take the same argument tuple, so only the
        # class (and the log line) is selected here.
        if selfattention_layer_type == "lightconv":
            logging.info(
                "decoder self-attention layer type = lightweight convolution")
            decoder_selfattn_layer = LightweightConvolution
        elif selfattention_layer_type == "lightconv2d":
            logging.info("decoder self-attention layer "
                         "type = lightweight convolution 2-dimentional")
            decoder_selfattn_layer = LightweightConvolution2D
        elif selfattention_layer_type == "dynamicconv":
            logging.info(
                "decoder self-attention layer type = dynamic convolution")
            decoder_selfattn_layer = DynamicConvolution
        elif selfattention_layer_type == "dynamicconv2d":
            logging.info(
                "decoder self-attention layer type = dynamic convolution 2-dimentional"
            )
            decoder_selfattn_layer = DynamicConvolution2D
        else:
            # Previously an unknown type only surfaced as a NameError from
            # inside the layer factory; fail with an explicit message instead.
            raise NotImplementedError(
                "unsupported selfattention_layer_type: {}".format(
                    selfattention_layer_type))

        if isinstance(conv_kernel_length, int):
            # The default (11) is an int: use the same kernel for every layer
            # instead of failing on ``int.split``.
            kernel_sizes = [conv_kernel_length] * num_blocks
        else:
            kernel_tokens = conv_kernel_length.split("_")
            if len(kernel_tokens) < num_blocks:
                raise ValueError(
                    "conv_kernel_length must provide at least num_blocks "
                    "values: {} < {}".format(len(kernel_tokens), num_blocks))
            # Only the first ``num_blocks`` entries are parsed, as before.
            kernel_sizes = [int(tok) for tok in kernel_tokens[:num_blocks]]

        decoder_selfattn_layer_args = [
            (
                conv_wshare,
                attention_dim,
                self_attention_dropout_rate,
                kernel_size,
                True,
                conv_usebias,
            )
            for kernel_size in kernel_sizes
        ]

    self.decoders = repeat(
        num_blocks,
        lambda lnum: DecoderLayer(
            attention_dim,
            decoder_selfattn_layer(*decoder_selfattn_layer_args[lnum]),
            MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate
            ),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    self.selfattention_layer_type = selfattention_layer_type

    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(
    self,
    odim,
    selfattention_layer_type="selfattn",
    attention_dim=256,
    attention_heads=4,
    conv_wshare=4,
    conv_kernel_length=11,
    conv_usebias=False,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    positional_dropout_rate=0.1,
    self_attention_dropout_rate=0.0,
    src_attention_dropout_rate=0.0,
    input_layer="embed",
    use_output_layer=True,
    pos_enc_class=PositionalEncoding,
    normalize_before=True,
    concat_after=False,
    attention_type="self_attn",
    max_attn_span=(None,),
    span_init=0,
    span_ratio=0.5,
    ratio_adaptive=False,
):
    """Construct a Decoder object.

    :param odim: input size of the embedding/linear front-end and output
        size of the final projection
    :param selfattention_layer_type: one of "selfattn", "lightconv",
        "lightconv2d", "dynamicconv", "dynamicconv2d"
    :param attention_dim: model dimension shared by all sub-modules
    :param attention_heads: forwarded to the attention modules
    :param conv_wshare: first argument of the convolution modules
    :param conv_kernel_length: forwarded unchanged, together with the layer
        index, to the convolution modules
    :param conv_usebias: ``use_bias`` of the convolution modules
    :param linear_units: forwarded to PositionwiseFeedForward
    :param num_blocks: first argument of ``repeat``
    :param dropout_rate: used by the "linear" front-end, the feed-forward
        module and DecoderLayer
    :param positional_dropout_rate: forwarded to ``pos_enc_class``
    :param self_attention_dropout_rate: forwarded to the self-attention or
        convolution module
    :param src_attention_dropout_rate: forwarded to the source attention
    :param input_layer: "embed", "linear", or a ``torch.nn.Module`` that is
        used directly as the front-end
    :param use_output_layer: when false, ``self.output_layer`` is ``None``
    :param pos_enc_class: called as ``pos_enc_class(dim, dropout)``
    :param normalize_before: forwarded to DecoderLayer; when true an extra
        ``after_norm`` LayerNorm is created
    :param concat_after: forwarded to DecoderLayer
    :param attention_type: forwarded to ``multi_headed_attention``
    :param max_attn_span: sequence of per-layer ``max_span`` values; layers
        beyond its length reuse the last entry
    :param span_init: forwarded to ``multi_headed_attention``
    :param span_ratio: forwarded to ``multi_headed_attention``
    :param ratio_adaptive: forwarded to ``multi_headed_attention``
    :raises NotImplementedError: for an unsupported ``input_layer`` or
        ``selfattention_layer_type``
    """
    torch.nn.Module.__init__(self)
    self._register_load_state_dict_pre_hook(_pre_hook)

    if input_layer == "embed":
        self.embed = torch.nn.Sequential(
            torch.nn.Embedding(odim, attention_dim),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif input_layer == "linear":
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )
    elif isinstance(input_layer, torch.nn.Module):
        self.embed = torch.nn.Sequential(
            input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
        )
    else:
        raise NotImplementedError("only `embed` or torch.nn.Module is supported.")

    self.normalize_before = normalize_before

    if selfattention_layer_type == "selfattn":
        logging.info("decoder self-attention layer type = self-attention")

        def build_self_attention(lnum):
            # Clamp the index so layers past the end of ``max_attn_span``
            # reuse its last entry.
            span_index = min(len(max_attn_span) - 1, lnum)
            return multi_headed_attention(
                attention_heads,
                attention_dim,
                self_attention_dropout_rate,
                attention_type,
                max_span=max_attn_span[span_index],
                span_init=span_init,
                span_ratio=span_ratio,
                ratio_adaptive=ratio_adaptive,
                causal_flag=True,
            )

    else:
        # The convolution variants differ only in the class being built.
        if selfattention_layer_type == "lightconv":
            logging.info(
                "decoder self-attention layer type = lightweight convolution")
            conv_layer_class = LightweightConvolution
        elif selfattention_layer_type == "lightconv2d":
            logging.info(
                "decoder self-attention layer "
                "type = lightweight convolution 2-dimentional"
            )
            conv_layer_class = LightweightConvolution2D
        elif selfattention_layer_type == "dynamicconv":
            logging.info(
                "decoder self-attention layer type = dynamic convolution")
            conv_layer_class = DynamicConvolution
        elif selfattention_layer_type == "dynamicconv2d":
            logging.info(
                "decoder self-attention layer type = dynamic convolution 2-dimentional"
            )
            conv_layer_class = DynamicConvolution2D
        else:
            # Previously an unknown type silently left ``self.decoders``
            # undefined; reject it up front instead.
            raise NotImplementedError(
                "unsupported selfattention_layer_type: {}".format(
                    selfattention_layer_type))

        def build_self_attention(lnum):
            return conv_layer_class(
                conv_wshare,
                attention_dim,
                self_attention_dropout_rate,
                conv_kernel_length,
                lnum,
                use_kernel_mask=True,
                use_bias=conv_usebias,
            )

    self.decoders = repeat(
        num_blocks,
        lambda lnum: DecoderLayer(
            attention_dim,
            build_self_attention(lnum),
            MultiHeadedAttention(
                attention_heads, attention_dim, src_attention_dropout_rate
            ),
            PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
            dropout_rate,
            normalize_before,
            concat_after,
        ),
    )
    self.selfattention_layer_type = selfattention_layer_type

    if self.normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    if use_output_layer:
        self.output_layer = torch.nn.Linear(attention_dim, odim)
    else:
        self.output_layer = None
def __init__(self, odim, attention_dim=256, attention_heads=4,
             linear_units=2048, num_blocks=6, dropout_rate=0.1,
             positional_dropout_rate=0.1, self_attention_dropout_rate=0.0,
             src_attention_dropout_rate=0.0, input_layer="embed",
             use_output_layer=True, pos_enc_class=PositionalEncoding,
             normalize_before=True, concat_after=False, cross_operator=None,
             cross_shared=False, cross_weight_learnable=False,
             cross_weight=0.0):
    """Construct a Decoder object with optional cross-attention modules.

    :param odim: input size of the embedding/linear front-end and output
        size of the final projection
    :param attention_dim: model dimension shared by all sub-modules
    :param attention_heads: forwarded to every MultiHeadedAttention
    :param linear_units: forwarded to PositionwiseFeedForward
    :param num_blocks: first argument of ``repeat``
    :param dropout_rate: used by the "linear" front-end, the feed-forward
        module and DecoderLayer
    :param positional_dropout_rate: forwarded to ``pos_enc_class``
    :param self_attention_dropout_rate: forwarded to the self-attention and
        to both cross-attention modules
    :param src_attention_dropout_rate: forwarded to the source attention
    :param input_layer: "embed", "linear", or a ``torch.nn.Module`` that is
        used directly as the front-end
    :param use_output_layer: when false, ``self.output_layer`` is ``None``
    :param pos_enc_class: called as ``pos_enc_class(dim, dropout)``
    :param normalize_before: forwarded to DecoderLayer; when true an extra
        ``after_norm`` LayerNorm is created
    :param concat_after: forwarded to DecoderLayer
    :param cross_operator: falsy to disable cross attention; otherwise a
        value containing "concat" or "sum", with "self_" and/or "src_"
        selecting which cross-attention modules are created
    :param cross_shared: forwarded to DecoderLayer
    :param cross_weight_learnable: forwarded to DecoderLayer
    :param cross_weight: forwarded to DecoderLayer
    :raises NotImplementedError: for an unsupported ``input_layer``, or a
        truthy ``cross_operator`` containing neither "concat" nor "sum"
    """
    torch.nn.Module.__init__(self)

    # Gather the front-end modules; the positional encoding always goes last.
    if input_layer == "embed":
        embed_modules = [torch.nn.Embedding(odim, attention_dim)]
    elif input_layer == "linear":
        embed_modules = [
            torch.nn.Linear(odim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
        ]
    elif isinstance(input_layer, torch.nn.Module):
        embed_modules = [input_layer]
    else:
        raise NotImplementedError(
            "only `embed` or torch.nn.Module is supported.")
    embed_modules.append(pos_enc_class(attention_dim, positional_dropout_rate))
    self.embed = torch.nn.Sequential(*embed_modules)

    self.normalize_before = normalize_before

    # Decide which cross-attention modules each layer gets and normalise the
    # operator name to 'concat' or 'sum'.
    with_cross_self_attn = False
    with_cross_src_attn = False
    if cross_operator:
        with_cross_src_attn = 'src_' in cross_operator
        # TODO: backward compatibility for shared self and source
        # (``cross_shared`` does not currently change which modules are built).
        with_cross_self_attn = 'self_' in cross_operator
        if 'concat' in cross_operator:
            cross_operator = 'concat'
        elif 'sum' in cross_operator:
            cross_operator = 'sum'
        else:
            raise NotImplementedError

    def _make_layer():
        # Modules are instantiated in the same order as the original call
        # arguments so parameter initialisation order is unchanged.
        self_attn = MultiHeadedAttention(
            attention_heads, attention_dim, self_attention_dropout_rate)
        src_attn = MultiHeadedAttention(
            attention_heads, attention_dim, src_attention_dropout_rate)
        feed_forward = PositionwiseFeedForward(
            attention_dim, linear_units, dropout_rate)

        layer_cross_self_attn = None
        if with_cross_self_attn:
            layer_cross_self_attn = MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate)

        layer_cross_src_attn = None
        if with_cross_src_attn:
            # NOTE(review): this uses self_attention_dropout_rate rather than
            # src_attention_dropout_rate - confirm that is intended.
            layer_cross_src_attn = MultiHeadedAttention(
                attention_heads, attention_dim, self_attention_dropout_rate)

        return DecoderLayer(
            attention_dim,
            self_attn,
            src_attn,
            feed_forward,
            dropout_rate,
            normalize_before,
            concat_after,
            cross_self_attn=layer_cross_self_attn,
            cross_src_attn=layer_cross_src_attn,
            cross_operator=cross_operator,
            cross_shared=cross_shared,
            cross_weight_learnable=cross_weight_learnable,
            cross_weight=cross_weight)

    self.decoders = repeat(num_blocks, _make_layer)

    if normalize_before:
        self.after_norm = LayerNorm(attention_dim)
    self.output_layer = (
        torch.nn.Linear(attention_dim, odim) if use_output_layer else None
    )