Example no. 1
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(
                f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.decoders = repeat(
            num_blocks,
            lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = None
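
Several of the constructors in these examples build their layer stacks with a repeat(...) helper. As a minimal, self-contained sketch (the MultiSequential name and the index-detection logic are assumptions for illustration, not taken from these examples), such a helper only needs to instantiate the layer factory once per block and chain the results:

import inspect

import torch


class MultiSequential(torch.nn.Sequential):
    """Sequential container whose forward passes several tensors along."""

    def forward(self, *args):
        # each layer is assumed to return the same tuple structure it receives
        for module in self:
            args = module(*args)
        return args


def repeat(num_blocks, layer_factory):
    """Instantiate layer_factory num_blocks times and chain the layers.

    Supports both zero-argument factories (lambda: Layer(...)) and factories
    taking the layer index (lambda lnum: Layer(...)), since both styles appear
    in the examples.
    """
    takes_index = len(inspect.signature(layer_factory).parameters) > 0
    layers = [layer_factory(i) if takes_index else layer_factory()
              for i in range(num_blocks)]
    return MultiSequential(*layers)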
Example no. 2
 def __init__(self,
              odim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              self_attention_dropout_rate=0.0,
              src_attention_dropout_rate=0.0,
              input_layer="embed",
              use_output_layer=True,
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False,
              moe_att_mode='linear'):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise NotImplementedError(
             "only `embed` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     self.decoders = repeat(
         num_blocks, lambda: HANDecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate),
             dropout_rate=dropout_rate,
             moe_att_mode=moe_att_mode,
             normalize_before=normalize_before,
             concat_after=concat_after,
         ))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example no. 3
    def __init__(self,
                 n_head,
                 d_model,
                 d_head,
                 pos_ff,
                 att_type,
                 dropout,
                 dropatt,
                 pre_lnorm,
                 tgt_len=None,
                 ext_len=0,
                 mem_len=0,
                 future_len=0,
                 rel_pos=True):
        super(EncoderLayer, self).__init__()
        self.register_buffer('mems', None)
        self.n_head = n_head
        self.d_head = d_head
        self.d_model = d_model
        self.mem_len = mem_len
        self.rel_pos = rel_pos
        self.future_len = future_len
        self.tgt_len = tgt_len
        if att_type == "mta":
            self.att = MultiHeadedAttention(n_head, d_model, dropatt)
        elif att_type == "win":
            self.att = WinMultiHeadedAttention(n_head, d_model, dropatt)
        elif att_type == "smooth":
            self.att = SmoothMultiHeadedAttention(n_head, d_model, dropatt)
        elif att_type == "rel":
            self.att = RelMultiHeadedAttention(n_head, d_model, dropatt)
        else:
            raise ValueError("unknown attention type: " + att_type)

        self.layer = CashEncoderLayer(d_model,
                                      self.att,
                                      pos_ff,
                                      dropout,
                                      pre_lnorm,
                                      concat_after=False)

        self.drop = nn.Dropout(dropout)
        self.ext_len = ext_len
        self.rel_pos = rel_pos
        if rel_pos:
            self.re_pos_embed = PositionalEncoding(self.d_model, dropout)
        else:
            self.re_pos_embed = None
Example no. 4
    def __init__(
        self,
        odim,
        jdim,
        attention_dim=512,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.0,
        attention_dropout_rate=0.0,
        input_layer="embed",
        pos_enc_class=PositionalEncoding,
        blank=0,
    ):
        """Construct a Decoder object for transformer-transducer models."""
        torch.nn.Module.__init__(self)

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            raise NotImplementedError("only `embed` or torch.nn.Module is supported.")

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
            ),
        )

        self.after_norm = LayerNorm(attention_dim)

        self.lin_enc = torch.nn.Linear(attention_dim, jdim)
        self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
        self.lin_out = torch.nn.Linear(jdim, odim)

        self.attention_dim = attention_dim
        self.odim = odim

        self.blank = blank
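
The three projections above (lin_enc, lin_dec, lin_out) form the joint network of a transducer model. The forward pass is not part of the snippet, so the following is only an assumed sketch of the usual additive joint, shown here to make the shapes explicit:

import torch


def joint_sketch(lin_enc, lin_dec, lin_out, h_enc, h_dec):
    """Assumed additive transducer joint (not taken from the snippet).

    h_enc: (batch, T, 1, attention_dim) encoder states
    h_dec: (batch, 1, U, attention_dim) prediction-network states
    """
    z = torch.tanh(lin_enc(h_enc) + lin_dec(h_dec))  # broadcast over T and U
    return lin_out(z)  # (batch, T, U, odim) output logits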
Example no. 5
    def __init__(self,
                 n_head,
                 d_model,
                 d_head,
                 pos_ff,
                 dropout,
                 dropatt,
                 pre_lnorm,
                 tgt_len=None,
                 ext_len=0,
                 mem_len=0,
                 future_len=0,
                 rel_pos=True):
        super(EncoderLayer, self).__init__()
        self.register_buffer('mems', None)
        self.n_head = n_head
        self.d_head = d_head
        self.d_model = d_model
        self.mem_len = mem_len
        self.rel_pos = rel_pos
        self.future_len = future_len
        self.tgt_len = tgt_len

        self.layer = CashEncoderLayer(d_model,
                                      MultiHeadedAttention(
                                          n_head, d_model, dropatt),
                                      pos_ff,
                                      dropout,
                                      pre_lnorm,
                                      concat_after=False)

        self.drop = nn.Dropout(dropout)
        self.ext_len = ext_len
Example no. 6
 def __init__(self,
              idim,
              time_len=8,
              mem_len=0,
              ext_len=0,
              future_len=0,
              attention_type="memory",
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     self.idim = idim
     self.time_len = time_len
     self.future_len = future_len
     self.attention_dim = attention_dim
     self.attention_heads = attention_heads
     self.linear_units = linear_units
     self.dropout_rate = dropout_rate
     self.input_layer = input_layer
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     self.attention_type = attention_type
     self.positional_dropout_rate = positional_dropout_rate
     self.pos_enc_class = pos_enc_class
     self._generateInputLayer()
     if attention_type == "memory":
         self.encoders = repeat(
             num_blocks, lambda:
             EncoderLayerXL(n_head=attention_heads,
                            d_model=attention_dim,
                            d_head=attention_dim // attention_heads,
                            ext_len=ext_len,
                            mem_len=mem_len,
                            future_len=future_len,
                            dropout=dropout_rate,
                            dropatt=attention_dropout_rate,
                            pre_lnorm=normalize_before,
                            pos_ff=PositionwiseFeedForward(
                                attention_dim, linear_units, dropout_rate)))
     elif attention_type == "traditional":
         self.encoders = repeat(
             num_blocks, lambda: EncoderLayerTD(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate), dropout_rate,
                 normalize_before, concat_after))
     else:
         ValueError("only memory or traditional can be used")
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example no. 7
    def __init__(self, idim, args):
        super(Encoder, self).__init__()
        if args.transformer_input_layer == "linear":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Linear(idim, args.adim),
                torch.nn.LayerNorm(args.adim),
                torch.nn.Dropout(args.dropout_rate), torch.nn.ReLU(),
                PositionalEncoding(args.adim, args.dropout_rate))
        elif args.transformer_input_layer == "conv2d":
            self.input_layer = Conv2dSubsampling(idim, args.adim,
                                                 args.dropout_rate)
        elif args.transformer_input_layer == "embed":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Embedding(idim, args.adim),
                PositionalEncoding(args.adim, args.dropout_rate))
        else:
            raise ValueError("unknown input_layer: " +
                             args.transformer_input_layer)

        self.encoders = repeat(
            args.elayers, lambda: EncoderLayer(
                args.adim,
                MultiHeadedAttention(args.aheads, args.adim,
                                     args.transformer_attn_dropout_rate),
                PositionwiseFeedForward(args.adim, args.eunits,
                                        args.dropout_rate), args.dropout_rate))
        self.norm = LayerNorm(args.adim)
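
This variant reads its hyperparameters from an args namespace rather than from keyword arguments. A minimal usage sketch, assuming only that the namespace carries the attributes referenced in the snippet (the values and idim below are placeholders):

from argparse import Namespace

args = Namespace(
    transformer_input_layer="linear",   # or "conv2d" / "embed"
    adim=256,                           # attention dimension
    aheads=4,                           # attention heads
    eunits=2048,                        # feed-forward units
    elayers=6,                          # number of encoder blocks
    dropout_rate=0.1,
    transformer_attn_dropout_rate=0.0,
)
# encoder = Encoder(idim=83, args=args)  # assuming the class and its imports are in scope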
Example no. 8
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        conv_wshare: int = 4,
        conv_kernel_length: Sequence[int] = (11, 11, 11, 11, 11, 11),
        conv_usebias: bool = False,
    ):
        assert check_argument_types()
        if len(conv_kernel_length) != num_blocks:
            raise ValueError(
                "conv_kernel_length must have equal number of values to num_blocks: "
                f"{len(conv_kernel_length)} != {num_blocks}")
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )
        attention_dim = encoder_output_size

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                DynamicConvolution2D(
                    wshare=conv_wshare,
                    n_feat=attention_dim,
                    dropout_rate=self_attention_dropout_rate,
                    kernel_size=conv_kernel_length[lnum],
                    use_kernel_mask=True,
                    use_bias=conv_usebias,
                ),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
Example no. 9
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        assert check_argument_types()
        super().__init__(
            vocab_size=vocab_size,
            encoder_output_size=encoder_output_size,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            input_layer=input_layer,
            use_output_layer=use_output_layer,
            pos_enc_class=pos_enc_class,
            normalize_before=normalize_before,
        )

        attention_dim = encoder_output_size
        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
Example no. 10
 def __init__(self, odim, args):
     super(Decoder, self).__init__()
     self.embed = torch.nn.Sequential(
         torch.nn.Embedding(odim, args.adim),
         PositionalEncoding(args.adim, args.dropout_rate)
     )
     self.decoders = repeat(
         args.dlayers,
         lambda: DecoderLayer(
             args.adim,
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             PositionwiseFeedForward(args.adim, args.dunits, args.dropout_rate),
             args.dropout_rate
         )
     )
     self.output_norm = LayerNorm(args.adim)
     self.output_layer = torch.nn.Linear(args.adim, odim)
Example no. 11
def build_transformer_block(
    net_part: str, block_arch: Dict, pw_layer_type: str, pw_activation_type: str
) -> Callable[[], Union[EncoderLayer, TransformerDecoderLayer]]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block_arch: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch["dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (
        block_arch["pos-dropout-rate"] if "pos-dropout-rate" in block_arch else 0.0
    )
    att_dropout_rate = (
        block_arch["att-dropout-rate"] if "att-dropout-rate" in block_arch else 0.0
    )

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError("Transformer block only supports linear yet.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer
    else:
        raise ValueError("net_part must be either 'encoder' or 'decoder': " + net_part)

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
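
Note that build_transformer_block returns a zero-argument factory rather than a layer instance, which is exactly what a repeat-style stacker expects. A hedged usage sketch (the block parameters and the "relu" activation name are assumptions):

block_arch = {"d_hidden": 256, "d_ff": 2048, "heads": 4, "dropout-rate": 0.1}
build_fn = build_transformer_block("encoder", block_arch, "linear", "relu")

# Stack six identical blocks, either through a repeat() helper as in the
# examples above or with a plain PyTorch container:
# encoders = repeat(6, lambda lnum: build_fn())
# encoders = torch.nn.ModuleList(build_fn() for _ in range(6))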
Example no. 12
def build_transformer_block(
    net_part: str,
    block: Dict[str, Any],
    pw_layer_type: str,
    pw_activation_type: str,
) -> Callable[[], Union[EncoderLayer, TransformerDecoderLayer]]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block["d_hidden"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError(
            "Transformer block only supports linear pointwise layer.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer
    else:
        raise ValueError("net_part must be either 'encoder' or 'decoder': " + net_part)

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate),
        PositionwiseFeedForward(
            d_hidden,
            block["d_ff"],
            pos_dropout_rate,
            get_activation(pw_activation_type),
        ),
        dropout_rate,
    )
Example no. 13
 def __init__(self,
              idim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     if input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(idim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "conv2d":
         self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
     elif input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(idim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     else:
         raise ValueError("unknown input_layer: " + input_layer)
     self.normalize_before = normalize_before
     self.encoders = repeat(
         num_blocks, lambda: EncoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate), dropout_rate,
             normalize_before, concat_after))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example no. 14
def build_transformer_block(net_part, block_arch, pw_layer_type,
                            pw_activation_type):
    """Build function for transformer block.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        block_arch (dict): transformer block parameters
        pw_layer_type (str): positionwise layer type
        pw_activation_type (str): positionwise activation type

    Returns:
        (function): function to create transformer block

    """
    d_hidden = block_arch["d_hidden"]
    d_ff = block_arch["d_ff"]
    heads = block_arch["heads"]

    dropout_rate = block_arch[
        "dropout-rate"] if "dropout-rate" in block_arch else 0.0
    pos_dropout_rate = (block_arch["pos-dropout-rate"]
                        if "pos-dropout-rate" in block_arch else 0.0)
    att_dropout_rate = (block_arch["att-dropout-rate"]
                        if "att-dropout-rate" in block_arch else 0.0)

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_activation = get_activation(pw_activation_type)
        pw_layer_args = (d_hidden, d_ff, pos_dropout_rate, pw_activation)
    else:
        raise NotImplementedError(
            "Transformer block only supports the linear positionwise layer.")

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = DecoderLayer
    else:
        raise ValueError("net_part must be either 'encoder' or 'decoder': " + net_part)

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(heads, d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        dropout_rate,
    )
Example no. 15
    def __init__(self,
                 hparams,
                 window_sizes=[100, 50],
                 channels=[128, 64, 32],
                 dropout_rate=0.3):
        super(Wgan_GP, self).__init__()
        self.hparams = hparams
        self.window_sizes = window_sizes
        self.channels = channels
        self.convs = torch.nn.ModuleList()
        self.smooth_dense_layer = torch.nn.ModuleList()

        for k in range(len(channels)):
            self.convs_k = torch.nn.Sequential(
                Conv2Norm(in_channels=1,
                          out_channels=channels[k],
                          kernel_size=(3, 3),
                          bias=False,
                          w_init_gain='leaky_relu'),
                torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(),
                Conv2Norm(in_channels=channels[k],
                          out_channels=channels[k],
                          kernel_size=(3, 3),
                          bias=False,
                          w_init_gain='leaky_relu'),
                torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(),
                Conv2Norm(in_channels=channels[k],
                          out_channels=channels[k],
                          kernel_size=(3, 3),
                          bias=False,
                          w_init_gain='leaky_relu'),
                torch.nn.BatchNorm2d(channels[k]), torch.nn.ReLU(),
                torch.nn.Dropout(dropout_rate))
            self.dense_k = torch.nn.Linear(channels[k] * hparams.num_mels, 32)
            self.convs.append(self.convs_k)
            self.smooth_dense_layer.append(self.dense_k)

        self.multihead_attention = MultiHeadedAttention(
            hparams.aheads, 32, hparams.transformer_enc_dropout_rate)
        self.smooth_dense_layer_final = torch.nn.Linear(32, 1)
Example no. 16
 def __init__(
     self,
     odim,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     self._register_load_state_dict_pre_hook(_pre_hook)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise NotImplementedError(
             "only `embed` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      self_attention_dropout_rate),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "lightconv":
         logging.info(
             "decoder self-attention layer type = lightweight convolution")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 LightweightConvolution(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "lightconv2d":
         logging.info("decoder self-attention layer "
                      "type = lightweight convolution 2-dimentional")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 LightweightConvolution2D(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "dynamicconv":
         logging.info(
             "decoder self-attention layer type = dynamic convolution")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 DynamicConvolution(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     elif selfattention_layer_type == "dynamicconv2d":
         logging.info(
             "decoder self-attention layer type = dynamic convolution 2-dimentional"
         )
         self.decoders = repeat(
             num_blocks,
             lambda lnum: DecoderLayer(
                 attention_dim,
                 DynamicConvolution2D(
                     conv_wshare,
                     attention_dim,
                     self_attention_dropout_rate,
                     conv_kernel_length,
                     lnum,
                     use_kernel_mask=True,
                     use_bias=conv_usebias,
                 ),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         )
     self.selfattention_layer_type = selfattention_layer_type
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example no. 17
    def __init__(
        self,
        idim,
        selfattention_layer_type="selfattn",
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length=11,
        conv_usebias=False,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        padding_idx=-1,
    ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()
        self._register_load_state_dict_pre_hook(_pre_hook)

        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(idim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 4
        elif input_layer == "conv2d-scaled-pos-enc":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
            self.conv_subsampling_factor = 4
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 6
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(idim, attention_dim, dropout_rate)
            self.conv_subsampling_factor = 8
        elif input_layer == "vgg2l":
            self.embed = VGG2L(idim, attention_dim)
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(idim,
                                   attention_dim,
                                   padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size,
        )
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    MultiHeadedAttention(attention_heads, attention_dim,
                                         attention_dropout_rate),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after,
                ),
            )
        elif selfattention_layer_type == "lightconv":
            logging.info(
                "encoder self-attention layer type = lightweight convolution")
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    LightweightConvolution(
                        conv_wshare,
                        attention_dim,
                        attention_dropout_rate,
                        conv_kernel_length,
                        lnum,
                        use_bias=conv_usebias,
                    ),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after,
                ),
            )
        elif selfattention_layer_type == "lightconv2d":
            logging.info("encoder self-attention layer "
                         "type = lightweight convolution 2-dimentional")
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    LightweightConvolution2D(
                        conv_wshare,
                        attention_dim,
                        attention_dropout_rate,
                        conv_kernel_length,
                        lnum,
                        use_bias=conv_usebias,
                    ),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after,
                ),
            )
        elif selfattention_layer_type == "dynamicconv":
            logging.info(
                "encoder self-attention layer type = dynamic convolution")
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    DynamicConvolution(
                        conv_wshare,
                        attention_dim,
                        attention_dropout_rate,
                        conv_kernel_length,
                        lnum,
                        use_bias=conv_usebias,
                    ),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after,
                ),
            )
        elif selfattention_layer_type == "dynamicconv2d":
            logging.info(
                "encoder self-attention layer type = dynamic convolution 2-dimentional"
            )
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    DynamicConvolution2D(
                        conv_wshare,
                        attention_dim,
                        attention_dropout_rate,
                        conv_kernel_length,
                        lnum,
                        use_bias=conv_usebias,
                    ),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after,
                ),
            )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
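
The encoder above delegates the feed-forward choice to get_positionwise_layer, whose body is not shown. A minimal sketch of what such a helper could return, mirroring the inline if/elif chains of the later encoder examples (the three layer classes are assumed to come from the surrounding toolkit):

def get_positionwise_layer(positionwise_layer_type, attention_dim, linear_units,
                           dropout_rate, positionwise_conv_kernel_size=1):
    """Return a (layer class, constructor args) pair for the positionwise block."""
    if positionwise_layer_type == "linear":
        return PositionwiseFeedForward, (attention_dim, linear_units, dropout_rate)
    elif positionwise_layer_type == "conv1d":
        return MultiLayeredConv1d, (attention_dim, linear_units,
                                    positionwise_conv_kernel_size, dropout_rate)
    elif positionwise_layer_type == "conv1d-linear":
        return Conv1dLinear, (attention_dim, linear_units,
                              positionwise_conv_kernel_size, dropout_rate)
    raise NotImplementedError("Support only linear or conv1d.")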
Example no. 18
    def __init__(
        self,
        idim,
        pred_into_type,
        into_type_num,
        reduce_character_embedding,
        attention_dim=256,
        attention_heads=4,
        conv_wshare=4,
        conv_kernel_length=11,
        conv_usebias=False,
        linear_units=2048,
        num_blocks=3,
        dropout_rate=0.2,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_conv_kernel_size=1,
        padding_idx=-1,
        elayers=None,
        eunits=None,
    ):
        """Construct an Encoder object."""
        super(SentenceEncoder, self).__init__()

        self.conv_subsampling_factor = 1
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(idim, attention_dim),
            torch.nn.LayerNorm(attention_dim),
            torch.nn.Dropout(dropout_rate),
            torch.nn.ReLU(),
            pos_enc_class(attention_dim, positional_dropout_rate),
        )

        self.normalize_before = normalize_before

        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, linear_units, dropout_rate)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

        # For reduction
        self.reduce_character_embedding = reduce_character_embedding
        self.query = None  # For embedding reduction
        if reduce_character_embedding or pred_into_type:
            query = torch.nn.Parameter(torch.FloatTensor(attention_dim),
                                       requires_grad=True)
            self.query = torch.nn.init.uniform_(query)
            # self.d_k = math.sqrt(eunits)
            self.K = torch.nn.Linear(attention_dim, attention_dim)
            # self.V = torch.nn.Linear(eunits, eunits)
            self.score_dropout = torch.nn.Dropout(p=dropout_rate)

        # For prediction
        self.pred_prj = None
        if pred_into_type:
            self.pred_prj = torch.nn.Linear(attention_dim, into_type_num)
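
The query / K / score_dropout modules registered above suggest a dot-product attention pooling that reduces the per-character encoder outputs to a single sentence embedding. The corresponding forward pass is not part of the snippet, so the following is only an assumed sketch of how those modules could be combined:

import torch


def pool_sketch(query, K, score_dropout, hs):
    """Assumed attention pooling; hs is (batch, T, attention_dim)."""
    scores = torch.matmul(K(hs), query) / hs.size(-1) ** 0.5  # (batch, T)
    weights = score_dropout(torch.softmax(scores, dim=-1))    # attention weights
    return torch.einsum("bt,btd->bd", weights, hs)            # (batch, attention_dim)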
Example no. 19
 def __init__(
     self,
     languages,
     odim_dict,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     self_attention_dropout_rate=0.0,
     src_attention_dropout_rate=0.0,
     input_layer="embed",
     use_output_layer=True,
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
     sim_adapter=False,
     shared_adapter=False,
     use_adapters=True,
     fusion_languages=None,
 ):
     super().__init__(1, selfattention_layer_type, attention_dim,
                      attention_heads, conv_wshare, conv_kernel_length,
                      conv_usebias, linear_units, num_blocks, dropout_rate,
                      positional_dropout_rate, self_attention_dropout_rate,
                      src_attention_dropout_rate, input_layer,
                      use_output_layer, pos_enc_class, normalize_before,
                      concat_after)
     if input_layer == "embed":
         self.embed = torch.nn.ModuleDict()
         for lang in odim_dict.keys():
             self.embed[lang] = torch.nn.Sequential(
                 torch.nn.Embedding(odim_dict[lang], attention_dim),
                 pos_enc_class(attention_dim, positional_dropout_rate),
             )
     else:
         raise NotImplementedError("only support embed embedding layer")
     assert self_attention_dropout_rate == src_attention_dropout_rate
     if selfattention_layer_type == "selfattn":
         logging.info("decoder self-attention layer type = self-attention")
         self.decoders = repeat(
             num_blocks,
             lambda lnum: AdaptiveDecoderLayer(
                 languages,
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      self_attention_dropout_rate),
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      src_attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate),
                 dropout_rate,
                 normalize_before,
                 concat_after,
                 torch.nn.ModuleDict({
                     "_".join(sorted(fusion_languages)):
                     SimAdapter(attention_dim, self_attention_dropout_rate,
                                fusion_languages)
                 }) if sim_adapter else None,
                 shared_adapter,
                 use_adapters,
             ),
         )
     else:
         raise NotImplementedError(
             "Only support self-attention decoder layer")
     if use_output_layer:
         self.output_layer = torch.nn.ModuleDict()
         for lang in odim_dict.keys():
             self.output_layer[lang] = torch.nn.Linear(
                 attention_dim, odim_dict[lang])
     else:
         self.output_layer = None
Example no. 20
 def __init__(
     self,
     languages,
     idim,
     selfattention_layer_type="selfattn",
     attention_dim=256,
     attention_heads=4,
     conv_wshare=4,
     conv_kernel_length=11,
     conv_usebias=False,
     linear_units=2048,
     num_blocks=6,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     attention_dropout_rate=0.0,
     input_layer="conv2d",
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
     positionwise_layer_type="linear",
     positionwise_conv_kernel_size=1,
     padding_idx=-1,
     sim_adapter=False,
     shared_adapter=None,
     use_adapters=True,
     fusion_languages=None,
 ):
     super().__init__(idim, selfattention_layer_type, attention_dim,
                      attention_heads, conv_wshare, conv_kernel_length,
                      conv_usebias, linear_units, num_blocks, dropout_rate,
                      positional_dropout_rate, attention_dropout_rate,
                      input_layer, pos_enc_class, normalize_before,
                      concat_after, positionwise_layer_type,
                      positionwise_conv_kernel_size, padding_idx)
     positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
         positionwise_layer_type,
         attention_dim,
         linear_units,
         dropout_rate,
         positionwise_conv_kernel_size,
     )
     if selfattention_layer_type == "selfattn":
         logging.info("encoder self-attention layer type = self-attention")
         self.encoders = repeat(
             num_blocks,
             lambda lnum: AdaptiveEncoderLayer(
                 languages,
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      attention_dropout_rate),
                 positionwise_layer(*positionwise_layer_args),
                 dropout_rate,
                 normalize_before,
                 concat_after,
                 torch.nn.ModuleDict({
                     "_".join(sorted(fusion_languages)):
                     SimAdapter(attention_dim, attention_dropout_rate,
                                fusion_languages)
                 }) if sim_adapter else None,
                 shared_adapter,
                 use_adapters,
             ),
         )
     else:
         raise NotImplementedError(
             "Only support self-attention encoder layer")
Example no. 21
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: Optional[str] = "conv2d",
        pos_enc_class=StreamPositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 1,
        padding_idx: int = -1,
        block_size: int = 40,
        hop_size: int = 16,
        look_ahead: int = 16,
        init_average: bool = True,
        ctx_pos_enc: bool = True,
    ):
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size

        self.pos_enc = pos_enc_class(output_size, positional_dropout_rate)

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
            )
            self.subsample = 1
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsamplingWOPosEnc(input_size,
                                                   output_size,
                                                   dropout_rate,
                                                   kernels=[3, 3],
                                                   strides=[2, 2])
            self.subsample = 4
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsamplingWOPosEnc(input_size,
                                                   output_size,
                                                   dropout_rate,
                                                   kernels=[3, 5],
                                                   strides=[2, 3])
            self.subsample = 6
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsamplingWOPosEnc(
                input_size,
                output_size,
                dropout_rate,
                kernels=[3, 3, 3],
                strides=[2, 2, 2],
            )
            self.subsample = 8
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size,
                                   output_size,
                                   padding_idx=padding_idx), )
            self.subsample = 1
        elif input_layer is None:
            self.embed = None
            self.subsample = 1
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: ContextualBlockEncoderLayer(
                output_size,
                MultiHeadedAttention(attention_heads, output_size,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                num_blocks,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)

        # for block processing
        self.block_size = block_size
        self.hop_size = hop_size
        self.look_ahead = look_ahead
        self.init_average = init_average
        self.ctx_pos_enc = ctx_pos_enc
Example no. 22
    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: Optional[str] = "conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 1,
        padding_idx: int = -1,
    ):
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size

        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(input_size, output_size, dropout_rate)
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(input_size, output_size, dropout_rate)
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(input_size, output_size, dropout_rate)
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(output_size, positional_dropout_rate)
            )
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                output_size,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                output_size,
                MultiHeadedAttention(
                    attention_heads, output_size, attention_dropout_rate
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)
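
Example n. 22 only shows the constructor; the sketch below is a minimal instantiation. The argument values are illustrative, and the commented-out forward call is an assumption about the usual ESPnet encoder interface rather than something guaranteed by the snippet.

import torch

encoder = Encoder(
    input_size=80,          # e.g. 80-dim log-mel features
    output_size=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    input_layer="conv2d",   # Conv2dSubsampling front-end, per the branch above
)

feats = torch.randn(4, 200, 80)                  # (batch, frames, feature_dim)
lengths = torch.tensor([200, 180, 160, 150])
# Assumed interface: ESPnet-style encoders usually take padded features plus
# lengths and return encoded features; verify against the class before use.
# encoded, out_lens, _ = encoder(feats, lengths)
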
Example n. 23
 def __init__(self, idim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False,
              positionwise_layer_type="linear",
              positionwise_conv_kernel_size=1,
              padding_idx=-1):
     super(Encoder, self).__init__()
     if input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(idim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate),
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif input_layer == "custom":
         self.embed = EncoderConv2d(idim, attention_dim)
     elif input_layer == "conv2d":
         self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
     elif input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     elif input_layer is None:
         self.embed = torch.nn.Sequential(
             pos_enc_class(attention_dim, positional_dropout_rate)
         )
     else:
         raise ValueError("unknown input_layer: " + input_layer)
     self.normalize_before = normalize_before
     if positionwise_layer_type == "linear":
         positionwise_layer = PositionwiseFeedForward
         positionwise_layer_args = (attention_dim, linear_units, dropout_rate)
     elif positionwise_layer_type == "conv1d":
         positionwise_layer = MultiLayeredConv1d
         positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate)
     else:
         raise NotImplementedError("Support only linear or conv1d.")
     self.encoders = repeat(
         num_blocks,
         lambda: EncoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim, attention_dropout_rate),
             positionwise_layer(*positionwise_layer_args),
             dropout_rate,
             normalize_before,
             concat_after
         )
     )
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
    def __init__(self,
                 odim,
                 attention_dim=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 dropout_rate=0.1,
                 positional_dropout_rate=0.1,
                 self_attention_dropout_rate=0.0,
                 src_attention_dropout_rate=0.0,
                 input_layer="embed",
                 use_output_layer=True,
                 pos_enc_class=PositionalEncoding,
                 normalize_before=True,
                 concat_after=False,
                 cross_operator=None,
                 cross_shared=False,
                 cross_weight_learnable=False,
                 cross_weight=0.0):
        """Construct an Decoder object."""
        torch.nn.Module.__init__(self)
        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise NotImplementedError(
                "only `embed`, `linear`, or torch.nn.Module is supported.")
        self.normalize_before = normalize_before

        cross_self_attn = None
        cross_src_attn = None
        if cross_operator:
            if 'src_' in cross_operator:
                # cross_src_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
                cross_src_attn = True

            if 'self_' in cross_operator:
                if cross_shared and cross_src_attn is not None:
                    # cross_self_attn = cross_src_attn
                    cross_self_attn = True  # TODO: backward compatibility for shared self and source
                else:
                    # cross_self_attn = MultiHeadedAttention(attention_heads, attention_dim, self_attention_dropout_rate)
                    cross_self_attn = True
            if 'concat' in cross_operator:
                cross_operator = 'concat'
            elif 'sum' in cross_operator:
                cross_operator = 'sum'
            else:
                raise NotImplementedError

        self.decoders = repeat(
            num_blocks, lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
                cross_self_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if cross_self_attn else None,
                cross_src_attn=MultiHeadedAttention(
                    attention_heads, attention_dim, self_attention_dropout_rate
                ) if cross_src_attn else None,
                cross_operator=cross_operator,
                cross_shared=cross_shared,
                cross_weight_learnable=cross_weight_learnable,
                cross_weight=cross_weight))

        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, odim)
        else:
            self.output_layer = None
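
The cross_operator string in the decoder above does double duty: its prefix decides which cross attentions are built and its suffix picks the combination rule. The small standalone function below mirrors that branching; the example strings are assumptions consistent with the parsing logic, not values documented by the snippet.

def parse_cross_operator(cross_operator):
    """Mirror of the branching above, for illustration only."""
    use_cross_src = "src_" in cross_operator
    use_cross_self = "self_" in cross_operator
    if "concat" in cross_operator:
        combine = "concat"
    elif "sum" in cross_operator:
        combine = "sum"
    else:
        raise NotImplementedError(cross_operator)
    return use_cross_self, use_cross_src, combine

print(parse_cross_operator("self_src_concat"))  # (True, True, 'concat')
print(parse_cross_operator("src_sum"))          # (False, True, 'sum')
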
 def __init__(self):
     super().__init__()
     self.att1 = MultiHeadedAttention(2, 10, 0.0)
     self.att2 = AttAdd(10, 20, 15)
     self.desired = defaultdict(list)
Example n. 26
    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        normalize_before=True,
        concat_after=False,
        positionwise_layer_type="linear",
        positionwise_conv_kernel_size=1,
        padding_idx=-1,
    ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        if input_layer == "custom":
            self.embed = EncoderConv2d(idim, attention_dim)
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate)
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (
                attention_dim,
                linear_units,
                positionwise_conv_kernel_size,
                dropout_rate,
            )
        else:
            raise NotImplementedError("Support only linear, conv1d, or conv1d-linear.")
        self.encoders = repeat(
            num_blocks,
            lambda: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     attention_dropout_rate),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
Example n. 27
 def __init__(
     self,
     idim,
     attention_dim=256,
     attention_heads=4,
     linear_units=2048,
     num_blocks_sd=4,
     num_blocks_rec=8,
     dropout_rate=0.1,
     positional_dropout_rate=0.1,
     attention_dropout_rate=0.0,
     input_layer="conv2d",
     pos_enc_class=PositionalEncoding,
     normalize_before=True,
     concat_after=False,
     positionwise_layer_type="linear",
     positionwise_conv_kernel_size=1,
     padding_idx=-1,
     num_spkrs=2,
 ):
     """Construct an Encoder object."""
     super(EncoderMix, self).__init__(
         idim=idim,
         selfattention_layer_type="selfattn",
         attention_dim=attention_dim,
         attention_heads=attention_heads,
         linear_units=linear_units,
         num_blocks=num_blocks_rec,
         dropout_rate=dropout_rate,
         positional_dropout_rate=positional_dropout_rate,
         attention_dropout_rate=attention_dropout_rate,
         input_layer=input_layer,
         pos_enc_class=pos_enc_class,
         normalize_before=normalize_before,
         concat_after=concat_after,
         positionwise_layer_type=positionwise_layer_type,
         positionwise_conv_kernel_size=positionwise_conv_kernel_size,
         padding_idx=padding_idx,
     )
     positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
         positionwise_layer_type,
         attention_dim,
         linear_units,
         dropout_rate,
         positionwise_conv_kernel_size,
     )
     self.num_spkrs = num_spkrs
     self.encoders_sd = torch.nn.ModuleList([
         repeat(
             num_blocks_sd,
             lambda lnum: EncoderLayer(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      attention_dropout_rate),
                 positionwise_layer(*positionwise_layer_args),
                 dropout_rate,
                 normalize_before,
                 concat_after,
             ),
         ) for i in range(num_spkrs)
     ])
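
EncoderMix above builds num_spkrs speaker-differentiating (SD) stacks on top of the shared layers inherited from Encoder. The sketch below shows how the two stages are typically combined; the attribute names embed and encoders come from the parent class shown in earlier examples, while the control flow itself is an assumption, not code from the snippet.

def encode_mixture(enc_mix, xs, masks):
    # shared front-end / subsampling from the parent Encoder (assumed attribute)
    xs, masks = enc_mix.embed(xs, masks)
    outputs = []
    for encoders_sd in enc_mix.encoders_sd:          # one SD stack per speaker
        xs_sd, masks_sd = encoders_sd(xs, masks)
        # shared recognition stack inherited from Encoder (assumed attribute)
        xs_rec, masks_rec = enc_mix.encoders(xs_sd, masks_sd)
        outputs.append((xs_rec, masks_rec))
    return outputs
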
Example n. 28
    def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0):
        """Construct an E2E object for transducer model."""
        torch.nn.Module.__init__(self)

        if "transformer" in args.etype:
            if args.enc_block_arch is None:
                raise ValueError(
                    "Transformer-based blocks in transducer mode should be"
                    "defined individually in the YAML file."
                    "See egs/vivos/asr1/conf/transducer/* for more info.")

            self.subsample = get_subsample(args,
                                           mode="asr",
                                           arch="transformer")
            # 2. use transformer to joint feature maps
            # transformer without positional encoding

            self.clayers = repeat(
                2,
                lambda lnum: EncoderLayer(
                    16,
                    MultiHeadedAttention(4, 16, 0.1),
                    PositionwiseFeedForward(16, 2048, 0.1),
                    dropout_rate=0.1,
                    normalize_before=True,
                    concat_after=False,
                ),
            )

            self.conv = torch.nn.Sequential(
                torch.nn.Conv2d(1, 32, kernel_size=(3, 5), stride=(1, 2)),
                torch.nn.ReLU(),
                torch.nn.Conv2d(32, 32, kernel_size=(3, 7), stride=(2, 2)),
                torch.nn.ReLU())

            self.encoder = Encoder(
                idim,
                args.enc_block_arch,
                input_layer=args.transformer_enc_input_layer,
                repeat_block=args.enc_block_repeat,
                self_attn_type=args.transformer_enc_self_attn_type,
                positional_encoding_type=args.transformer_enc_positional_encoding_type,
                positionwise_activation_type=args.transformer_enc_pw_activation_type,
                conv_mod_activation_type=args.transformer_enc_conv_mod_activation_type,
            )
            encoder_out = self.encoder.enc_out
            args.eprojs = self.encoder.enc_out

            self.most_dom_list = args.enc_block_arch[:]
        else:
            self.subsample = get_subsample(args, mode="asr", arch="rnn-t")

            self.enc = encoder_for(args, idim, self.subsample)

            encoder_out = args.eprojs

        if "transformer" in args.dtype:
            if args.dec_block_arch is None:
                raise ValueError(
                    "Transformer-based blocks in transducer mode should be"
                    "defined individually in the YAML file."
                    "See egs/vivos/asr1/conf/transducer/* for more info.")

            self.decoder = DecoderTT(
                odim,
                encoder_out,
                args.joint_dim,
                args.dec_block_arch,
                input_layer=args.transformer_dec_input_layer,
                repeat_block=args.dec_block_repeat,
                joint_activation_type=args.joint_activation_type,
                positionwise_activation_type=args.transformer_dec_pw_activation_type,
                dropout_rate_embed=args.dropout_rate_embed_decoder,
            )

            if "transformer" in args.etype:
                self.most_dom_list += args.dec_block_arch[:]
            else:
                self.most_dom_list = args.dec_block_arch[:]
        else:
            if args.rnnt_mode == "rnnt-att":
                self.att = att_for(args)

                self.dec = DecoderRNNTAtt(
                    args.eprojs,
                    odim,
                    args.dtype,
                    args.dlayers,
                    args.dunits,
                    blank_id,
                    self.att,
                    args.dec_embed_dim,
                    args.joint_dim,
                    args.joint_activation_type,
                    args.dropout_rate_decoder,
                    args.dropout_rate_embed_decoder,
                )
            else:
                self.dec = DecoderRNNT(
                    args.eprojs,
                    odim,
                    args.dtype,
                    args.dlayers,
                    args.dunits,
                    blank_id,
                    args.dec_embed_dim,
                    args.joint_dim,
                    args.joint_activation_type,
                    args.dropout_rate_decoder,
                    args.dropout_rate_embed_decoder,
                )

        if hasattr(self, "most_dom_list"):
            self.most_dom_dim = sorted(
                Counter(d["d_hidden"] for d in self.most_dom_list
                        if "d_hidden" in d).most_common(),
                key=lambda x: x[0],
                reverse=True,
            )[0][0]

        self.etype = args.etype
        self.dtype = args.dtype
        self.rnnt_mode = args.rnnt_mode

        self.sos = odim - 1
        self.eos = odim - 1
        self.blank_id = blank_id
        self.ignore_id = ignore_id

        self.space = args.sym_space
        self.blank = args.sym_blank

        self.odim = odim

        self.reporter = Reporter()

        self.criterion = TransLoss(args.trans_type, self.blank_id)

        self.default_parameters(args)

        if args.report_cer or args.report_wer:
            from espnet.nets.e2e_asr_common import ErrorCalculatorTransducer

            if self.dtype == "transformer":
                decoder = self.decoder
            else:
                decoder = self.dec

            self.error_calculator = ErrorCalculatorTransducer(
                decoder,
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None

        self.loss = None
        self.rnnlm = None
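
The most_dom_dim expression near the end of Example n. 28 is terse; the worked example below traces it on an illustrative block-architecture list. Note that, as written, the final sort key is the dimension value itself, so the largest d_hidden wins regardless of how often it occurs.

from collections import Counter

# Illustrative entries mimicking args.enc_block_arch / args.dec_block_arch.
most_dom_list = [{"d_hidden": 256}, {"d_hidden": 256}, {"d_hidden": 512}, {"type": "conv"}]

counts = Counter(d["d_hidden"] for d in most_dom_list if "d_hidden" in d)
# counts == Counter({256: 2, 512: 1})
most_dom_dim = sorted(counts.most_common(), key=lambda x: x[0], reverse=True)[0][0]
# most_common() -> [(256, 2), (512, 1)]; sorting by x[0] (the dimension) in
# descending order puts (512, 1) first, so most_dom_dim == 512.
print(most_dom_dim)  # 512
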
Example n. 29
    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        torch.nn.Module.__init__(self)
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.decoder = Decoder(
            odim=odim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            self_attention_dropout_rate=args.transformer_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = [1]
        self.reporter = Reporter()

        # self.lsm_weight = a
        self.criterion = LabelSmoothingLoss(
            self.odim, self.ignore_id, args.lsm_weight,
            args.transformer_length_normalized_loss)
        # self.verbose = args.verbose
        self.adim = args.adim
        self.mtlalpha = args.mtlalpha
        if args.mtlalpha > 0.0:
            self.ctc = CTC(odim,
                           args.adim,
                           args.dropout_rate,
                           ctc_type=args.ctc_type,
                           reduce=True)
        else:
            self.ctc = None

        if args.report_cer or args.report_wer:
            from espnet.nets.e2e_asr_common import ErrorCalculator
            self.error_calculator = ErrorCalculator(args.char_list,
                                                    args.sym_space,
                                                    args.sym_blank,
                                                    args.report_cer,
                                                    args.report_wer)
        else:
            self.error_calculator = None
        self.rnnlm = None

        # yzl23 config
        self.remove_blank_in_ctc_mode = True
        # lid multitask related
        adim = args.adim
        self.lid_odim = 2  # cn and en
        # src attention
        self.lid_src_att = MultiHeadedAttention(
            args.aheads, args.adim, args.transformer_attn_dropout_rate)
        # self.lid_output_layer = torch.nn.Sequential(torch.nn.Linear(adim, adim),
        #                                         torch.nn.Tanh(),
        #                                         torch.nn.Linear(adim, self.lid_odim))
        self.lid_output_layer = torch.nn.Linear(adim, self.lid_odim)
        # here we hack to use lsm loss, but with lsm_weight ZERO
        self.lid_criterion = LanguageIDMultitakLoss(
            self.ignore_id,
            normalize_length=args.transformer_length_normalized_loss)
        self.lid_mtl_alpha = args.lid_mtl_alpha
        logging.warning("language id multitask training alpha %f" %
                        (self.lid_mtl_alpha))
        self.log_lid_mtl_acc = args.log_lid_mtl_acc

        # reset parameters
        self.reset_parameters(args)
    def __init__(self, idim, odim, args, ignore_id=-1):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        torch.nn.Module.__init__(self)
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.cn_encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.en_encoder = Encoder(
            idim=idim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.eunits,
            num_blocks=args.elayers,
            input_layer=args.transformer_input_layer,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            attention_dropout_rate=args.transformer_attn_dropout_rate)
        # gated add module
        self.vectorize_lambda = args.vectorize_lambda
        lambda_dim = args.adim if self.vectorize_lambda else 1
        self.aggregation_module = torch.nn.Sequential(
            torch.nn.Linear(2 * args.adim, lambda_dim), torch.nn.Sigmoid())
        self.additional_encoder_layer = EncoderLayer(
            args.adim,
            MultiHeadedAttention(args.aheads, args.adim,
                                 args.transformer_attn_dropout_rate),
            PositionwiseFeedForward(args.adim, args.eunits, args.dropout_rate),
            args.dropout_rate,
            normalize_before=True,
            concat_after=False)
        self.additional_after_norm = LayerNorm(args.adim)
        self.decoder = Decoder(
            odim=odim,
            attention_dim=args.adim,
            attention_heads=args.aheads,
            linear_units=args.dunits,
            num_blocks=args.dlayers,
            dropout_rate=args.dropout_rate,
            positional_dropout_rate=args.dropout_rate,
            self_attention_dropout_rate=args.transformer_attn_dropout_rate,
            src_attention_dropout_rate=args.transformer_attn_dropout_rate)
        self.sos = odim - 1
        self.eos = odim - 1
        self.odim = odim
        self.ignore_id = ignore_id
        self.subsample = [1]
        self.reporter = Reporter()

        # self.lsm_weight = a
        self.criterion = LabelSmoothingLoss(
            self.odim, self.ignore_id, args.lsm_weight,
            args.transformer_length_normalized_loss)
        # self.verbose = args.verbose
        self.adim = args.adim
        self.mtlalpha = args.mtlalpha
        if args.mtlalpha > 0.0:
            self.ctc = CTC(odim,
                           args.adim,
                           args.dropout_rate,
                           ctc_type=args.ctc_type,
                           reduce=True)
        else:
            self.ctc = None

        if args.report_cer or args.report_wer:
            from espnet.nets.e2e_asr_common import ErrorCalculator
            self.error_calculator = ErrorCalculator(args.char_list,
                                                    args.sym_space,
                                                    args.sym_blank,
                                                    args.report_cer,
                                                    args.report_wer)
        else:
            self.error_calculator = None
        self.rnnlm = None

        # yzl23 config
        self.remove_blank_in_ctc_mode = True
        self.reset_parameters(args)  # reset params at the last

        logging.warning(
            "Model total size: {}M, requires_grad size: {}M".format(
                self.count_parameters(),
                self.count_parameters(requires_grad=True)))
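
The aggregation_module defined above only produces a sigmoid gate from the concatenated cn/en encoder outputs; how that gate is applied is not shown. The sketch below assumes the usual gated-add formulation h = g * h_cn + (1 - g) * h_en, which is an assumption rather than something the snippet specifies.

import torch

def gated_add(model, h_cn, h_en):
    # h_cn, h_en: (batch, time, adim) outputs of cn_encoder / en_encoder
    gate = model.aggregation_module(torch.cat([h_cn, h_en], dim=-1))
    # gate is (batch, time, adim) if vectorize_lambda else (batch, time, 1)
    return gate * h_cn + (1.0 - gate) * h_en
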