Example No. 1
    def __init__(
        self,
        size,
        self_attn,
        mixed_attn,
        feed_forward_enc,
        feed_forward_dec,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an DecoderLayer object."""
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.mixed_attn = mixed_attn
        # there should be two feed-forward modules: one for enc and one for dec
        self.feed_forward_enc = feed_forward_enc
        self.feed_forward_dec = feed_forward_dec

        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)
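Example No. 1 only shows the constructor, so for orientation here is a minimal, hypothetical sketch of how `normalize_before` and `concat_after` are commonly consumed in a residual sub-block; the class's real forward() is not shown above and may differ.

import torch


def residual_subblock(x, module, norm, dropout,
                      normalize_before=True, concat_linear=None):
    """Illustrative residual wrapper (not this class's actual forward()).

    Pre-norm applies LayerNorm to the input, post-norm to the output.
    When `concat_linear` is given (the `concat_after` path), the module
    input and output are concatenated and projected back to `size`.
    """
    residual = x
    if normalize_before:
        x = norm(x)
    y = module(x)
    if concat_linear is not None:
        x = residual + concat_linear(torch.cat((x, y), dim=-1))
    else:
        x = residual + dropout(y)
    if not normalize_before:
        x = norm(x)
    return x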
Example No. 2
    def __init__(self,
                 size,
                 self_attn,
                 cn_src_attn,
                 en_src_attn,
                 feed_forward,
                 dropout_rate,
                 moe_att_mode='linear',
                 normalize_before=True,
                 concat_after=False):
        """Construct an DecoderLayer object."""
        super(HANDecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)

        # Hierarchical attention
        self.cn_src_attn = cn_src_attn  # declare attn here for initialization
        self.en_src_attn = en_src_attn
        self.src_attn = MoEAttn(size, cn_src_attn, en_src_attn, moe_att_mode)
Example No. 3
    def __init__(
        self,
        size,
        self_attn,
        feed_forward,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
    ):
        """Construct an DecoderLayer object."""
        super(DecoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward

        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)

        self.dropout = nn.Dropout(dropout_rate)

        self.size = size

        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat = nn.Linear((size + size), size)
Example No. 4
 def __init__(
     self,
     size,
     self_attn,
     src_attn,
     feed_forward_1,  #fix
     feed_forward_2,  #fix
     dropout_rate,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct an DecoderLayer object."""
     super(DecoderLayer, self).__init__()
     self.size = size
     self.self_attn = self_attn
     self.src_attn = src_attn
     self.feed_forward_1 = feed_forward_1  #fix
     self.feed_forward_2 = feed_forward_2  #fix
     self.norm1 = LayerNorm(size)
     self.norm2 = LayerNorm(size)
     self.norm3 = LayerNorm(size)
     self.dropout = nn.Dropout(dropout_rate)
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     if self.concat_after:
         self.concat_linear1 = nn.Linear(size + size, size)
         self.concat_linear2 = nn.Linear(size + size, size)
Example No. 5
    def __init__(self,
                 size,
                 self_attn,
                 feed_forward,
                 dropout_rate,
                 normalize_before=True,
                 concat_after=False,
                 time_window=15):
        super(EncoderLayerTimeRestricted, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)

        self.window_size = time_window
        pad_front = int(self.window_size / 2)
        pad_end = self.window_size - pad_front - 1
        self.pad = (
            0, 0, pad_front, pad_end
        )  # pad the second to last dimension by (pad_front, pad_end)
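The `self.pad` tuple follows torch.nn.functional.pad's convention of listing (left, right) pairs starting from the last dimension, so `(0, 0, pad_front, pad_end)` pads only the time axis of a (batch, time, feature) tensor. A quick illustration with the default time_window of 15:

import torch
import torch.nn.functional as F

x = torch.randn(2, 10, 256)      # (batch, time, feature)
pad = (0, 0, 7, 7)               # time_window=15 -> pad_front=7, pad_end=7
y = F.pad(x, pad)                # pads only the second-to-last (time) dim
print(y.shape)                   # torch.Size([2, 24, 256])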
Example No. 6
 def __init__(
     self,
     size,
     self_attn,
     feed_forward,
     feed_forward_macaron,
     conv_module,
     dropout_rate,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct an EncoderLayer object."""
     super(EncoderLayer, self).__init__()
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     self.feed_forward_macaron = feed_forward_macaron
     self.conv_module = conv_module
     self.norm_ff = LayerNorm(size)  # for the FFN module
     self.norm_mha = LayerNorm(size)  # for the MHA module
     if feed_forward_macaron is not None:
         self.norm_ff_macaron = LayerNorm(size)
         self.ff_scale = 0.5
     else:
         self.ff_scale = 1.0
     if self.conv_module is not None:
         self.norm_conv = LayerNorm(size)  # for the CNN module
         self.norm_final = LayerNorm(size)  # for the final output of the block
     self.dropout = nn.Dropout(dropout_rate)
     self.size = size
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     if self.concat_after:
         self.concat_linear = nn.Linear(size + size, size)
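These modules follow the Conformer block layout: a half-weighted macaron feed-forward, self-attention, convolution, and a second half-weighted feed-forward. A rough pre-norm sketch of the ordering the constructor implies, ignoring the masks and caching that the real forward() handles:

def conformer_block_sketch(self, x):
    # Hypothetical pre-norm ordering; the actual forward() also takes a mask
    # and supports post-norm and concat_after.
    if self.feed_forward_macaron is not None:
        x = x + self.ff_scale * self.dropout(
            self.feed_forward_macaron(self.norm_ff_macaron(x)))
    x_norm = self.norm_mha(x)
    x = x + self.dropout(self.self_attn(x_norm, x_norm, x_norm, None))
    if self.conv_module is not None:
        x = x + self.dropout(self.conv_module(self.norm_conv(x)))
    x = x + self.ff_scale * self.dropout(self.feed_forward(self.norm_ff(x)))
    if self.conv_module is not None:
        x = self.norm_final(x)
    return x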
Example No. 7
 def __init__(self, size, self_attn, feed_forward, dropout_rate):
     super(EncoderLayer, self).__init__()
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     self.norm1 = LayerNorm(size)
     self.norm2 = LayerNorm(size)
     self.dropout = nn.Dropout(dropout_rate)
     self.size = size
Example No. 8
    def __init__(self,
                 size,
                 self_attn,
                 src_attn,
                 feed_forward,
                 dropout_rate,
                 normalize_before=True,
                 concat_after=False,
                 cross_self_attn=None,
                 cross_src_attn=None,
                 cross_operator=None,
                 cross_shared=False,
                 cross_weight_learnable=False,
                 cross_weight=0.0):
        """Construct an DecoderLayer object."""
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        if not cross_shared and cross_self_attn is not None and cross_src_attn is not None:
            self.cross_self_attn = cross_self_attn
            self.cross_src_attn = cross_src_attn
            self.cross_shared = False
        else:
            self.cross_self_attn = None
            self.cross_src_attn = None
            if cross_self_attn is not None:
                self.cross_attn = cross_self_attn
            if cross_src_attn is not None:
                self.cross_attn = cross_src_attn
            if cross_self_attn is None and cross_src_attn is None:
                self.cross_attn = None
            self.cross_shared = True

        self.cross_operator = cross_operator
        if cross_self_attn is not None or cross_src_attn is not None:
            if cross_operator == "concat":
                self.cross_concat_linear1 = nn.Linear(size + size, size)
                self.cross_concat_linear2 = nn.Linear(size + size, size)
            elif cross_operator == "sum":
                if cross_weight_learnable:
                    assert float(cross_weight) > 0
                    self.cross_weight = torch.nn.Parameter(
                        torch.tensor(cross_weight))
                else:
                    self.cross_weight = cross_weight

        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)
        self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)
Example No. 9
    def __init__(
        self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0, hparams=None
    ):
        """Initilize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.
            offset (float, optional): Offset value to avoid nan in log domain.

        """
        super(DurationPredictor, self).__init__()
        self.hparams = hparams
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        self.norm = torch.nn.ModuleList() if hparams.is_spk_layer_norm else None
        self.dropout = torch.nn.ModuleList() if hparams.is_spk_layer_norm else None
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            if hparams.is_spk_layer_norm:
                self.conv += [
                    torch.nn.Sequential(
                        torch.nn.Conv1d(
                            in_chans,
                            n_chans,
                            kernel_size,
                            stride=1,
                            padding=(kernel_size - 1) // 2,
                        ),
                        torch.nn.ReLU(),
                    )
                ]
            else:
                self.conv += [
                    torch.nn.Sequential(
                        torch.nn.Conv1d(
                            in_chans,
                            n_chans,
                            kernel_size,
                            stride=1,
                            padding=(kernel_size - 1) // 2,
                        ),
                        torch.nn.ReLU(),
                        LayerNorm(n_chans, hparams=hparams, dim=1),
                        torch.nn.Dropout(dropout_rate),
                    )
                ]
            if hparams.is_spk_layer_norm:
                self.norm.append(LayerNorm(n_chans, hparams=hparams, dim=1))    
                self.dropout.append(torch.nn.Dropout(dropout_rate))
        self.linear = torch.nn.Linear(n_chans, 1)
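A hypothetical instantiation of this predictor. The `hparams` object below is a stand-in namespace carrying only the flag read here (`is_spk_layer_norm`); the project's real hparams object should be passed instead, especially since the custom LayerNorm also receives it.

from types import SimpleNamespace

hparams = SimpleNamespace(is_spk_layer_norm=False)  # placeholder, not the real hparams
predictor = DurationPredictor(
    idim=384,
    n_layers=2,
    n_chans=384,
    kernel_size=3,
    dropout_rate=0.1,
    offset=1.0,
    hparams=hparams,
)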
Example No. 10
    def __init__(self, size, self_attn, feed_forward, dropout_rate):
        """Construct an DecoderLayer object."""
        super(DecoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward

        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)

        self.dropout = nn.Dropout(dropout_rate)

        self.size = size
Example No. 11
 def __init__(self, size, lstm, src_attn, feed_forward, dropout_rate,
              normalize_before=True, concat_after=False):
     super(DecoderLayer, self).__init__()
     self.size = size
     self.lstm = lstm
     self.src_attn = src_attn
     self.feed_forward = feed_forward
     self.norm1 = LayerNorm(size)
     self.norm2 = LayerNorm(size)
     self.norm3 = LayerNorm(size)
     self.dropout = nn.Dropout(dropout_rate)
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     if self.concat_after:
         self.concat_linear1 = nn.Linear(size + size, size)
         self.concat_linear2 = nn.Linear(size + size, size)
Example No. 12
    def __init__(self,
                 n_head,
                 d_model,
                 d_head,
                 dropout,
                 dropatt=0,
                 tgt_len=None,
                 ext_len=None,
                 mem_len=None,
                 pre_lnorm=False):
        super(RelMultiHeadAttn, self).__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.d_head = d_head
        self.dropout = dropout

        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)

        self.drop = nn.Dropout(dropout)
        self.dropatt = nn.Dropout(dropatt)
        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)

        self.layer_norm = LayerNorm(d_model)

        self.scale = 1 / (d_head**0.5)

        self.pre_lnorm = pre_lnorm
Example No. 13
    def __init__(self, idim, args):
        super(Encoder, self).__init__()
        if args.transformer_input_layer == "linear":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Linear(idim, args.adim),
                torch.nn.LayerNorm(args.adim),
                torch.nn.Dropout(args.dropout_rate), torch.nn.ReLU(),
                PositionalEncoding(args.adim, args.dropout_rate))
        elif args.transformer_input_layer == "conv2d":
            self.input_layer = Conv2dSubsampling(idim, args.adim,
                                                 args.dropout_rate)
        elif args.transformer_input_layer == "embed":
            self.input_layer = torch.nn.Sequential(
                torch.nn.Embedding(idim, args.adim),
                PositionalEncoding(args.adim, args.dropout_rate))
        else:
            raise ValueError("unknown input_layer: " +
                             args.transformer_input_layer)

        self.encoders = repeat(
            args.elayers, lambda: EncoderLayer(
                args.adim,
                MultiHeadedAttention(args.aheads, args.adim,
                                     args.transformer_attn_dropout_rate),
                PositionwiseFeedForward(args.adim, args.eunits,
                                        args.dropout_rate),
                args.dropout_rate))
        self.norm = LayerNorm(args.adim)
Example No. 14
    def __init__(
        self,
        odim,
        jdim,
        attention_dim=512,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.0,
        attention_dropout_rate=0.0,
        input_layer="embed",
        pos_enc_class=PositionalEncoding,
        blank=0,
    ):
        """Construct a Decoder object for transformer-transducer models."""
        torch.nn.Module.__init__(self)

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(odim, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(odim, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer, pos_enc_class(attention_dim, positional_dropout_rate)
            )
        else:
            raise NotImplementedError("only `embed` or torch.nn.Module is supported.")

        self.decoders = repeat(
            num_blocks,
            lambda lnum: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, attention_dropout_rate
                ),
                PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
                dropout_rate,
            ),
        )

        self.after_norm = LayerNorm(attention_dim)

        self.lin_enc = torch.nn.Linear(attention_dim, jdim)
        self.lin_dec = torch.nn.Linear(attention_dim, jdim, bias=False)
        self.lin_out = torch.nn.Linear(jdim, odim)

        self.attention_dim = attention_dim
        self.odim = odim

        self.blank = blank
Example No. 15
    def __init__(
        self,
        idim,
        enc_arch,
        input_layer="linear",
        repeat_block=0,
        self_attn_type="selfattn",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        conv_mod_activation_type="relu",
        normalize_before=True,
        padding_idx=-1,
    ):
        """Construct an Transformer encoder object."""
        super(Encoder, self).__init__()

        self.embed, self.encoders, self.enc_out = build_blocks(
            "encoder",
            idim,
            input_layer,
            enc_arch,
            repeat_block=repeat_block,
            self_attn_type=self_attn_type,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            conv_mod_activation_type=conv_mod_activation_type,
            padding_idx=padding_idx,
        )

        self.normalize_before = normalize_before

        if self.normalize_before:
            self.after_norm = LayerNorm(self.enc_out)
Example No. 16
    def __init__(
        self,
        size: int,
        kernel_size: int,
        dropout_rate: float,
        use_linear_after_conv: bool,
        gate_activation: str,
    ):
        super().__init__()

        n_channels = size // 2  # split input channels
        self.norm = LayerNorm(n_channels)
        self.conv = torch.nn.Conv1d(
            n_channels,
            n_channels,
            kernel_size,
            1,
            (kernel_size - 1) // 2,
            groups=n_channels,
        )
        if use_linear_after_conv:
            self.linear = torch.nn.Linear(n_channels, n_channels)
        else:
            self.linear = None

        if gate_activation == "identity":
            self.act = torch.nn.Identity()
        else:
            self.act = get_activation(gate_activation)

        self.dropout = torch.nn.Dropout(dropout_rate)
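The constructor splits the input channels in half (`n_channels = size // 2`), which implies a convolutional spatial gating unit: one half is normalised, convolved depthwise over time, optionally projected and activated, and then used to gate the other half. A sketch of that forward pass, which may differ in detail from the project's actual implementation:

def csgu_forward_sketch(self, x):
    """x: (batch, time, size) -> (batch, time, size // 2)."""
    x_r, x_g = x.chunk(2, dim=-1)                          # split input channels
    x_g = self.norm(x_g)
    x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2)   # depthwise conv over time
    if self.linear is not None:
        x_g = self.linear(x_g)
    return self.dropout(x_r * self.act(x_g))               # gate the other half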
Example No. 17
 def __init__(self,
              idim,
              time_len=8,
              mem_len=0,
              ext_len=0,
              future_len=0,
              attention_type="memory",
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     self.idim = idim
     self.time_len = time_len
     self.future_len = future_len
     self.attention_dim = attention_dim
     self.attention_heads = attention_heads
     self.linear_units = linear_units
     self.dropout_rate = dropout_rate
     self.input_layer = input_layer
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     self.attention_type = attention_type
     self.positional_dropout_rate = positional_dropout_rate
     self.pos_enc_class = pos_enc_class
     self._generateInputLayer()
     if attention_type == "memory":
         self.encoders = repeat(
             num_blocks, lambda:
             EncoderLayerXL(n_head=attention_heads,
                            d_model=attention_dim,
                            d_head=attention_dim // attention_heads,
                            ext_len=ext_len,
                            mem_len=mem_len,
                            future_len=future_len,
                            dropout=dropout_rate,
                            dropatt=attention_dropout_rate,
                            pre_lnorm=normalize_before,
                            pos_ff=PositionwiseFeedForward(
                                attention_dim, linear_units, dropout_rate)))
     elif attention_type == "traditional":
         self.encoders = repeat(
             num_blocks, lambda: EncoderLayerTD(
                 attention_dim,
                 MultiHeadedAttention(attention_heads, attention_dim,
                                      attention_dropout_rate),
                 PositionwiseFeedForward(attention_dim, linear_units,
                                         dropout_rate), dropout_rate,
                 normalize_before, concat_after))
     else:
         ValueError("only memory or traditional can be used")
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example No. 18
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        self_attention_dropout_rate: float = 0.0,
        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,
    ):
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(
                f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.decoders = repeat(
            num_blocks,
            lambda: DecoderLayer(
                attention_dim,
                MultiHeadedAttention(attention_heads, attention_dim,
                                     self_attention_dropout_rate),
                MultiHeadedAttention(attention_heads, attention_dim,
                                     src_attention_dropout_rate),
                PositionwiseFeedForward(attention_dim, linear_units,
                                        dropout_rate),
                dropout_rate,
                normalize_before,
                concat_after,
            ),
        )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = None
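For reference, a hypothetical instantiation of this decoder attached to a 256-dimensional encoder and a 5000-token vocabulary; the enclosing class name is not shown above, so `TransformerDecoder` is only assumed here.

decoder = TransformerDecoder(      # class name assumed; use the actual one
    vocab_size=5000,
    encoder_output_size=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    dropout_rate=0.1,
    input_layer="embed",
)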
Example No. 19
    def __init__(
        self,
        input_size: int,
        w2v_url: str,
        w2v_dir_path: str = "./",
        output_size: int = 256,
        normalize_before: bool = False,
        freeze_finetune_updates: int = 0,
    ):
        assert check_argument_types()
        super().__init__()

        if w2v_url != "":
            try:
                import fairseq
                from fairseq.models.wav2vec.wav2vec2 import Wav2Vec2Model
            except Exception as e:
                print("Error: FairSeq is not properly installed.")
                print(
                    "Please install FairSeq: cd ${MAIN_ROOT}/tools && make fairseq.done"
                )
                raise e

        self.w2v_model_path = download_w2v(w2v_url, w2v_dir_path)

        self._output_size = output_size

        models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [self.w2v_model_path],
            arg_overrides={"data": w2v_dir_path},
        )
        model = models[0]

        if not isinstance(model, Wav2Vec2Model):
            try:
                model = model.w2v_encoder.w2v_model
            except Exception as e:
                print("Error: pretrained models should be within: "
                      "'Wav2Vec2Model, Wav2VecCTC' classes, etc.")
                raise e

        self.encoders = model

        self.pretrained_params = copy.deepcopy(model.state_dict())

        self.normalize_before = normalize_before
        if self.normalize_before:
            self.after_norm = LayerNorm(output_size)

        if model.cfg.encoder_embed_dim != output_size:
            # TODO(xkc09): try LSTM
            self.output_layer = torch.nn.Sequential(
                torch.nn.Linear(model.cfg.encoder_embed_dim, output_size), )
        else:
            self.output_layer = None

        self.freeze_finetune_updates = freeze_finetune_updates
        self.register_buffer("num_updates", torch.LongTensor([0]))
Example No. 20
 def __init__(self,
              odim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              self_attention_dropout_rate=0.0,
              src_attention_dropout_rate=0.0,
              input_layer="embed",
              use_output_layer=True,
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False,
              moe_att_mode='linear'):
     """Construct an Decoder object."""
     torch.nn.Module.__init__(self)
     if input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(odim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(odim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate))
     else:
         raise NotImplementedError(
             "only `embed` or torch.nn.Module is supported.")
     self.normalize_before = normalize_before
     self.decoders = repeat(
         num_blocks, lambda: HANDecoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  self_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             MultiHeadedAttention(attention_heads, attention_dim,
                                  src_attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate),
             dropout_rate=dropout_rate,
             moe_att_mode=moe_att_mode,
             normalize_before=normalize_before,
             concat_after=concat_after,
         ))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
     if use_output_layer:
         self.output_layer = torch.nn.Linear(attention_dim, odim)
     else:
         self.output_layer = None
Example No. 21
 def __init__(self,
              idim,
              center_len=8,
              left_len=0,
              hop_len=0,
              right_len=0,
              abs_pos=1,
              rel_pos=0,
              use_mem=1,
              att_type="mta",
              subpos=None,
              subtype="normal",
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     if subpos is None:
         subpos = [0, 0]
     self.idim = idim
     self.center_len = center_len
     self.use_mem = use_mem != 0
     self.left_len = left_len
     if self.use_mem:
         self.mem_len = left_len
     else:
         self.mem_len = 0
     self.hop_len = hop_len
     self.right_len = right_len
     self.abs_pos = abs_pos != 0
     self.rel_pos = rel_pos != 0
     self.attention_dim = attention_dim
     self.attention_heads = attention_heads
     self.linear_units = linear_units
     self.dropout_rate = dropout_rate
     self.input_layer = input_layer
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     self.positional_dropout_rate = positional_dropout_rate
     self.pos_enc_class = pos_enc_class
     self.subpos = subpos
     self.subtype = subtype
     self.num_blocks = num_blocks
     self.attention_dropout_rate = attention_dropout_rate
     self.att_type = att_type
     self.encoders = torch.nn.ModuleList()
     self._generateInputLayer()
     self._generateEncoderLayer()
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example No. 22
    def __init__(
        self,
        hdim: int,
        self_attention: MultiHeadedAttention,
        feed_forward: PositionwiseFeedForward,
        dropout_rate: float,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()

        self.self_attention = self_attention
        self.feed_forward = feed_forward

        self.norm1 = LayerNorm(hdim)
        self.norm2 = LayerNorm(hdim)

        self.dropout = torch.nn.Dropout(dropout_rate)

        self.hdim = hdim
Example No. 23
 def __init__(
     self,
     size,
     self_attn,
     feed_forward,
     dropout_rate,
     total_layer_num,
     normalize_before=True,
     concat_after=False,
 ):
     """Construct an EncoderLayer object."""
     super(ContextualBlockEncoderLayer, self).__init__()
     self.self_attn = self_attn
     self.feed_forward = feed_forward
     self.norm1 = LayerNorm(size)
     self.norm2 = LayerNorm(size)
     self.dropout = nn.Dropout(dropout_rate)
     self.size = size
     self.normalize_before = normalize_before
     self.concat_after = concat_after
     self.total_layer_num = total_layer_num
     if self.concat_after:
         self.concat_linear = nn.Linear(size + size, size)
Example No. 24
    def __init__(
        self,
        idim: int,
        enc_arch: List,
        input_layer: str = "linear",
        repeat_block: int = 1,
        self_attn_type: str = "selfattn",
        positional_encoding_type: str = "abs_pos",
        positionwise_layer_type: str = "linear",
        positionwise_activation_type: str = "relu",
        conv_mod_activation_type: str = "relu",
        aux_enc_output_layers: List = [],
        input_layer_dropout_rate: float = 0.0,
        input_layer_pos_enc_dropout_rate: float = 0.0,
        padding_idx: int = -1,
    ):
        """Construct an CustomEncoder object."""
        super().__init__()

        (
            self.embed,
            self.encoders,
            self.enc_out,
            self.conv_subsampling_factor,
        ) = build_blocks(
            "encoder",
            idim,
            input_layer,
            enc_arch,
            repeat_block=repeat_block,
            self_attn_type=self_attn_type,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            conv_mod_activation_type=conv_mod_activation_type,
            input_layer_dropout_rate=input_layer_dropout_rate,
            input_layer_pos_enc_dropout_rate=input_layer_pos_enc_dropout_rate,
            padding_idx=padding_idx,
        )

        self.after_norm = LayerNorm(self.enc_out)

        self.n_blocks = len(enc_arch) * repeat_block

        self.aux_enc_output_layers = aux_enc_output_layers
Example No. 25
    def __init__(
        self,
        idim,
        enc_arch,
        input_layer="linear",
        repeat_block=0,
        self_attn_type="selfattn",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        conv_mod_activation_type="relu",
        normalize_before=True,
        aux_task_layer_list=[],
        padding_idx=-1,
    ):
        """Construct an CustomEncoder object."""
        super().__init__()

        (
            self.embed,
            self.encoders,
            self.enc_out,
            self.conv_subsampling_factor,
        ) = build_blocks(
            "encoder",
            idim,
            input_layer,
            enc_arch,
            repeat_block=repeat_block,
            self_attn_type=self_attn_type,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            conv_mod_activation_type=conv_mod_activation_type,
            padding_idx=padding_idx,
        )

        self.normalize_before = normalize_before

        if self.normalize_before:
            self.after_norm = LayerNorm(self.enc_out)

        self.n_blocks = len(enc_arch) * repeat_block

        self.aux_task_layer_list = aux_task_layer_list
Example No. 26
 def __init__(self, odim, args):
     super(Decoder, self).__init__()
     self.embed = torch.nn.Sequential(
         torch.nn.Embedding(odim, args.adim),
         PositionalEncoding(args.adim, args.dropout_rate)
     )
     self.decoders = repeat(
         args.dlayers,
         lambda: DecoderLayer(
             args.adim,
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             MultiHeadedAttention(args.aheads, args.adim, args.transformer_attn_dropout_rate),
             PositionwiseFeedForward(args.adim, args.dunits, args.dropout_rate),
             args.dropout_rate
         )
     )
     self.output_norm = LayerNorm(args.adim)
     self.output_layer = torch.nn.Linear(args.adim, odim)
Example No. 27
 def __init__(self,
              idim,
              attention_dim=256,
              attention_heads=4,
              linear_units=2048,
              num_blocks=6,
              dropout_rate=0.1,
              positional_dropout_rate=0.1,
              attention_dropout_rate=0.0,
              input_layer="conv2d",
              pos_enc_class=PositionalEncoding,
              normalize_before=True,
              concat_after=False):
     super(Encoder, self).__init__()
     if input_layer == "linear":
         self.embed = torch.nn.Sequential(
             torch.nn.Linear(idim, attention_dim),
             torch.nn.LayerNorm(attention_dim),
             torch.nn.Dropout(dropout_rate), torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif input_layer == "conv2d":
         self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
     elif input_layer == "embed":
         self.embed = torch.nn.Sequential(
             torch.nn.Embedding(idim, attention_dim),
             pos_enc_class(attention_dim, positional_dropout_rate))
     elif isinstance(input_layer, torch.nn.Module):
         self.embed = torch.nn.Sequential(
             input_layer,
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
     else:
         raise ValueError("unknown input_layer: " + input_layer)
     self.normalize_before = normalize_before
     self.encoders = repeat(
         num_blocks, lambda: EncoderLayer(
             attention_dim,
             MultiHeadedAttention(attention_heads, attention_dim,
                                  attention_dropout_rate),
             PositionwiseFeedForward(attention_dim, linear_units,
                                     dropout_rate), dropout_rate,
             normalize_before, concat_after))
     if self.normalize_before:
         self.after_norm = LayerNorm(attention_dim)
Example No. 28
    def __init__(
        self,
        odim,
        edim,
        jdim,
        dec_arch,
        input_layer="embed",
        repeat_block=0,
        joint_activation_type="tanh",
        positional_encoding_type="abs_pos",
        positionwise_layer_type="linear",
        positionwise_activation_type="relu",
        dropout_rate_embed=0.0,
        blank=0,
    ):
        """Construct a Decoder object for transformer-transducer models."""
        torch.nn.Module.__init__(self)

        self.embed, self.decoders, ddim = build_blocks(
            "decoder",
            odim,
            input_layer,
            dec_arch,
            repeat_block=repeat_block,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            dropout_rate_embed=dropout_rate_embed,
            padding_idx=blank,
        )

        self.after_norm = LayerNorm(ddim)

        self.lin_enc = torch.nn.Linear(edim, jdim)
        self.lin_dec = torch.nn.Linear(ddim, jdim, bias=False)
        self.lin_out = torch.nn.Linear(jdim, odim)

        self.joint_activation = get_activation(joint_activation_type)

        self.dunits = ddim
        self.odim = odim

        self.blank = blank
Example No. 29
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
    ):
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(
                f"only 'embed' or 'linear' is supported: {input_layer}")

        self.normalize_before = normalize_before
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
        if use_output_layer:
            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
        else:
            self.output_layer = None

        # Must set by the inheritance
        self.decoders = None
Example No. 30
    def __init__(
        self,
        idim: int,
        n_layers: int = 2,
        n_chans: int = 384,
        kernel_size: int = 3,
        bias: bool = True,
        dropout_rate: float = 0.5,
        output_dim: int = 1,
    ):
        """Initilize duration predictor module.

        Args:
            idim (int): Input dimension.
            n_layers (int, optional): Number of convolutional layers.
            n_chans (int, optional): Number of channels of convolutional layers.
            kernel_size (int, optional): Kernel size of convolutional layers.
            dropout_rate (float, optional): Dropout rate.

        """
        # print('n_layers:', n_layers)
        # assert check_argument_types()
        super().__init__()
        self.conv = torch.nn.ModuleList()
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [
                torch.nn.Sequential(
                    torch.nn.Conv1d(
                        in_chans,
                        n_chans,
                        kernel_size,
                        stride=1,
                        padding=(kernel_size - 1) // 2,
                        bias=bias,
                    ),
                    torch.nn.ReLU(),
                    LayerNorm(n_chans, dim=1),
                    torch.nn.Dropout(dropout_rate),
                )
            ]
        self.linear = torch.nn.Linear(n_chans, output_dim)
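The conv stack runs over channel-first tensors, so a forward pass typically transposes to (batch, channels, time), applies each block, then transposes back before the final linear projection. A sketch under that assumption, not necessarily the project's exact forward():

def duration_forward_sketch(self, xs):
    """xs: (batch, time, idim) -> (batch, time, output_dim)."""
    xs = xs.transpose(1, -1)            # (batch, idim, time) for Conv1d
    for block in self.conv:
        xs = block(xs)                  # Conv1d -> ReLU -> LayerNorm -> Dropout
    return self.linear(xs.transpose(1, -1))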