Example #1
    def __init__(self,
                 attention_rnn_dim,
                 embedding_dim,
                 attention_dim,
                 attention_location_n_filters,
                 attention_location_kernel_size,
                 initscheme="xavier_uniform"):
        super(Attention, self).__init__()
        self.query_layer = LinearNorm(in_dim=attention_rnn_dim,
                                      out_dim=attention_dim,
                                      bias=False,
                                      initscheme=initscheme,
                                      nonlinearity="tanh")
        self.memory_layer = LinearNorm(in_dim=embedding_dim,
                                       out_dim=attention_dim,
                                       bias=False,
                                       initscheme=initscheme,
                                       nonlinearity="tanh")
        self.v = LinearNorm(in_dim=attention_dim,
                            out_dim=1,
                            bias=False,
                            initscheme=initscheme)
        self.location_layer = LocationLayer(
            attention_n_filters=attention_location_n_filters,
            attention_kernel_size=attention_location_kernel_size,
            attention_dim=attention_dim,
            initscheme=initscheme)
        self.score_mask_value = -float("inf")
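Every snippet on this page builds on a LinearNorm helper whose definition is not shown. As a rough sketch only (an assumption based on common Tacotron 2-style code, not the exact implementation behind these examples), it can be modelled as a thin wrapper around torch.nn.Linear that applies Xavier initialization scaled by the gain of the declared nonlinearity:

from torch import nn


class LinearNorm(nn.Module):
    """Sketch only: nn.Linear plus gain-aware Xavier initialization."""

    def __init__(self, in_dim, out_dim, bias=True,
                 initscheme="xavier_uniform", nonlinearity="linear"):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        # Scale the init by the gain recommended for the following nonlinearity.
        gain = nn.init.calculate_gain(nonlinearity)
        if initscheme == "xavier_uniform":
            nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)
        elif initscheme == "xavier_normal":
            nn.init.xavier_normal_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        return self.linear_layer(x)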
Example #2
    def __init__(self, query_dim, key_dim, num_units, num_heads):
        super().__init__()
        self.num_units = num_units
        self.num_heads = num_heads
        self.key_dim = key_dim

        self.W_query = LinearNorm(query_dim, num_units, bias=False)
        self.W_key = LinearNorm(key_dim, num_units, bias=False)
        self.W_value = LinearNorm(key_dim, num_units, bias=False)
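The constructor only creates the query/key/value projections, so a forward pass is still needed. The method below is a sketch in the style of common GST multi-head attention implementations (the split/stack layout and the key_dim scaling are assumptions, not taken from the original class); it assumes torch is imported at module level:

    def forward(self, query, key):
        # query: (N, T_q, query_dim), key: (N, T_k, key_dim)
        querys = self.W_query(query)                                   # (N, T_q, num_units)
        keys = self.W_key(key)                                         # (N, T_k, num_units)
        values = self.W_value(key)                                     # (N, T_k, num_units)

        # Split the unit dimension into num_heads chunks and stack them.
        split_size = self.num_units // self.num_heads
        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)   # (h, N, T_q, u/h)
        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)       # (h, N, T_k, u/h)
        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)   # (h, N, T_k, u/h)

        # Scaled dot-product attention per head.
        scores = torch.matmul(querys, keys.transpose(2, 3)) / (self.key_dim ** 0.5)
        scores = torch.softmax(scores, dim=3)                          # (h, N, T_q, T_k)

        # Weighted sum of values, then merge the heads back together.
        out = torch.matmul(scores, values)                             # (h, N, T_q, u/h)
        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # (N, T_q, num_units)
        return out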
Example #3
    def __init__(self, in_dim, sizes, initscheme='xavier_uniform', activation="relu"):
        super(Prenet, self).__init__()

        in_sizes = [in_dim] + sizes[:-1]

        layers = []
        for in_size, out_size in zip(in_sizes, sizes):
            layers.extend([
                LinearNorm(in_size, out_size, bias=False, initscheme=initscheme, nonlinearity="linear"),
                activation_func(activation)
            ])

        self.layers = nn.ModuleList(layers)
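Since self.layers already alternates LinearNorm and activation modules, a forward pass only has to chain them. The always-on dropout after each activation is an assumption carried over from Tacotron 2-style prenets, where dropout stays active at inference time; F refers to torch.nn.functional:

    def forward(self, x):
        # self.layers holds [LinearNorm, activation, LinearNorm, activation, ...]
        for layer in self.layers:
            x = layer(x)
            # Assumed Tacotron 2 convention: dropout after every activation,
            # kept active even in eval mode (training=True).
            if not isinstance(layer, LinearNorm):
                x = F.dropout(x, p=0.5, training=True)
        return x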
Example #4
    def __init__(self, hparams, dropout=0.5):
        super().__init__()

        self.device = torch.device(
            "cpu" if not torch.cuda.is_available() else hparams.device)

        vocab_size = get_ctc_symbols_length(hparams.charset)
        decoder_dim = hparams.decoder_rnn_dim

        self.use_gaf = hparams.use_gaf

        self.proj = torch.nn.Sequential(
            LinearNorm(decoder_dim,
                       decoder_dim,
                       bias=True,
                       initscheme="xavier_uniform",
                       nonlinearity="relu"), torch.nn.ReLU(),
            torch.nn.Dropout(p=dropout))
        self.ctc_proj = LinearNorm(decoder_dim, vocab_size, bias=True)
        self.ctc = torch.nn.CTCLoss(blank=vocab_size - 1,
                                    reduction="none",
                                    zero_infinity=True)

        self.to(self.device)
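Because CTCLoss is built with blank=vocab_size - 1, the blank symbol is assumed to sit at the end of the alphabet, and targets must never contain that index. A standalone illustration of the input layout torch.nn.CTCLoss expects (all shapes and values are made up for the example):

import torch

vocab_size = 40                                              # hypothetical alphabet size, blank last
log_probs = torch.randn(50, 2, vocab_size).log_softmax(2)    # (T, N, C) log-probabilities
targets = torch.randint(0, vocab_size - 1, (2, 20))          # labels never use the blank index
input_lengths = torch.full((2,), 50, dtype=torch.long)
target_lengths = torch.full((2,), 20, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=vocab_size - 1, reduction="none", zero_infinity=True)
loss = ctc(log_probs, targets, input_lengths, target_lengths)  # one loss per utterance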
Example #5
    def __init__(self, attention_n_filters, attention_kernel_size, attention_dim, initscheme="xavier_uniform"):
        super(LocationLayer, self).__init__()

        self.location_conv = ConvNorm(
            dimensions=1,
            in_channels=2,
            out_channels=attention_n_filters,
            kernel_size=attention_kernel_size,
            padding=int((attention_kernel_size - 1) / 2),
            bias=False,
            stride=1,
            dilation=1,
            initscheme=initscheme
        )
        self.location_dense = LinearNorm(
            in_dim=attention_n_filters,
            out_dim=attention_dim,
            bias=False,
            initscheme=initscheme,
            nonlinearity='tanh'
        )
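These two submodules are typically chained so that the stacked previous and cumulative attention weights (hence in_channels=2) are convolved over time and then projected to attention_dim. The forward pass below follows the standard Tacotron 2 location layer and is a sketch, not necessarily the original method:

    def forward(self, attention_weights_cat):
        # attention_weights_cat: (B, 2, T) -- previous and cumulative attention weights
        processed = self.location_conv(attention_weights_cat)   # (B, attention_n_filters, T)
        processed = processed.transpose(1, 2)                   # (B, T, attention_n_filters)
        return self.location_dense(processed)                   # (B, T, attention_dim)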
Example #6
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold

        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        self.use_mmi = hparams.use_mmi

        self.prenet = Prenet(in_dim=hparams.n_mel_channels *
                             hparams.n_frames_per_step,
                             sizes=[hparams.prenet_dim, hparams.prenet_dim],
                             initscheme=hparams.initscheme,
                             activation=hparams.activation)

        self.attention_rnn = nn.LSTMCell(input_size=hparams.prenet_dim +
                                         hparams.encoder_embedding_dim,
                                         hidden_size=hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            attention_rnn_dim=hparams.attention_rnn_dim,
            embedding_dim=hparams.encoder_embedding_dim,
            attention_dim=hparams.attention_dim,
            attention_location_n_filters=hparams.attention_location_n_filters,
            attention_location_kernel_size=hparams.attention_location_kernel_size,
            initscheme=hparams.initscheme)

        self.decoder_rnn = nn.LSTMCell(input_size=hparams.attention_rnn_dim +
                                       hparams.encoder_embedding_dim,
                                       hidden_size=hparams.decoder_rnn_dim,
                                       bias=True)

        lp_out_dim = hparams.decoder_rnn_dim if self.use_mmi else hparams.n_mel_channels * hparams.n_frames_per_step

        self.mel_layer = None
        if not self.use_mmi:
            self.linear_projection = LinearNorm(
                in_dim=hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
                out_dim=lp_out_dim,
                bias=True,
                initscheme=hparams.initscheme)
        else:
            self.linear_projection = nn.Sequential(
                LinearNorm(in_dim=hparams.decoder_rnn_dim +
                           hparams.encoder_embedding_dim,
                           out_dim=lp_out_dim,
                           bias=True,
                           initscheme=hparams.initscheme,
                           nonlinearity="relu"),
                nn.ReLU(),
                nn.Dropout(p=0.5),
            )

            self.mel_layer = nn.Sequential(
                LinearNorm(in_dim=hparams.decoder_rnn_dim,
                           out_dim=hparams.decoder_rnn_dim,
                           bias=True,
                           initscheme=hparams.initscheme,
                           nonlinearity="relu"), nn.ReLU(), nn.Dropout(p=0.5),
                LinearNorm(in_dim=hparams.decoder_rnn_dim,
                           out_dim=hparams.n_mel_channels *
                           hparams.n_frames_per_step))

        gate_in_dim = hparams.decoder_rnn_dim if self.use_mmi else \
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim

        self.gate_layer = LinearNorm(in_dim=gate_in_dim,
                                     out_dim=1,
                                     bias=True,
                                     nonlinearity="sigmoid")
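The decoder pulls every setting from an hparams object, so instantiating it only requires a namespace with the fields read above. The values below are placeholders in the spirit of typical Tacotron 2 configurations, not the ones used by the original project:

from types import SimpleNamespace

hparams = SimpleNamespace(
    n_mel_channels=80,
    n_frames_per_step=1,
    encoder_embedding_dim=512,
    attention_rnn_dim=1024,
    decoder_rnn_dim=1024,
    prenet_dim=256,
    max_decoder_steps=1000,
    gate_threshold=0.5,
    p_attention_dropout=0.1,
    p_decoder_dropout=0.1,
    use_mmi=False,
    attention_dim=128,
    attention_location_n_filters=32,
    attention_location_kernel_size=31,
    initscheme="xavier_uniform",
    activation="relu",
)

decoder = Decoder(hparams)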