def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
             attention_location_n_filters, attention_location_kernel_size,
             initscheme="xavier_uniform"):
    super(Attention, self).__init__()
    # Project the attention-RNN query and the encoder memory into a shared
    # attention space before scoring.
    self.query_layer = LinearNorm(in_dim=attention_rnn_dim,
                                  out_dim=attention_dim,
                                  bias=False,
                                  initscheme=initscheme,
                                  nonlinearity="tanh")
    self.memory_layer = LinearNorm(in_dim=embedding_dim,
                                   out_dim=attention_dim,
                                   bias=False,
                                   initscheme=initscheme,
                                   nonlinearity="tanh")
    # Produces one scalar energy per encoder time step.
    self.v = LinearNorm(in_dim=attention_dim,
                        out_dim=1,
                        bias=False,
                        initscheme=initscheme)
    # Location-sensitive features computed from the attention weight history.
    self.location_layer = LocationLayer(
        attention_n_filters=attention_location_n_filters,
        attention_kernel_size=attention_location_kernel_size,
        attention_dim=attention_dim,
        initscheme=initscheme)
    self.score_mask_value = -float("inf")
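# Hedged sketch, not taken from this repository: the projections created above are
# typically combined into a Tacotron 2-style location-sensitive score. The argument
# names, shapes, and the assumption that location_layer returns (B, T, attention_dim)
# are illustrative, not confirmed by this file.
def _attention_energies_sketch(attn, query, processed_memory, attention_weights_cat):
    # attn: an Attention instance as initialised above
    # query: (B, attention_rnn_dim); processed_memory: (B, T, attention_dim)
    # attention_weights_cat: (B, 2, T) -- previous and cumulative attention weights
    processed_query = attn.query_layer(query.unsqueeze(1))           # (B, 1, attention_dim)
    processed_location = attn.location_layer(attention_weights_cat)  # (B, T, attention_dim)
    energies = attn.v(torch.tanh(
        processed_query + processed_location + processed_memory))    # (B, T, 1)
    return energies.squeeze(-1)                                      # (B, T)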
def __init__(self, query_dim, key_dim, num_units, num_heads):
    super().__init__()
    self.num_units = num_units
    self.num_heads = num_heads
    self.key_dim = key_dim
    self.W_query = LinearNorm(query_dim, num_units, bias=False)
    self.W_key = LinearNorm(key_dim, num_units, bias=False)
    self.W_value = LinearNorm(key_dim, num_units, bias=False)
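# Hedged sketch, not taken from this repository: a conventional forward pass for the
# multi-head attention initialised above (project, split num_units across num_heads,
# score with scaled dot product). The split/stack layout and the key_dim scaling are
# assumptions about the forward pass.
def _multihead_attention_sketch(mha, query, key):
    # query: (B, T_q, query_dim); key: (B, T_k, key_dim)
    q = mha.W_query(query)   # (B, T_q, num_units)
    k = mha.W_key(key)       # (B, T_k, num_units)
    v = mha.W_value(key)     # (B, T_k, num_units)

    split = mha.num_units // mha.num_heads
    q = torch.stack(torch.split(q, split, dim=2), dim=0)  # (h, B, T_q, num_units / h)
    k = torch.stack(torch.split(k, split, dim=2), dim=0)  # (h, B, T_k, num_units / h)
    v = torch.stack(torch.split(v, split, dim=2), dim=0)  # (h, B, T_k, num_units / h)

    scores = torch.matmul(q, k.transpose(2, 3)) / (mha.key_dim ** 0.5)  # (h, B, T_q, T_k)
    scores = torch.softmax(scores, dim=3)
    out = torch.matmul(scores, v)                                       # (h, B, T_q, num_units / h)
    return torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)      # (B, T_q, num_units)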
def __init__(self, in_dim, sizes, initscheme='xavier_uniform', activation="relu"):
    super(Prenet, self).__init__()
    in_sizes = [in_dim] + sizes[:-1]
    layers = []
    for in_size, out_size in zip(in_sizes, sizes):
        layers.extend([
            LinearNorm(in_size, out_size, bias=False,
                       initscheme=initscheme, nonlinearity="linear"),
            activation_func(activation)
        ])
    self.layers = nn.ModuleList(layers)
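# Hedged sketch, not taken from this repository: the pre-net above is just a stack of
# linear/activation pairs, so its forward pass can simply thread the input through
# self.layers. (The reference Tacotron 2 pre-net also applies dropout here, even at
# inference time, but that is not visible from this constructor.)
def _prenet_forward_sketch(prenet, x):
    # x: (B, in_dim) -> (B, sizes[-1])
    for layer in prenet.layers:
        x = layer(x)
    return x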
def __init__(self, hparams, dropout=0.5):
    super().__init__()
    self.device = torch.device(
        "cpu" if not torch.cuda.is_available() else hparams.device)
    vocab_size = get_ctc_symbols_length(hparams.charset)
    decoder_dim = hparams.decoder_rnn_dim
    self.use_gaf = hparams.use_gaf
    # Non-linear projection of decoder states before the CTC classifier.
    self.proj = torch.nn.Sequential(
        LinearNorm(decoder_dim, decoder_dim, bias=True,
                   initscheme="xavier_uniform", nonlinearity="relu"),
        torch.nn.ReLU(),
        torch.nn.Dropout(p=dropout))
    self.ctc_proj = LinearNorm(decoder_dim, vocab_size, bias=True)
    # The last symbol index is reserved for the CTC blank.
    self.ctc = torch.nn.CTCLoss(blank=vocab_size - 1,
                                reduction="none",
                                zero_infinity=True)
    self.to(self.device)
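# Hedged sketch, not taken from this repository: how the modules above are typically
# chained when computing a CTC loss over decoder states. The argument names
# (decoder_outputs, text_targets, and their length tensors) are assumptions; the
# (T, B, C) layout is what torch.nn.CTCLoss expects for its log-probabilities.
def _ctc_loss_sketch(module, decoder_outputs, output_lengths, text_targets, text_lengths):
    # decoder_outputs: (B, T, decoder_rnn_dim)
    hidden = module.proj(decoder_outputs)                   # (B, T, decoder_rnn_dim)
    log_probs = module.ctc_proj(hidden).log_softmax(dim=2)  # (B, T, vocab_size)
    log_probs = log_probs.transpose(0, 1)                   # (T, B, vocab_size)
    # reduction="none" gives one loss value per batch element.
    return module.ctc(log_probs, text_targets, output_lengths, text_lengths)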
def __init__(self, attention_n_filters, attention_kernel_size, attention_dim,
             initscheme="xavier_uniform"):
    super(LocationLayer, self).__init__()
    self.location_conv = ConvNorm(
        dimensions=1,
        in_channels=2,
        out_channels=attention_n_filters,
        kernel_size=attention_kernel_size,
        padding=int((attention_kernel_size - 1) / 2),
        bias=False,
        stride=1,
        dilation=1,
        initscheme=initscheme)
    self.location_dense = LinearNorm(
        in_dim=attention_n_filters,
        out_dim=attention_dim,
        bias=False,
        initscheme=initscheme,
        nonlinearity='tanh')
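# Hedged sketch, not taken from this repository: the conventional forward for a
# location layer wired like the one above -- convolve over the two stacked attention
# weight vectors, then project each time step into the attention space.
def _location_layer_sketch(loc, attention_weights_cat):
    # attention_weights_cat: (B, 2, T)
    processed = loc.location_conv(attention_weights_cat)  # (B, attention_n_filters, T)
    processed = processed.transpose(1, 2)                 # (B, T, attention_n_filters)
    return loc.location_dense(processed)                  # (B, T, attention_dim)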
def __init__(self, hparams):
    super(Decoder, self).__init__()
    self.n_mel_channels = hparams.n_mel_channels
    self.n_frames_per_step = hparams.n_frames_per_step
    self.encoder_embedding_dim = hparams.encoder_embedding_dim
    self.attention_rnn_dim = hparams.attention_rnn_dim
    self.decoder_rnn_dim = hparams.decoder_rnn_dim
    self.prenet_dim = hparams.prenet_dim
    self.max_decoder_steps = hparams.max_decoder_steps
    self.gate_threshold = hparams.gate_threshold
    self.p_attention_dropout = hparams.p_attention_dropout
    self.p_decoder_dropout = hparams.p_decoder_dropout
    self.use_mmi = hparams.use_mmi

    self.prenet = Prenet(
        in_dim=hparams.n_mel_channels * hparams.n_frames_per_step,
        sizes=[hparams.prenet_dim, hparams.prenet_dim],
        initscheme=hparams.initscheme,
        activation=hparams.activation)

    self.attention_rnn = nn.LSTMCell(
        input_size=hparams.prenet_dim + hparams.encoder_embedding_dim,
        hidden_size=hparams.attention_rnn_dim)

    self.attention_layer = Attention(
        attention_rnn_dim=hparams.attention_rnn_dim,
        embedding_dim=hparams.encoder_embedding_dim,
        attention_dim=hparams.attention_dim,
        attention_location_n_filters=hparams.attention_location_n_filters,
        attention_location_kernel_size=hparams.attention_location_kernel_size,
        initscheme=hparams.initscheme)

    self.decoder_rnn = nn.LSTMCell(
        input_size=hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
        hidden_size=hparams.decoder_rnn_dim,
        bias=True)

    # With MMI enabled, the projection outputs decoder-sized hidden states and a
    # separate mel_layer maps them to mel frames; otherwise mels are predicted directly.
    lp_out_dim = (hparams.decoder_rnn_dim if self.use_mmi
                  else hparams.n_mel_channels * hparams.n_frames_per_step)
    self.mel_layer = None

    if not self.use_mmi:
        self.linear_projection = LinearNorm(
            in_dim=hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            out_dim=lp_out_dim,
            bias=True,
            initscheme=hparams.initscheme)
    else:
        self.linear_projection = nn.Sequential(
            LinearNorm(in_dim=hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
                       out_dim=lp_out_dim,
                       bias=True,
                       initscheme=hparams.initscheme,
                       nonlinearity="relu"),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )
        self.mel_layer = nn.Sequential(
            LinearNorm(in_dim=hparams.decoder_rnn_dim,
                       out_dim=hparams.decoder_rnn_dim,
                       bias=True,
                       initscheme=hparams.initscheme,
                       nonlinearity="relu"),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            LinearNorm(in_dim=hparams.decoder_rnn_dim,
                       out_dim=hparams.n_mel_channels * hparams.n_frames_per_step))

    # The stop-token gate sees the raw projection output in MMI mode, otherwise the
    # concatenated decoder state and attention context.
    gate_in_dim = (hparams.decoder_rnn_dim if self.use_mmi
                   else hparams.decoder_rnn_dim + hparams.encoder_embedding_dim)
    self.gate_layer = LinearNorm(in_dim=gate_in_dim,
                                 out_dim=1,
                                 bias=True,
                                 nonlinearity="sigmoid")
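# Hedged sketch, not taken from this repository: the usual ordering of one decode step
# with the sub-modules created above (Tacotron 2 style). The state keys, the Attention
# call signature, and the omission of the dropout applied to the RNN outputs are all
# assumptions made for illustration.
def _decode_step_sketch(dec, prenet_output, state):
    # prenet_output: (B, prenet_dim); state: dict of recurrent/attention tensors.
    cell_input = torch.cat((prenet_output, state["attention_context"]), dim=-1)
    state["attention_hidden"], state["attention_cell"] = dec.attention_rnn(
        cell_input, (state["attention_hidden"], state["attention_cell"]))

    # Location-sensitive attention over the encoder memory.
    weights_cat = torch.cat((state["attention_weights"].unsqueeze(1),
                             state["attention_weights_cum"].unsqueeze(1)), dim=1)
    state["attention_context"], state["attention_weights"] = dec.attention_layer(
        state["attention_hidden"], state["memory"], state["processed_memory"],
        weights_cat, state["mask"])
    state["attention_weights_cum"] = state["attention_weights_cum"] + state["attention_weights"]

    decoder_input = torch.cat((state["attention_hidden"], state["attention_context"]), dim=-1)
    state["decoder_hidden"], state["decoder_cell"] = dec.decoder_rnn(
        decoder_input, (state["decoder_hidden"], state["decoder_cell"]))

    hidden = torch.cat((state["decoder_hidden"], state["attention_context"]), dim=-1)
    decoder_output = dec.linear_projection(hidden)
    # Matches the dimensions set in __init__: with MMI, mel frames come from
    # mel_layer and the gate sees the projection output; otherwise the projection
    # itself is the mel prediction and the gate sees the concatenated state.
    mel_output = dec.mel_layer(decoder_output) if dec.use_mmi else decoder_output
    gate_prediction = dec.gate_layer(decoder_output if dec.use_mmi else hidden)
    return mel_output, gate_prediction, state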