def lstm(self, lstm_in, nb_samples, scope_name, test):
    """ Apply 3-layered LSTM """
    if self.unidirectional:
        lstm_hidden_size = self.hidden_size
    else:
        lstm_hidden_size = self.hidden_size // 2
    h = nn.Variable((self.nb_layers, self.nb_of_directions,
                     nb_samples, lstm_hidden_size), need_grad=False)
    c = nn.Variable((self.nb_layers, self.nb_of_directions,
                     nb_samples, lstm_hidden_size), need_grad=False)
    h.data.zero()
    c.data.zero()
    lstm_out, _, _ = PF.lstm(lstm_in, h, c,
                             num_layers=self.nb_layers,
                             bidirectional=not self.unidirectional,
                             training=not test,
                             dropout=0.4,
                             name=scope_name)
    return lstm_out
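# A minimal standalone sketch (not taken from the model above) of the
# PF.lstm call pattern that the helper wraps: zero-initialized recurrent
# states of shape (num_layers, num_directions, batch, hidden). The sizes
# below (seq_len, batch, in_dim, ...) are illustrative assumptions.
import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF

seq_len, batch, in_dim = 10, 4, 32
nb_layers, hidden_size = 3, 64

x = nn.Variable((seq_len, batch, in_dim))
h = nn.Variable((nb_layers, 2, batch, hidden_size // 2), need_grad=False)
c = nn.Variable((nb_layers, 2, batch, hidden_size // 2), need_grad=False)
h.data.zero()
c.data.zero()

y, hn, cn = PF.lstm(x, h, c, num_layers=nb_layers, bidirectional=True,
                    training=True, dropout=0.4, name='blstm')
# y: (seq_len, batch, num_directions * (hidden_size // 2)) = (10, 4, 64)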
def call(self, inputs):
    r"""Encoder layer.

    Args:
        inputs (nn.Variable): An input variable of shape (B, T) indicating
            indices of character embeddings.

    Returns:
        nn.Variable: Output variable of shape (T, B, C).
    """
    hp = self._hparams
    with nn.parameter_scope('embeddings'):
        val = np.sqrt(6.0 / (len(hp.vocab) + hp.symbols_embedding_dim))
        inputs = PF.embed(
            inputs, n_inputs=len(hp.vocab),
            n_features=hp.symbols_embedding_dim,
            initializer=UniformInitializer(lim=(-val, val))
        )  # (B, T, C=512)

    with nn.parameter_scope('ngrams'):
        out = inputs
        for i in range(hp.encoder_n_convolutions):
            with nn.parameter_scope(f'filter_{i}'):
                # channel_last convolution keeps the layout (B, T, C=512)
                out = conv_norm(out,
                                out_channels=hp.encoder_embedding_dim,
                                kernel_size=hp.encoder_kernel_size,
                                padding=(hp.encoder_kernel_size - 1) // 2,
                                bias=False, stride=1, dilation=1,
                                w_init_gain='relu', scope='conv_norm',
                                channel_last=True)
                out = PF.batch_normalization(out, batch_stat=self.training,
                                             axes=[2])
                out = F.relu(out)
                if self.training:
                    out = F.dropout(out, 0.5)

    with nn.parameter_scope('lstm_encoder'):
        out = F.transpose(out, (1, 0, 2))  # (B, T, C) --> (T, B, C)
        # states for a single-layer bidirectional LSTM:
        # (num_layers=1, num_directions=2, B, C // 2)
        h = F.constant(shape=(1, 2, hp.batch_size,
                              hp.encoder_embedding_dim // 2))
        c = F.constant(shape=(1, 2, hp.batch_size,
                              hp.encoder_embedding_dim // 2))
        out, _, _ = PF.lstm(out, h, c, training=self.training,
                            bidirectional=True)

    return out  # (T, B, C=512)
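# A minimal standalone sketch (sizes are illustrative assumptions) of the
# shape bookkeeping in the encoder's final step: a (B, T, C) feature map is
# transposed to (T, B, C) and fed to a single-layer bidirectional LSTM whose
# per-direction hidden size is C // 2, so the concatenated output keeps
# width C.
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

B, T, C = 2, 50, 512

feats = nn.Variable((B, T, C))       # stand-in for the conv-stack output
seq = F.transpose(feats, (1, 0, 2))  # (B, T, C) -> (T, B, C)

h = F.constant(shape=(1, 2, B, C // 2))
c = F.constant(shape=(1, 2, B, C // 2))
out, _, _ = PF.lstm(seq, h, c, bidirectional=True, training=False,
                    name='lstm_encoder')  # (T, B, C)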
def lstm(self, lstm_in, nb_samples, scope_name):
    ''' Apply 3-layered LSTM '''
    h = F.constant(shape=(self.nb_layers, self.nb_of_directions,
                          nb_samples, self.hidden_size // 2))
    c = F.constant(shape=(self.nb_layers, self.nb_of_directions,
                          nb_samples, self.hidden_size // 2))
    lstm_out, _, _ = PF.lstm(lstm_in, h, c,
                             num_layers=self.nb_layers,
                             bidirectional=True,
                             training=not self.test,
                             dropout=0.4,
                             name=scope_name)
    return lstm_out
def stack_lstm(x, prev_h, prev_c, state_size):
    """ Stacked LSTMs. Consists of 2 layers inside. """
    lstm_size = prev_h[0].shape[1]
    next_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(len(prev_h))]
    next_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(len(prev_c))]
    for layer_id, (_h, _c) in enumerate(zip(prev_h, prev_c)):
        # The first layer consumes x; deeper layers consume the hidden
        # state produced by the layer below.
        inputs = x if layer_id == 0 else next_h[layer_id - 1]
        with nn.parameter_scope(str(layer_id)):
            curr_h, curr_c = PF.lstm(inputs, _h, _c, state_size)
        next_h[layer_id] = curr_h
        next_c[layer_id] = curr_c
    return next_h, next_c
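# A minimal sketch (the sizes are illustrative assumptions, not from the
# snippet above) of how the per-layer state lists that stack_lstm expects
# could be prepared: one (1, state_size) hidden and cell state per stacked
# layer, zero-initialized; stack_lstm reads the width from prev_h[0].shape[1].
import nnabla as nn

num_layers, state_size = 2, 32

prev_h = [nn.Variable([1, state_size], need_grad=True)
          for _ in range(num_layers)]
prev_c = [nn.Variable([1, state_size], need_grad=True)
          for _ in range(num_layers)]
for v in prev_h + prev_c:
    v.data.zero()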
def test_pf_lstm_execution(g_rng, inshape, w0_init, w_init, b_init,
                           num_layers, dropout, bidirectional, with_bias,
                           hidden_size, training, fix_parameters, rng,
                           ctx, func_name):
    with nn.context_scope(ctx):
        if func_name == "LSTM":
            pytest.skip("Not implemented in CPU.")

        num_directions = 2 if bidirectional else 1
        w0_shape = (num_directions, 4, hidden_size, inshape[2] + hidden_size)
        w_shape = (max(1, num_layers - 1), num_directions, 4, hidden_size,
                   num_directions * hidden_size + hidden_size)
        b_shape = (num_layers, num_directions, 4, hidden_size)

        w0_init = process_param_init(w0_init, w0_shape, g_rng)
        w_init = process_param_init(w_init, w_shape, g_rng)
        b_init = process_param_init(b_init, b_shape, g_rng)
        rng = process_rng(rng)

        kw = {}
        insert_if_not_none(kw, 'w0_init', w0_init)
        insert_if_not_none(kw, 'w_init', w_init)
        insert_if_not_none(kw, 'b_init', b_init)
        insert_if_not_default(kw, 'num_layers', num_layers, 1)
        insert_if_not_default(kw, 'dropout', dropout, 0.0)
        insert_if_not_default(kw, 'bidirectional', bidirectional, False)
        insert_if_not_default(kw, 'training', training, True)
        insert_if_not_none(kw, 'rng', rng)
        insert_if_not_default(kw, 'with_bias', with_bias, True)
        insert_if_not_default(kw, 'fix_parameters', fix_parameters, False)

        x = nn.Variable.from_numpy_array(g_rng.randn(*inshape))
        h = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1],
                          hidden_size)))
        c = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1],
                          hidden_size)))

        # Check execution
        y, hn, cn = PF.lstm(x, h, c, **kw)
        y.forward()
        if training:
            y.backward()

        # Check values
        # TODO

        # Check args
        assert y.parent.info.type_name == 'LSTM'
        args = y.parent.info.args

        # Check created parameters
        assert y.parent.inputs[0] == x
        assert y.parent.inputs[1] == h
        assert y.parent.inputs[2] == c
        w0 = nn.get_parameters()['lstm/weight_l0']
        assert w0.shape == w0_shape
        assert w0.need_grad
        assert y.parent.inputs[3].need_grad == (not fix_parameters)
        if isinstance(w0_init, np.ndarray):
            assert np.allclose(w0_init, w0.d)
        if num_layers > 1:
            w = nn.get_parameters()['lstm/weight']
            assert w.shape == w_shape
            assert w.need_grad
            assert y.parent.inputs[4].need_grad == (not fix_parameters)
            if isinstance(w_init, np.ndarray):
                assert np.allclose(w_init, w.d)
        if with_bias:
            b = nn.get_parameters()['lstm/bias']
            assert b.shape == b_shape
            assert b.need_grad
            if num_layers > 1:
                assert y.parent.inputs[5].need_grad == (not fix_parameters)
            else:
                assert y.parent.inputs[4].need_grad == (not fix_parameters)
            if isinstance(b_init, np.ndarray):
                assert np.allclose(b_init, b.d)
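# A small helper (hypothetical name, not part of the test suite) that mirrors
# the shape formulas checked by the test above, for working out the expected
# LSTM parameter shapes for a given configuration.
def expected_lstm_param_shapes(input_size, hidden_size, num_layers=1,
                               bidirectional=False):
    """Return (w0_shape, w_shape, b_shape) using the test's formulas."""
    num_directions = 2 if bidirectional else 1
    w0_shape = (num_directions, 4, hidden_size, input_size + hidden_size)
    w_shape = (max(1, num_layers - 1), num_directions, 4, hidden_size,
               num_directions * hidden_size + hidden_size)
    b_shape = (num_layers, num_directions, 4, hidden_size)
    return w0_shape, w_shape, b_shape


# e.g. a 2-layer bidirectional LSTM over 28-dim inputs with 10 units:
# w0: (2, 4, 10, 38), w: (1, 2, 4, 10, 30), b: (2, 2, 4, 10)
print(expected_lstm_param_shapes(28, 10, num_layers=2, bidirectional=True))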
def call(self, memory, decoder_inputs=None):
    r"""Return mel-spectrograms, gate outputs and an attention matrix.

    Args:
        memory (nn.Variable): A 3D tensor of shape (B, T, C).
        decoder_inputs (nn.Variable, optional): A 3D tensor of shape
            (B, T/r, r*n_mels). Shifted log melspectrogram of sound files.
            Defaults to None.

    Returns:
        nn.Variable: The synthetic mel-spectrograms of shape (B, Ty/r, r*n_mels).
        nn.Variable: The gate outputs of shape (B, Ty).
        nn.Variable: The attention matrix of shape (B, Tx, Ty).
    """
    hp = self._hparams
    mel_shape = hp.n_mels * hp.r

    # initialize decoder states
    decoder_input = F.constant(shape=(hp.batch_size, 1, mel_shape))
    decoder_hidden = F.constant(shape=(1, 1, hp.batch_size,
                                       hp.decoder_rnn_dim))
    decoder_cell = F.constant(shape=(1, 1, hp.batch_size,
                                     hp.decoder_rnn_dim))

    # initialize attention states
    attention_weights = F.constant(shape=(hp.batch_size, 1, hp.text_len))
    attention_weights_cum = F.constant(shape=(hp.batch_size, 1, hp.text_len))
    attention_context = F.constant(shape=(hp.batch_size, 1,
                                          hp.encoder_embedding_dim))
    attention_hidden = F.constant(shape=(1, 1, hp.batch_size,
                                         hp.attention_rnn_dim))
    attention_cell = F.constant(shape=(1, 1, hp.batch_size,
                                       hp.attention_rnn_dim))

    # store outputs
    mel_outputs, gate_outputs, alignments = [], [], []

    for i in range(hp.mel_len):
        if i > 0:
            decoder_input = (mel_outputs[-1] if decoder_inputs is None
                             else decoder_inputs[:, i - 1:i, :])
            if decoder_inputs is None:
                decoder_input = decoder_input[None, ...]

        # decoder_input of shape (B, 1, prenet_channels=256)
        decoder_input = prenet(decoder_input, hp.prenet_channels,
                               is_training=self.training, scope='prenet')

        with nn.parameter_scope('attention_rnn'):
            # cell_input of shape (B, 1, prenet_channels[-1] + C=768)
            cell_input = F.concatenate(decoder_input, attention_context,
                                       axis=2)
            _, attention_hidden, attention_cell = PF.lstm(
                F.transpose(cell_input, (1, 0, 2)),
                attention_hidden, attention_cell,
                training=self.training, name='lstm_attention'
            )  # (1, 1, B, attention_rnn_dim), (1, 1, B, attention_rnn_dim)

            if self.training:
                attention_hidden = F.dropout(attention_hidden,
                                             hp.p_attention_dropout)

        with nn.parameter_scope('location_attention'):
            attention_weights_cat = F.concatenate(attention_weights,
                                                  attention_weights_cum,
                                                  axis=1)
            attention_context, attention_weights = location_sensitive_attention(
                F.transpose(attention_hidden[0], (1, 0, 2)), memory,
                attention_weights_cat,
                attention_location_kernel_size=hp.attention_location_kernel_size,
                attention_n_filters=hp.attention_location_n_filters,
                attention_dim=hp.attention_dim,
                is_training=self.training, scope='ls_attention')
            attention_weights_cum += attention_weights
            alignments.append(attention_weights)

        with nn.parameter_scope('decoder_rnn'):
            # (1, B, attention_rnn_dim + encoder_embedding_dim)
            inp_decoder = F.concatenate(
                attention_hidden[0],
                F.transpose(attention_context, (1, 0, 2)), axis=2)
            _, decoder_hidden, decoder_cell = PF.lstm(
                inp_decoder, decoder_hidden, decoder_cell,
                training=self.training, name='lstm_decoder')
            if self.training:
                decoder_hidden = F.dropout(decoder_hidden,
                                           hp.p_decoder_dropout)

        with nn.parameter_scope('projection'):
            # (B, decoder_rnn_dim + encoder_embedding_dim)
            proj_input = F.concatenate(
                decoder_hidden[0, 0],
                F.reshape(attention_context, (hp.batch_size, -1),
                          inplace=False), axis=1)
            decoder_output = affine_norm(proj_input, mel_shape, base_axis=1,
                                         with_bias=True,
                                         w_init_gain='affine',
                                         scope='affine')
            mel_outputs.append(decoder_output)

        with nn.parameter_scope('gate_prediction'):
            gate_prediction = affine_norm(proj_input, 1, base_axis=1,
                                          with_bias=True,
                                          w_init_gain='sigmoid',
                                          scope='affine')
            gate_outputs.append(gate_prediction)

    mel_outputs = F.stack(*mel_outputs, axis=1)          # (B, T2, n_mels*r)
    gate_outputs = F.concatenate(*gate_outputs, axis=1)  # (B, T2)
    alignments = F.concatenate(*alignments, axis=1)      # (B, T1, T2)

    return mel_outputs, gate_outputs, alignments
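# A minimal sketch (sizes are illustrative assumptions) of how the decoder
# loop's per-step outputs are combined at the end: F.stack inserts a new time
# axis for the mel frames, while F.concatenate extends an existing axis for
# the gate logits.
import nnabla as nn
import nnabla.functions as F

B, T2, D = 2, 5, 240

steps = [nn.Variable((B, D)) for _ in range(T2)]   # one mel frame per step
gates = [nn.Variable((B, 1)) for _ in range(T2)]   # one gate logit per step

mel = F.stack(*steps, axis=1)         # (B, T2, D)
gate = F.concatenate(*gates, axis=1)  # (B, T2)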
def __call__(self, x, test=False):
    # x = PF.mean_subtraction(x, base_axis=0)
    if not self.input_is_spectrogram:
        x = Spectrogram(*STFT(x, n_fft=self.n_fft, n_hop=self.n_hop),
                        power=self.power, mono=(self.nb_channels == 1))

    nb_frames, nb_samples, nb_channels, nb_bins = x.shape

    mix = x
    x = x[..., :self.nb_bins]

    # apply input scaling
    x += F.reshape(self.input_mean, shape=(1, 1, 1, self.nb_bins),
                   inplace=False)
    x *= F.reshape(self.input_scale, shape=(1, 1, 1, self.nb_bins),
                   inplace=False)

    with nn.parameter_scope("fc1"):
        x = PF.affine(x, self.hidden_size, base_axis=2)
        x = PF.batch_normalization(x, batch_stat=not test)
        x = F.tanh(x)

    with nn.parameter_scope("lstm"):
        if self.unidirectional:
            lstm_hidden_size = self.hidden_size
        else:
            lstm_hidden_size = self.hidden_size // 2

        h = nn.Variable((self.nb_layers, self.nb_of_directions,
                         nb_samples, lstm_hidden_size), need_grad=False)
        h.d = np.zeros(h.shape)
        c = nn.Variable((self.nb_layers, self.nb_of_directions,
                         nb_samples, lstm_hidden_size), need_grad=False)
        c.d = np.zeros(c.shape)
        lstm_out, _, _ = PF.lstm(x, h, c, num_layers=self.nb_layers,
                                 bidirectional=not self.unidirectional,
                                 training=not test)

    x = F.concatenate(x, lstm_out)  # concatenate along last axis

    with nn.parameter_scope("fc2"):
        x = PF.affine(x, self.hidden_size, base_axis=2)
        x = PF.batch_normalization(x, batch_stat=not test)
        x = F.relu(x)

    with nn.parameter_scope("fc3"):
        x = PF.affine(x, (nb_channels, nb_bins), base_axis=2)
        x = PF.batch_normalization(x, batch_stat=not test)

    x = x.reshape((nb_frames, nb_samples, nb_channels, self.nb_output_bins))

    # apply output scaling
    x *= F.reshape(self.output_scale, shape=(1, 1, 1, self.nb_output_bins),
                   inplace=False)
    x += F.reshape(self.output_mean, shape=(1, 1, 1, self.nb_output_bins),
                   inplace=False)

    x = F.relu(x) * mix

    return x
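# A minimal sketch (sizes are illustrative assumptions, and the batch-norm /
# tanh steps of the model above are omitted) of the LSTM skip connection used
# here: the LSTM output is concatenated with its own input along the feature
# axis, doubling the width before the next affine layer projects it back.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

T, N, H = 20, 3, 512   # frames, batch, hidden width
nb_layers = 3

x = nn.Variable((T, N, H))  # stand-in for the "fc1" block output

h = nn.Variable((nb_layers, 2, N, H // 2), need_grad=False)
c = nn.Variable((nb_layers, 2, N, H // 2), need_grad=False)
h.d = np.zeros(h.shape)
c.d = np.zeros(c.shape)

lstm_out, _, _ = PF.lstm(x, h, c, num_layers=nb_layers, bidirectional=True,
                         training=False)
y = F.concatenate(x, lstm_out, axis=2)  # (T, N, 2 * H)
y = PF.affine(y, H, base_axis=2)        # back to (T, N, H)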