def cbhg(inputs, K, projections, depth, is_training, scope):
    r"""Returns the 1D convolution bank + highway network + bidirectional GRU (CBHG) module.

    Args:
        inputs (nn.Variable): NNabla Variable of shape (B, C, T).
        K (int): Maximum kernel size of the convolution bank.
        projections (list of int): Output channels of the two projection layers.
        depth (int): Output depth of the module. This must be an even number.
        is_training (bool): Whether training mode is activated.
        scope (str): The parameter scope name.

    Returns:
        nn.Variable: Output variable of shape (T, B, depth).
    """
    with nn.parameter_scope(scope):
        # Convolution bank: concatenate channels from all 1D convolutions
        with nn.parameter_scope('conv_bank'):
            conv = partial(conv1d, inputs, channels=128, activation=F.relu,
                           is_training=is_training)
            conv_outputs = [
                conv(kernel_size=k, scope=f'conv1d_{k}') for k in range(1, K + 1)
            ]
            conv_outputs = F.concatenate(*conv_outputs, axis=1)

        # pad the time axis so that the input to max_pooling is valid
        x = F.pad(conv_outputs, (0,) * 5 + (1,), mode='constant')

        # Max pooling: reshape is needed because nnabla does not support 1D pooling
        maxpool_output = F.max_pooling(
            x.reshape(x.shape + (1,)), kernel=(2, 1),
            stride=(1, 1)).reshape(conv_outputs.shape)

        # Two projection layers
        proj1_output = conv1d(maxpool_output, kernel_size=3,
                              channels=projections[0], activation=F.relu,
                              is_training=is_training, scope='proj_1')
        proj2_output = conv1d(proj1_output, kernel_size=3,
                              channels=projections[1], activation=None,
                              is_training=is_training, scope='proj_2')

        # Residual connection
        highway_input = proj2_output + inputs

        assert depth % 2 == 0
        half_depth = depth // 2

        with nn.parameter_scope('highwaynet'):
            # transpose to shape (B, T, C)
            highway_input = F.transpose(highway_input, (0, 2, 1))

            # Handle dimensionality mismatch
            if highway_input.shape[2] != half_depth:
                highway_input = PF.affine(highway_input, half_depth,
                                          base_axis=2, name='adjust_dim')

            # 4-layer highway network
            for i in range(4):
                highway_input = highwaynet(highway_input, half_depth,
                                           scope=f'highway_{i+1}')

        with nn.parameter_scope('rnn_net'):
            # transpose to shape (T, B, C)
            rnn_input = F.transpose(highway_input, (1, 0, 2))
            outputs, _ = PF.gru(
                rnn_input,
                F.constant(shape=(2, 2, rnn_input.shape[1], half_depth)),
                training=is_training, bidirectional=True)  # (T, B, C)

    return outputs
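# The block below is an illustrative sketch (not part of the original module) of the
# reshape trick used in cbhg() for 1D max pooling: nnabla's F.max_pooling operates on
# the trailing spatial axes, so a singleton axis is appended, 2D pooling with kernel
# (2, 1) is applied, and the result is reshaped back. The shapes are arbitrary
# assumptions chosen only for this example.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable.from_numpy_array(
    np.random.randn(2, 128, 40).astype(np.float32))     # (B, C, T)
x_pad = F.pad(x, (0, 1), mode='constant')                # pad T by one frame on the right
pooled = F.max_pooling(F.reshape(x_pad, x_pad.shape + (1,)),
                       kernel=(2, 1), stride=(1, 1))     # (B, C, T, 1)
pooled = F.reshape(pooled, x.shape)                      # back to (B, C, T)
pooled.forward()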
def test_pf_gru_execution(g_rng, inshape, w0_init, w_init, b_init, num_layers,
                          dropout, bidirectional, with_bias, hidden_size,
                          training, fix_parameters, rng, ctx, func_name):
    with nn.context_scope(ctx):
        if func_name == "GRU":
            pytest.skip("Not implemented in CPU.")

        num_directions = 2 if bidirectional else 1
        w0_shape = (num_directions, 3, hidden_size, inshape[2] + hidden_size)
        w_shape = (max(1, num_layers - 1), num_directions, 3, hidden_size,
                   num_directions * hidden_size + hidden_size)
        b_shape = (num_layers, num_directions, 4, hidden_size)

        w0_init = process_param_init(w0_init, w0_shape, g_rng)
        w_init = process_param_init(w_init, w_shape, g_rng)
        b_init = process_param_init(b_init, b_shape, g_rng)
        rng = process_rng(rng)

        kw = {}
        insert_if_not_none(kw, 'w0_init', w0_init)
        insert_if_not_none(kw, 'w_init', w_init)
        insert_if_not_none(kw, 'b_init', b_init)
        insert_if_not_default(kw, 'num_layers', num_layers, 1)
        insert_if_not_default(kw, 'dropout', dropout, 0.0)
        insert_if_not_default(kw, 'bidirectional', bidirectional, False)
        insert_if_not_default(kw, 'training', training, True)
        insert_if_not_none(kw, 'rng', rng)
        insert_if_not_default(kw, 'with_bias', with_bias, True)
        insert_if_not_default(kw, 'fix_parameters', fix_parameters, False)

        x = nn.Variable.from_numpy_array(g_rng.randn(*inshape))
        h = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1], hidden_size)))

        # Check execution
        y, hn = PF.gru(x, h, **kw)
        y.forward()
        if training:
            y.backward()

        # Check values
        # TODO

        # Check args
        assert y.parent.info.type_name == 'GRU'
        args = y.parent.info.args

        # Check created parameters
        assert y.parent.inputs[0] == x
        assert y.parent.inputs[1] == h
        w0 = nn.get_parameters()['gru/weight_l0']
        assert w0.shape == w0_shape
        assert w0.need_grad
        assert y.parent.inputs[2].need_grad == (not fix_parameters)
        if isinstance(w0_init, np.ndarray):
            assert np.allclose(w0_init, w0.d)
        if num_layers > 1:
            w = nn.get_parameters()['gru/weight']
            assert w.shape == w_shape
            assert w.need_grad
            assert y.parent.inputs[3].need_grad == (not fix_parameters)
            if isinstance(w_init, np.ndarray):
                assert np.allclose(w_init, w.d)
        if with_bias:
            b = nn.get_parameters()['gru/bias']
            assert b.shape == b_shape
            assert b.need_grad
            if num_layers > 1:
                assert y.parent.inputs[4].need_grad == (not fix_parameters)
            else:
                assert y.parent.inputs[3].need_grad == (not fix_parameters)
            if isinstance(b_init, np.ndarray):
                assert np.allclose(b_init, b.d)
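# A minimal, standalone sketch (not part of the test suite) of the PF.gru call
# exercised above, showing the expected tensor layouts: x is (seq_len, batch,
# input_size) and h is (num_layers, num_directions, batch, hidden_size). The
# sizes are arbitrary assumptions, and a cuDNN context is assumed to be
# available, since the test above skips the pure-CPU case for GRU.
import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF
from nnabla.ext_utils import get_extension_context

nn.set_default_context(get_extension_context('cudnn', device_id='0'))

seq_len, batch, input_size, hidden_size = 5, 3, 4, 6
num_layers, num_directions = 1, 1  # single unidirectional layer

x = nn.Variable.from_numpy_array(
    np.random.randn(seq_len, batch, input_size).astype(np.float32))
h = nn.Variable.from_numpy_array(
    np.random.randn(num_layers, num_directions, batch,
                    hidden_size).astype(np.float32))

with nn.parameter_scope('gru_example'):
    # y: (seq_len, batch, hidden_size), hn: same shape as h
    y, hn = PF.gru(x, h)

y.forward()
print(y.shape, hn.shape)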
def call(self, memory, inputs=None):
    r"""Return mel-spectrogram and attention matrix.

    Args:
        memory (nn.Variable): A 3D tensor of shape (T, B, C).
        inputs (nn.Variable, optional): A 3D tensor of shape (B, T/r, n_mels*r).
            Shifted log mel-spectrogram of sound files. Defaults to None.

    Returns:
        nn.Variable: The synthetic mel-spectrograms of shape (B, Ty/r, r*n_mels).
        nn.Variable: The attention matrix of shape (B, Ty/r, Tx).

    References:
        - https://github.com/Kyubyong/tacotron/
    """
    hp = self._hparams
    bz, mel_shape = hp.batch_size, hp.n_mels * hp.r
    encoder_dim = hp.encoder_embedding_dim

    # initialize the input tensor
    input = F.constant(shape=(bz, 1, mel_shape))

    # initialize the hidden states
    context = F.constant(shape=(bz, 1, hp.attention_dim))
    hidden = F.constant(shape=(1, 1, bz, encoder_dim))
    h_gru = [
        F.constant(shape=(1, 1, bz, encoder_dim)),
        F.constant(shape=(1, 1, bz, encoder_dim))
    ]

    outputs, attends = [], []

    for i in range(hp.n_frames):
        if i > 0:
            # free running uses the previous prediction; teacher forcing uses
            # the previous ground-truth frame
            input = (outputs[-1] if inputs is None
                     else inputs[:, i - 1:i, :])

        # feed the input to the prenet
        input = prenet(input, layer_sizes=hp.prenet_channels,
                       is_training=self.training,
                       scope='prenet_decoder')  # (bz, 1, C)

        # concatenate the input and the context vector
        input = F.concatenate(input, context)  # (bz, 1, 384)

        with nn.parameter_scope('rnn_attention'):
            # compute the attention-RNN output
            output, hidden = PF.gru(
                input.reshape((1, bz, -1)), hidden, training=self.training,
                bidirectional=False)  # (1, bz, 256), (1, 1, bz, 256)

        # compute the context and attention vectors
        context, attend = Bahdanau_attention(
            F.transpose(hidden[0], (1, 0, 2)), memory,
            out_features=hp.attention_dim,
            scope='Bahdanau_attention')  # (bz, 1, 256), (bz, 1, T)

        with nn.parameter_scope('rnn_decoder'):
            # concatenate the RNN output and the attention context vector
            with nn.parameter_scope('project_to_decoder'):
                output = F.concatenate(
                    output, F.transpose(context, (1, 0, 2)), axis=2)
                output = PF.affine(output, encoder_dim,
                                   base_axis=2)  # (1, bz, 256)

            # decoder RNN with residual connections
            for j in range(2):
                with nn.parameter_scope(f'gru_residual_{j}'):
                    out, h_gru[j] = PF.gru(output, h_gru[j],
                                           training=self.training,
                                           bidirectional=False)
                    output += out  # (1, bz, 256)

            # projection to mels
            with nn.parameter_scope('project_to_mel'):
                output = F.transpose(output, (1, 0, 2))  # (bz, 1, n_mels*r)
                output = PF.affine(output, mel_shape, base_axis=2)

        outputs.append(output)
        attends.append(attend)

    outputs = F.concatenate(*outputs, axis=1)  # (B, T2, C2)
    attends = F.concatenate(*attends, axis=1)  # (B, T2, T1)

    return outputs, attends
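# A hedged usage sketch for the decoder call above. The class name `Decoder`, its
# constructor, the `HParams` container, and all hyperparameter values are
# hypothetical and chosen only for illustration; only the tensor layouts follow
# the docstring: `memory` is (T, B, C) and the teacher-forcing `inputs` are
# (B, T/r, n_mels*r).
import nnabla as nn


class HParams:
    # hypothetical hyperparameter container mirroring the fields read in call()
    batch_size = 2
    n_mels = 80
    r = 5
    n_frames = 10
    attention_dim = 256
    encoder_embedding_dim = 256
    prenet_channels = (256, 128)


hp = HParams()
memory = nn.Variable((30, hp.batch_size, hp.encoder_embedding_dim))    # encoder outputs, (T, B, C)
teacher = nn.Variable((hp.batch_size, hp.n_frames, hp.n_mels * hp.r))  # shifted targets, (B, T/r, n_mels*r)

decoder = Decoder(hp)  # hypothetical constructor of the class defining call()
mels, attention = decoder.call(memory, inputs=teacher)
print(mels.shape)       # (B, n_frames, n_mels*r)
print(attention.shape)  # (B, n_frames, T)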