Example 1
def cbhg(inputs, K, projections, depth, is_training, scope):
    r"""Returns the 1D Convolution Bank Highwaynet bindirectional
    GRU (CBHG) module.

    Args:
        inputs (nn.Variable): NNabla Variable of shape (B, C, T).
        K (int): Maximum kernel size.
        projections (list of int): A list of channels.
        depth (int): Output depth of the module (the bidirectional GRU
            produces `depth` channels, `depth // 2` per direction). Must be
            an even number.
        is_training (bool): Whether training mode is activated.
        scope (str): The parameter scope name.

    Returns:
        nn.Variable: Output variable of shape (T, B, depth).
    """

    with nn.parameter_scope(scope):
        # Convolution bank: concatenate channels from all 1D convolutions
        with nn.parameter_scope('conv_bank'):
            conv = partial(conv1d,
                           inputs,
                           channels=128,
                           activation=F.relu,
                           is_training=is_training)
            conv_outputs = [
                conv(kernel_size=k, scope=f'conv1d_{k}')
                for k in range(1, K + 1)
            ]
            conv_outputs = F.concatenate(*conv_outputs, axis=1)

        # pad the time axis so that max pooling with kernel 2 and stride 1
        # preserves the original sequence length
        x = F.pad(conv_outputs, (0, ) * 5 + (1, ), mode='constant')

        # Max pooling: reshape is needed because nnabla does not support 1D pooling
        maxpool_output = F.max_pooling(x.reshape(x.shape + (1, )),
                                       kernel=(2, 1),
                                       stride=(1,
                                               1)).reshape(conv_outputs.shape)

        # Two projection layers:
        proj1_output = conv1d(maxpool_output,
                              kernel_size=3,
                              channels=projections[0],
                              activation=F.relu,
                              is_training=is_training,
                              scope='proj_1')
        proj2_output = conv1d(proj1_output,
                              kernel_size=3,
                              channels=projections[1],
                              activation=None,
                              is_training=is_training,
                              scope='proj_2')

        # Residual connection:
        highway_input = proj2_output + inputs
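        # (this elementwise sum requires projections[1] to match the number
        # of input channels C)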

        assert depth % 2 == 0
        half_depth = depth // 2

        with nn.parameter_scope('highwaynet'):
            # transposing to shape (B, T, C)
            highway_input = F.transpose(highway_input, (0, 2, 1))

            # Handle dimensionality mismatch:
            if highway_input.shape[2] != half_depth:
                highway_input = PF.affine(highway_input,
                                          half_depth,
                                          base_axis=2,
                                          name='adjust_dim')

            # 4-layer HighwayNet:
            for i in range(4):
                highway_input = highwaynet(highway_input,
                                           half_depth,
                                           scope=f'highway_{i+1}')

        with nn.parameter_scope('rnn_net'):
            # transpose to shape (T, B, C)
            rnn_input = F.transpose(highway_input, (1, 0, 2))
            outputs, _ = PF.gru(rnn_input,
                                F.constant(shape=(2, 2, rnn_input.shape[1],
                                                  half_depth)),
                                training=is_training,
                                bidirectional=True)  # (T, B, C)

    return outputs
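
A minimal usage sketch for the module above (the hyperparameter values are
illustrative, and the conv1d and highwaynet helpers from the same module are
assumed to be in scope):

import nnabla as nn

x = nn.Variable((32, 128, 100))  # dummy (B, C, T) feature map
y = cbhg(x, K=16, projections=[128, 128], depth=256,
         is_training=True, scope='cbhg_encoder')
print(y.shape)  # (T, B, depth), per the bidirectional GRU output above
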
Example 2
def test_pf_gru_execution(g_rng, inshape, w0_init, w_init, b_init, num_layers,
                          dropout, bidirectional, with_bias, hidden_size,
                          training, fix_parameters, rng, ctx, func_name):

    with nn.context_scope(ctx):
        if func_name == "GRU":
            pytest.skip("Not implemented in CPU.")

        num_directions = 2 if bidirectional else 1
        w0_shape = (num_directions, 3, hidden_size, inshape[2] + hidden_size)
        w_shape = (max(1, num_layers - 1), num_directions, 3, hidden_size,
                   num_directions * hidden_size + hidden_size)
        b_shape = (num_layers, num_directions, 4, hidden_size)
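        # (layer 0 consumes the raw input, so its weights have last dimension
        # inshape[2] + hidden_size; deeper layers consume the previous layer's
        # possibly bidirectional output, hence
        # num_directions * hidden_size + hidden_size)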

        w0_init = process_param_init(w0_init, w0_shape, g_rng)
        w_init = process_param_init(w_init, w_shape, g_rng)
        b_init = process_param_init(b_init, b_shape, g_rng)
        rng = process_rng(rng)

        kw = {}
        insert_if_not_none(kw, 'w0_init', w0_init)
        insert_if_not_none(kw, 'w_init', w_init)
        insert_if_not_none(kw, 'b_init', b_init)
        insert_if_not_default(kw, 'num_layers', num_layers, 1)
        insert_if_not_default(kw, 'dropout', dropout, 0.0)
        insert_if_not_default(kw, 'bidirectional', bidirectional, False)
        insert_if_not_default(kw, 'training', training, True)
        insert_if_not_none(kw, 'rng', rng)
        insert_if_not_default(kw, 'with_bias', with_bias, True)
        insert_if_not_default(kw, 'fix_parameters', fix_parameters, False)

        x = nn.Variable.from_numpy_array(g_rng.randn(*inshape))
        h = nn.Variable.from_numpy_array(
            g_rng.randn(*(num_layers, num_directions, inshape[1],
                          hidden_size)))

        # Check execution
        y, hn = PF.gru(x, h, **kw)
        y.forward()
        if training:
            y.backward()

        # Check values
        # TODO

        # Check args
        assert y.parent.info.type_name == 'GRU'
        args = y.parent.info.args

        # Check created parameters
        assert y.parent.inputs[0] == x
        assert y.parent.inputs[1] == h
        w0 = nn.get_parameters()['gru/weight_l0']
        assert w0.shape == w0_shape
        assert w0.need_grad
        assert y.parent.inputs[2].need_grad == (not fix_parameters)
        if isinstance(w0_init, np.ndarray):
            assert np.allclose(w0_init, w0.d)
        if num_layers > 1:
            w = nn.get_parameters()['gru/weight']
            assert w.shape == w_shape
            assert w.need_grad
            assert y.parent.inputs[3].need_grad == (not fix_parameters)
            if isinstance(w_init, np.ndarray):
                assert np.allclose(w_init, w.d)
        if with_bias:
            b = nn.get_parameters()['gru/bias']
            assert b.shape == b_shape
            assert b.need_grad
            if num_layers > 1:
                assert y.parent.inputs[4].need_grad == (not fix_parameters)
            else:
                assert y.parent.inputs[3].need_grad == (not fix_parameters)
            if isinstance(b_init, np.ndarray):
                assert np.allclose(b_init, b.d)
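
For reference, a standalone sketch of the PF.gru call and the layer-0 weight
shape asserted above (the sizes are illustrative; per the skip at the top of
the test, forward execution may require a non-CPU backend, so only the graph
is built here):

import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF

seq_len, batch, in_dim, hidden_size = 5, 2, 3, 4
x = nn.Variable.from_numpy_array(np.random.randn(seq_len, batch, in_dim))
h = nn.Variable.from_numpy_array(
    np.random.randn(1, 1, batch, hidden_size))  # (num_layers, num_directions, B, H)
y, hn = PF.gru(x, h, training=False)

w0 = nn.get_parameters()['gru/weight_l0']
print(w0.shape)  # (num_directions, 3, hidden_size, in_dim + hidden_size) = (1, 3, 4, 7)
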
Example 3
    def call(self, memory, inputs=None):
        r"""Return mel-spectrogram and attention matrix.

        Args:
            memory (nn.Variable): A 3D tensor of shape (T, B, C).
            inputs (nn.Variable, optional): A 3D tensor of shape
                (B, T/r, n_mels*r) holding the shifted log mel-spectrogram
                of the target audio, used for teacher forcing.
                Defaults to None.

        Returns:
            nn.Variable: The synthetic mel-spectrograms of shape
                (B, Ty/r, r*n_mels).
            nn.Variable: The attention matrix of shape
                (B, Tx, Ty).

        References:
            - https://github.com/Kyubyong/tacotron/
        """
        hp = self._hparams
        bz, mel_shape = hp.batch_size, hp.n_mels * hp.r
        encoder_dim = hp.encoder_embedding_dim

        # initialize the decoder input with an all-zero <GO> frame
        input = F.constant(shape=(bz, 1, mel_shape))

        # initialize hidden states
        context = F.constant(shape=(bz, 1, hp.attention_dim))
        hidden = F.constant(shape=(1, 1, bz, encoder_dim))
        h_gru = [
            F.constant(shape=(1, 1, bz, encoder_dim)),
            F.constant(shape=(1, 1, bz, encoder_dim))
        ]

        outputs, attends = [], []

        for i in range(hp.n_frames):
            if i > 0:
                # teacher forcing: feed the previous target frame when ground
                # truth `inputs` is given, otherwise the previous prediction
                input = (outputs[-1] if inputs is None
                         else inputs[:, i - 1:i, :])

            # pass the input through the prenet
            input = prenet(input,
                           layer_sizes=hp.prenet_channels,
                           is_training=self.training,
                           scope='prenet_decoder')  # (bz, 1, C)

            # concat the input and context vector
            input = F.concatenate(input, context)  # (bz, 1, 384)

            with nn.parameter_scope('rnn_attention'):
                # calculate the output
                output, hidden = PF.gru(
                    input.reshape((1, bz, -1)),
                    hidden,
                    training=self.training,
                    bidirectional=False)  # (1, bz, 256), (1, 1, bz, 256)

            # compute the context and attention vectors
            context, attend = Bahdanau_attention(
                F.transpose(hidden[0], (1, 0, 2)),
                memory,
                out_features=hp.attention_dim,
                scope='Bahdanau_attention')  # (bz, 1, 256), (bz, 1, T)

            with nn.parameter_scope('rnn_decoder'):
                # concat RNN output and attention context vector
                with nn.parameter_scope('project_to_decoder'):
                    output = F.concatenate(output,
                                           F.transpose(context, (1, 0, 2)),
                                           axis=2)
                    output = PF.affine(output, encoder_dim,
                                       base_axis=2)  # (1, bz, 256)

                # decoder RNN with residual connection
                for j in range(2):
                    with nn.parameter_scope(f'gru_resisidual_{j}'):
                        out, h_gru[j] = PF.gru(output,
                                               h_gru[j],
                                               training=self.training,
                                               bidirectional=False)
                        output += out  # (1, bz, 256)

                # project the output to mel frames
                with nn.parameter_scope('project_to_mel'):
                    output = F.transpose(output, (1, 0, 2))
                    # (bz, 1, n_mels*r)
                    output = PF.affine(output, mel_shape, base_axis=2)

            outputs.append(output)
            attends.append(attend)

        outputs = F.concatenate(*outputs, axis=1)  # (B, T2, C2)
        attends = F.concatenate(*attends, axis=1)  # (B, T2, T1)

        return outputs, attends
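
A hedged usage sketch of the method above. The enclosing class (called Decoder
here) and the hparams object are assumptions for illustration; only the call()
signature and the hp.* fields referenced above come from the source:

import nnabla as nn

decoder = Decoder(hparams)  # hypothetical: the decoder module this method belongs to
hp = decoder._hparams
memory = nn.Variable((120, hp.batch_size, hp.encoder_embedding_dim))  # (T, B, C)

# Inference (free running): each step consumes the previously predicted frame.
mels, attn = decoder.call(memory)

# Training (teacher forcing): feed the shifted ground-truth mel frames.
targets = nn.Variable((hp.batch_size, hp.n_frames, hp.n_mels * hp.r))
mels_tf, attn_tf = decoder.call(memory, inputs=targets)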