Esempio n. 1
0
 def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                     rnn_layer_size, use_gru, share_rnn_weights):
     """Build the input data layers and the DeepSpeech2 network graph.

     Stores the resulting unnormalized log-probability layer on
     ``self._log_probs`` and the CTC loss layer on ``self._loss``.
     """
     # dense_array accepts variable-sized batches; 161 * 161 is only a
     # placeholder — the real input shape is induced during training.
     spectrogram_input = paddle.layer.data(
         name="audio_spectrogram",
         type=paddle.data_type.dense_array(161 * 161))
     transcript_input = paddle.layer.data(
         name="transcript_text",
         type=paddle.data_type.integer_value_sequence(vocab_size))
     self._log_probs, self._loss = deep_speech_v2_network(
         audio_data=spectrogram_input,
         text_data=transcript_input,
         dict_size=vocab_size,
         num_conv_layers=num_conv_layers,
         num_rnn_layers=num_rnn_layers,
         rnn_size=rnn_layer_size,
         use_gru=use_gru,
         share_rnn_weights=share_rnn_weights)
Esempio n. 2
0
    def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                        rnn_layer_size, use_gru, share_rnn_weights):
        """Create data layers and model network.

        Builds the audio/text/offset/length input layers plus one
        per-conv-layer index-range layer, then wires them into
        ``deep_speech_v2_network``. Stores the resulting unnormalized
        log-probability layer on ``self._log_probs`` and the CTC loss
        layer on ``self._loss``.

        :param vocab_size: Decoding vocabulary size (dict size of labels).
        :param num_conv_layers: Number of convolution layers.
        :param num_rnn_layers: Number of RNN layers (one index-range data
                               layer is created per RNN layer).
        :param rnn_layer_size: Hidden size of each RNN layer.
        :param use_gru: Use GRU cells instead of simple RNN cells.
        :param share_rnn_weights: Share forward/backward RNN weights.
        """
        # paddle.data_type.dense_array is used for variable batch input.
        # The size 161 * 161 is only a placeholder value and the real shape
        # of input batch data will be induced during training.
        audio_data = paddle.layer.data(name="audio_spectrogram",
                                       type=paddle.data_type.dense_array(161 *
                                                                         161))
        text_data = paddle.layer.data(
            name="transcript_text",
            type=paddle.data_type.integer_value_sequence(vocab_size))
        seq_offset_data = paddle.layer.data(
            name='sequence_offset',
            type=paddle.data_type.integer_value_sequence(1))
        seq_len_data = paddle.layer.data(
            name='sequence_length',
            type=paddle.data_type.integer_value_sequence(1))
        index_range_datas = []
        # Use `range` rather than the Python-2-only `xrange`: it behaves
        # identically here and avoids a NameError under Python 3.
        for i in range(num_rnn_layers):
            index_range_datas.append(
                paddle.layer.data(name='conv%d_index_range' % i,
                                  type=paddle.data_type.dense_vector(6)))

        self._log_probs, self._loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            seq_offset_data=seq_offset_data,
            seq_len_data=seq_len_data,
            index_range_datas=index_range_datas,
            dict_size=vocab_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_layer_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
    def create_network(self, is_infer=False):
        """Create data layers and model network.

        :param is_infer: Whether to build the network for inference
                         (no transcript labels) instead of training.
        :type is_infer: bool
        :return reader: Reader for input data — a ``DataLoader`` when
                        training, a ``DataFeeder`` when inferring.
        :rtype reader: DataLoader | DataFeeder
        :return log_probs: An output unnormalized log probability layer.
        :rtype log_probs: Variable
        :return loss: A CTC loss layer.
        :rtype loss: Variable
        """

        if not is_infer:
            # Training path: declare all four inputs (audio, labels,
            # lengths, conv masks) and feed them through a DataLoader.
            # text_data has lod_level=1 — it is a variable-length sequence;
            # the other three are plain dense tensors (lod_level=0).
            input_fields = {
                'names': ['audio_data', 'text_data', 'seq_len_data', 'masks'],
                'shapes': [[None, 161, None], [None, 1], [None, 1], [None, 32, 81, None]],
                'dtypes': ['float32', 'int32', 'int64', 'float32'],
                'lod_levels': [0, 1, 0, 0]
            }

            inputs = [
                fluid.data(name=input_fields['names'][i],
                           shape=input_fields['shapes'][i],
                           dtype=input_fields['dtypes'][i],
                           lod_level=input_fields['lod_levels'][i])
                for i in range(len(input_fields['names']))
            ]

            reader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                        capacity=128,
                                                        iterable=False,
                                                        use_double_buffer=True)

            (audio_data, text_data, seq_len_data, masks) = inputs
        else:
            # Inference path: no transcript labels are available, so
            # text_data is None and input is fed via a DataFeeder instead
            # of a DataLoader.
            audio_data = fluid.data(name='audio_data',
                                    shape=[None, 161, None],
                                    dtype='float32',
                                    lod_level=0)
            seq_len_data = fluid.data(name='seq_len_data',
                                      shape=[None, 1],
                                      dtype='int64',
                                      lod_level=0)
            masks = fluid.data(name='masks',
                               shape=[None, 32, 81, None],
                               dtype='float32',
                               lod_level=0)
            text_data = None
            reader = fluid.DataFeeder([audio_data, seq_len_data, masks], self._place)

        # Wire the declared inputs into the DeepSpeech2 network; model
        # hyper-parameters come from instance attributes set elsewhere.
        log_probs, loss = deep_speech_v2_network(audio_data=audio_data,
                                                 text_data=text_data,
                                                 seq_len_data=seq_len_data,
                                                 masks=masks,
                                                 dict_size=self._vocab_size,
                                                 num_conv_layers=self._num_conv_layers,
                                                 num_rnn_layers=self._num_rnn_layers,
                                                 rnn_size=self._rnn_layer_size,
                                                 use_gru=self._use_gru,
                                                 share_rnn_weights=self._share_rnn_weights)
        return reader, log_probs, loss