def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                    rnn_layer_size, use_gru, share_rnn_weights):
    """Build the input data layers and the DeepSpeech2 network.

    Wires an audio-spectrogram input and a transcript-text input into
    ``deep_speech_v2_network`` and stores the resulting layers on
    ``self._log_probs`` (unnormalized log probabilities) and
    ``self._loss`` (CTC loss).
    """
    # dense_array is used for variable-size batch input; 161 * 161 is only
    # a placeholder — the real input shape is induced during training.
    spectrogram_input = paddle.layer.data(
        name="audio_spectrogram",
        type=paddle.data_type.dense_array(161 * 161))
    transcript_input = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(vocab_size))
    self._log_probs, self._loss = deep_speech_v2_network(
        audio_data=spectrogram_input,
        text_data=transcript_input,
        dict_size=vocab_size,
        num_conv_layers=num_conv_layers,
        num_rnn_layers=num_rnn_layers,
        rnn_size=rnn_layer_size,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                    rnn_layer_size, use_gru, share_rnn_weights):
    """Create data layers and model network.

    Builds the audio/text input layers plus the auxiliary sequence-offset,
    sequence-length and per-layer index-range inputs, then wires them into
    ``deep_speech_v2_network``. The results are stored on
    ``self._log_probs`` (unnormalized log probabilities) and
    ``self._loss`` (CTC loss).
    """
    # dense_array is used for variable-size batch input; 161 * 161 is only
    # a placeholder — the real input shape is induced during training.
    audio_data = paddle.layer.data(
        name="audio_spectrogram",
        type=paddle.data_type.dense_array(161 * 161))
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(vocab_size))
    seq_offset_data = paddle.layer.data(
        name='sequence_offset',
        type=paddle.data_type.integer_value_sequence(1))
    seq_len_data = paddle.layer.data(
        name='sequence_length',
        type=paddle.data_type.integer_value_sequence(1))
    index_range_datas = []
    # Use `range` instead of the Python-2-only `xrange`: for plain
    # iteration the two are interchangeable, and `range` keeps this code
    # runnable under both Python 2 and Python 3.
    # NOTE(review): the loop count is num_rnn_layers but the input names
    # say 'conv%d_index_range' — verify against deep_speech_v2_network
    # that one index-range input per RNN layer is intended.
    for i in range(num_rnn_layers):
        index_range_datas.append(
            paddle.layer.data(
                name='conv%d_index_range' % i,
                type=paddle.data_type.dense_vector(6)))
    self._log_probs, self._loss = deep_speech_v2_network(
        audio_data=audio_data,
        text_data=text_data,
        seq_offset_data=seq_offset_data,
        seq_len_data=seq_len_data,
        index_range_datas=index_range_datas,
        dict_size=vocab_size,
        num_conv_layers=num_conv_layers,
        num_rnn_layers=num_rnn_layers,
        rnn_size=rnn_layer_size,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
def create_network(self, is_infer=False):
    """Create data layers and model network.

    :param is_infer: Whether to build the network for inference (no
                     transcript input and no CTC-loss feed) rather than
                     training.
    :type is_infer: bool
    :return reader: Reader for input data — a DataLoader when training,
                    a DataFeeder when inferring.
    :rtype reader: DataLoader|DataFeeder
    :return log_probs: An output unnormalized log probability layer.
    :rtype log_probs: Variable
    :return loss: A CTC loss layer.
    :rtype loss: Variable
    """
    if not is_infer:
        # Declare the four training-time inputs from parallel spec lists
        # (name, shape, dtype, lod_level).
        input_specs = zip(
            ['audio_data', 'text_data', 'seq_len_data', 'masks'],
            [[None, 161, None], [None, 1], [None, 1],
             [None, 32, 81, None]],
            ['float32', 'int32', 'int64', 'float32'],
            [0, 1, 0, 0])
        inputs = [
            fluid.data(name=name, shape=shape, dtype=dtype, lod_level=lod)
            for name, shape, dtype, lod in input_specs
        ]
        reader = fluid.io.DataLoader.from_generator(
            feed_list=inputs,
            capacity=128,
            iterable=False,
            use_double_buffer=True)
        audio_data, text_data, seq_len_data, masks = inputs
    else:
        # Inference feeds no transcript: only audio, lengths and masks.
        audio_data = fluid.data(
            name='audio_data',
            shape=[None, 161, None],
            dtype='float32',
            lod_level=0)
        seq_len_data = fluid.data(
            name='seq_len_data',
            shape=[None, 1],
            dtype='int64',
            lod_level=0)
        masks = fluid.data(
            name='masks',
            shape=[None, 32, 81, None],
            dtype='float32',
            lod_level=0)
        text_data = None
        reader = fluid.DataFeeder([audio_data, seq_len_data, masks],
                                  self._place)
    log_probs, loss = deep_speech_v2_network(
        audio_data=audio_data,
        text_data=text_data,
        seq_len_data=seq_len_data,
        masks=masks,
        dict_size=self._vocab_size,
        num_conv_layers=self._num_conv_layers,
        num_rnn_layers=self._num_rnn_layers,
        rnn_size=self._rnn_layer_size,
        use_gru=self._use_gru,
        share_rnn_weights=self._share_rnn_weights)
    return reader, log_probs, loss