def compile_gru_model(input_dim=161, output_dim=29, recur_layers=3,
                      nodes=1024, conv_context=11, conv_border_mode='valid',
                      conv_stride=2, initialization='glorot_uniform',
                      batch_norm=True):
    """ Build a recurrent network (CTC) for speech with GRU units

    The graph is: 1-D convolution -> (optional batch norm) -> dropout ->
    `recur_layers` GRU layers (each optionally followed by batch norm) ->
    time-distributed linear dense layer.

    Params:
        input_dim (int): Size of the last axis of the acoustic input
        output_dim (int): Number of units in the final dense layer
        recur_layers (int): Number of stacked GRU layers
        nodes (int): Number of conv filters and of units in every GRU
        conv_context (int): Filter length of the 1-D convolution
        conv_border_mode (str): Border mode passed to the convolution
        conv_stride (int): Subsample length (stride) of the convolution
        initialization (str): Weight initializer used by all layers
        batch_norm (bool): Insert BatchNormalization after the conv layer
            and after each GRU when True
    Returns:
        model: The constructed Model; it carries an extra attribute
            `conv_output_length`, a callable that forwards its argument
            plus this model's conv settings to `conv_output_length`
    """
    logger.info("Building gru model")

    # Main acoustic input
    acoustic_input = Input(shape=(None, input_dim), name='acoustic_input')

    # Convolutional front end
    conv_layer = Convolution1D(nodes, conv_context, name='conv1d',
                               border_mode=conv_border_mode,
                               subsample_length=conv_stride,
                               init=initialization,
                               activation='relu')
    hidden = conv_layer(acoustic_input)
    if batch_norm:
        hidden = BatchNormalization(name='bn_conv_1d', mode=2)(hidden)
    hidden = Dropout(.2)(hidden)

    # Recurrent stack; layer names are 1-based (rnn_1, rnn_2, ...)
    for layer_no in range(1, recur_layers + 1):
        gru_layer = GRU(nodes, activation='relu',
                        name='rnn_{}'.format(layer_no),
                        init=initialization, return_sequences=True)
        hidden = gru_layer(hidden)
        if batch_norm:
            norm_layer = BatchNormalization(
                name='bn_rnn_{}'.format(layer_no), mode=2)
            hidden = norm_layer(hidden)

    # We don't softmax here because CTC does that
    projection = Dense(output_dim, name='dense', activation='linear',
                       init=initialization)
    network_output = TimeDistributed(projection)(hidden)

    model = Model(input=acoustic_input, output=network_output)

    def _conv_length(x):
        # Bound to this model's conv hyper-parameters.
        return conv_output_length(x, conv_context, conv_border_mode,
                                  conv_stride)

    model.conv_output_length = _conv_length
    return model
def test(model, test_fn, datagen, mb_size=16, conv_context=11,
         conv_border_mode='valid', conv_stride=2):
    """ Testing routine for speech-models

    Iterates over every test minibatch, prints the ground truth next to the
    argmax-decoded prediction for each utterance, and averages the cost
    reported by `test_fn` over the batches.

    Params:
        model (keras.model): Constructed keras model (not used directly by
            this routine)
        test_fn (theano.function): A theano function that calculates the
            cost over a test set
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        test_cost (float): Average test cost over the whole test set
            (mean of the per-batch costs)
    Raises:
        ZeroDivisionError: If `datagen.iterate_test` yields no batches
    """
    total_cost = 0.0
    # Batch counter. It must stay separate from the per-utterance index used
    # in the print loop below: a `for` target stays bound after its loop, so
    # sharing one name would overwrite the count on every batch.
    num_batches = 0
    for batch in datagen.iterate_test(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        ground_truth = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        output_lengths = [
            conv_output_length(l, conv_context, conv_border_mode,
                               conv_stride)
            for l in input_lengths
        ]
        # NOTE(review): the trailing True is presumably the learning-phase
        # flag expected by test_fn -- confirm against where test_fn is built.
        predictions, ctc_cost = test_fn(
            [inputs, output_lengths, labels, label_lengths, True])
        # Swap the first two axes so that iterating yields one entry per
        # ground-truth text (assumes test_fn returns time-major output).
        predictions = np.swapaxes(predictions, 0, 1)
        for utt_idx, prediction in enumerate(predictions):
            print("Truth: {}, Prediction: {}".format(
                ground_truth[utt_idx], argmax_decode(prediction)))
        total_cost += ctc_cost
        num_batches += 1
    return total_cost / num_batches
def get_batch(self, index, size, audio_paths, texts):
    """ Assemble one zero-padded minibatch of features and integer labels

    Features are read from `self.features[index:index + size]`; transcripts
    are read from `texts[index:index + size]`.

    Params:
        index (int): Position of the first sample of the batch
        size (int): Number of samples in the batch
        audio_paths: Not used by this method
        texts (list): Transcripts; entry `index + i` labels sample `i`
    Returns:
        (inputs, outputs): `inputs` is a dict with keys 'the_input',
            'the_labels', 'input_length' and 'label_length'; `outputs` is
            `{'ctc': zeros(size)}`
    Raises:
        ValueError: If `size` is 0 (max() of an empty sequence)
    """
    sample_ids = range(index, index + size)

    # pull necessary info
    max_length = max(self.features[k].shape[0] for k in sample_ids)

    # Encode the transcripts up front and size the label matrix from the
    # sequences that are actually stored in it. Sizing it from
    # self.train_texts while filling it from `texts` breaks as soon as the
    # two differ (e.g. a label longer than the allocated row).
    encoded_labels = [
        np.array(text_to_int_sequence(texts[k])) - 1 for k in sample_ids
    ]
    max_string_length = max(len(label) for label in encoded_labels)

    # initialize the arrays
    X_data = np.zeros([size, max_length, self.feat_dim])
    # Rows are pre-filled with 28 as padding
    # (presumably the CTC blank index -- TODO confirm).
    labels = np.ones([size, max_string_length]) * 28
    input_length = np.zeros([size, 1])
    label_length = np.zeros([size, 1])

    # populate the arrays
    for row, label in enumerate(encoded_labels):
        # X_data, input_length
        feat = self.features[index + row]
        input_length[row] = feat.shape[0]
        feat = self.normalize(feat)
        X_data[row, :feat.shape[0], :] = feat

        # y, label_length
        labels[row, :len(label)] = label
        label_length[row] = len(label)

    # prepare and return the arrays
    # Convert raw frame counts into the number of timesteps left after the
    # convolution (hard-coded here: filter 11, 'valid' border, stride 2).
    input_length = np.array([
        conv_output_length(n_frames, filter_size=11, border_mode='valid',
                           stride=2)
        for n_frames in input_length
    ])
    outputs = {'ctc': np.zeros([size])}
    inputs = {
        'the_input': X_data,  # array; dim: mb_size x max_aud_length x feat_dim
        'the_labels': labels,  # array; dim: mb_size x max_string_length
        'input_length': input_length,  # one post-conv length per sample
        'label_length': label_length  # array; dim: mb_size x 1
    }
    return (inputs, outputs)
[audio_gen.features[index + i].shape[0] for i in range(0, size)]) max_string_length = max( [len(audio_gen.train_texts[index + i]) for i in range(0, size)]) # initialize the arrays X_data = np.zeros([size, max_length, audio_gen.feat_dim]) labels = np.ones([size, max_string_length]) * 28 input_length = np.zeros([size, 1]) label_length = np.zeros([size, 1]) for i in range(0, size): # X_data, input_length feat = audio_gen.features[index + i] feat = audio_gen.normalize(feat) input_length[i] = conv_output_length(max_length, filter_size=11, border_mode='valid', stride=2) X_data[i, :feat.shape[0], :] = feat # y, label_length label = np.array(text_to_int_sequence( audio_gen.train_texts[index + i])) - 1 labels[i, :len(label)] = label label_length[i] = 133 def decode_batch(test_func, audio): out = test_func([audio])[0] ret = [] for j in range(out.shape[0]): out_best = list(np.argmax(out[j, :], 1))
def test(model, test_fn, datagen, result_file, mb_size=16, conv_context=11,
         conv_border_mode='valid', conv_stride=2):
    """ Evaluate on the test set and write one JSON record per utterance

    For every utterance the prediction is decoded twice: with
    `argmax_decode` and with `prefix_beam_search` using the module-level
    `lm_model`. Edit distances against the ground truth are computed for
    both. Each record is passed to `__write_and_print` together with the
    open result file, and the overall distance rate of the beam-search
    output is printed at the end.

    Params:
        model (keras.model): Constructed keras model (not used directly by
            this routine)
        test_fn (theano.function): Function returning (predictions, cost)
            for a minibatch
        datagen (DataGenerator)
        result_file (str): Path of the file the records are written to
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        None
    """
    total_distance = 0
    total_length = 0
    # `with` guarantees the result file is closed even when decoding or
    # scoring raises part-way through the test set.
    with open(result_file, 'w') as wf:
        for batch in datagen.iterate_test(mb_size):
            inputs = batch['x']
            labels = batch['y']
            input_lengths = batch['input_lengths']
            label_lengths = batch['label_lengths']
            ground_truth = batch['texts']
            # The convolution changes the number of timesteps, so derive
            # the output lengths from the input lengths.
            output_lengths = [
                conv_output_length(l, conv_context, conv_border_mode,
                                   conv_stride)
                for l in input_lengths
            ]
            # The cost returned by test_fn is not needed here.
            predictions, _ = test_fn(
                [inputs, output_lengths, labels, label_lengths, True])
            predictions = np.swapaxes(predictions, 0, 1)
            for i, prediction in enumerate(predictions):
                truth = ground_truth[i]
                # Best result without the language model
                pre_prediction = argmax_decode(prediction)
                # Top results from the LM beam search
                # (presumably beam width 100, 3 candidates -- TODO confirm)
                preds = prefix_beam_search(
                    lm_model, matrix_same_delete(prediction), 100, 3)
                # Keep the string of the candidate with the highest score;
                # on ties the first such candidate wins.
                best_pred_str = max(preds, key=lambda pred: pred[1])[0]
                # Edit distance between the label and each decoded string
                lm_distance = edit_distance.SequenceMatcher(
                    a=truth, b=best_pred_str).distance()
                no_lm_distance = edit_distance.SequenceMatcher(
                    a=truth, b=pre_prediction).distance()
                truth_length = len(truth)
                total_distance += lm_distance
                total_length += truth_length
                # An empty transcript has no defined rate; use the same -1
                # sentinel as the overall rate instead of dividing by zero.
                content = {
                    'label': truth,
                    'text': best_pred_str,
                    'lm_distance': (float(lm_distance) / truth_length
                                    if truth_length else -1),
                    'no_lm': pre_prediction,
                    'no_lm_distance': (float(no_lm_distance) / truth_length
                                       if truth_length else -1),
                }
                __write_and_print(wf, json.dumps(content, ensure_ascii=False))
    total_distance_rate = -1 if total_length == 0 else float(
        total_distance) / total_length
    print('total_distance_rate:%s' % total_distance_rate)