Exemple #1
0
def compile_gru_model(input_dim=161,
                      output_dim=29,
                      recur_layers=3,
                      nodes=1024,
                      conv_context=11,
                      conv_border_mode='valid',
                      conv_stride=2,
                      initialization='glorot_uniform',
                      batch_norm=True):
    """ Assemble a conv -> stacked GRU -> dense network for CTC training
    Params:
        input_dim (int): Size of each input feature vector
        output_dim (int): Number of units in the final dense layer
        recur_layers (int): Number of stacked GRU layers
        nodes (int): Number of convolution filters and of units per GRU
        conv_context (int): Filter length of the 1-D convolution
        conv_border_mode (str): Border mode of the convolution
        conv_stride (int): Stride (subsample length) of the convolution
        initialization (str): Weight initializer passed to every layer
        batch_norm (bool): Insert batch normalization after the convolution
            and after every GRU layer
    Returns:
        model (keras.model): The assembled model, with an extra attribute
            ``conv_output_length`` that maps an input length to the length
            produced by the convolution
    """
    logger.info("Building gru model")

    # Variable-length sequence of input_dim-sized feature vectors
    acoustic_input = Input(shape=(None, input_dim), name='acoustic_input')

    # Front end: a single strided 1-D convolution
    features = Convolution1D(nodes,
                             conv_context,
                             name='conv1d',
                             border_mode=conv_border_mode,
                             subsample_length=conv_stride,
                             init=initialization,
                             activation='relu')(acoustic_input)
    if batch_norm:
        features = BatchNormalization(name='bn_conv_1d', mode=2)(features)
    hidden = Dropout(.2)(features)

    # Recurrent stack; layer names are numbered from 1
    for layer_no in range(1, recur_layers + 1):
        hidden = GRU(nodes,
                     activation='relu',
                     name='rnn_{}'.format(layer_no),
                     init=initialization,
                     return_sequences=True)(hidden)
        if batch_norm:
            hidden = BatchNormalization(name='bn_rnn_{}'.format(layer_no),
                                        mode=2)(hidden)

    # Linear projection only: per the original note, no softmax is applied
    # here because the CTC cost takes care of it
    projection = Dense(output_dim,
                       name='dense',
                       activation='linear',
                       init=initialization)
    network_output = TimeDistributed(projection)(hidden)

    model = Model(input=acoustic_input, output=network_output)
    # Expose the convolution's length arithmetic so callers can convert
    # input lengths to output lengths for this particular model
    model.conv_output_length = lambda x: conv_output_length(
        x, conv_context, conv_border_mode, conv_stride)
    return model
Exemple #2
0
def test(model,
         test_fn,
         datagen,
         mb_size=16,
         conv_context=11,
         conv_border_mode='valid',
         conv_stride=2):
    """ Testing routine for speech-models
    Params:
        model (keras.model): Constructed keras model (not used directly
            here; kept for interface compatibility)
        test_fn (theano.function): A theano function that calculates the cost
            over a test set
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        test_cost (float): Average test cost over the whole test set, i.e.
            the summed per-batch cost divided by the number of batches
    Raises:
        ZeroDivisionError: If the generator yields no test batches
    """
    total_cost = 0.0
    # Dedicated batch counter: the per-prediction loop below must not share
    # its index with this counter, otherwise the average is divided by the
    # size of the last batch instead of the number of batches.
    num_batches = 0
    for batch in datagen.iterate_test(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        ground_truth = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        output_lengths = [
            conv_output_length(l, conv_context, conv_border_mode, conv_stride)
            for l in input_lengths
        ]
        predictions, ctc_cost = test_fn(
            [inputs, output_lengths, labels, label_lengths, True])
        # Swap the first two axes so that iterating yields one prediction
        # per ground-truth text
        predictions = np.swapaxes(predictions, 0, 1)
        for truth, prediction in zip(ground_truth, predictions):
            print("Truth: {}, Prediction: {}".format(
                truth, argmax_decode(prediction)))
        total_cost += ctc_cost
        num_batches += 1
    return total_cost / num_batches
    def get_batch(self, index, size, audio_paths, texts):
        """ Assemble one zero-padded minibatch for the CTC model
        Params:
            index (int): Position of the first utterance of the batch in
                ``self.features`` and in ``texts``
            size (int): Number of utterances in the batch
            audio_paths: Not used by this method; kept for interface
                compatibility
            texts (list(str)): Transcriptions; entries ``index`` through
                ``index + size - 1`` are encoded as the labels
        Returns:
            (inputs, outputs): ``inputs`` is a dict with the keys
                'the_input', 'the_labels', 'input_length' and
                'label_length'; ``outputs`` is ``{'ctc': zeros(size)}``
        """
        batch_range = range(index, index + size)

        # pull necessary info
        max_length = max(self.features[k].shape[0] for k in batch_range)
        # Size the label array from the transcriptions that are actually
        # encoded below (``texts``), not from ``self.train_texts``, so the
        # padding width always matches the labels written into it.
        max_string_length = max(len(texts[k]) for k in batch_range)

        # initialize the arrays
        X_data = np.zeros([size, max_length, self.feat_dim])
        # Unused label slots keep the value 28 (presumably the index of the
        # CTC blank symbol -- TODO confirm against the model's output_dim)
        labels = np.ones([size, max_string_length]) * 28
        input_length = np.zeros([size, 1])
        label_length = np.zeros([size, 1])

        # populate the arrays
        for row, k in enumerate(batch_range):
            # X_data, input_length
            feat = self.features[k]
            input_length[row] = feat.shape[0]
            feat = self.normalize(feat)
            X_data[row, :feat.shape[0], :] = feat

            # y, label_length (the encoded characters are shifted down by 1)
            label = np.array(text_to_int_sequence(texts[k])) - 1
            labels[row, :len(label)] = label
            label_length[row] = len(label)

        # prepare and return the arrays; the raw frame counts are converted
        # to the lengths left after the convolution
        input_length = np.array([
            conv_output_length(frames,
                               filter_size=11,
                               border_mode='valid',
                               stride=2) for frames in input_length
        ])
        outputs = {'ctc': np.zeros([size])}
        inputs = {
            'the_input':
            X_data,  # array; dim: size x max_length x self.feat_dim
            'the_labels':
            labels,  # array; dim: size x max_string_length
            'input_length': input_length,  # array; one entry per utterance
            'label_length': label_length  # array; dim: size x 1
        }
        return (inputs, outputs)
    [audio_gen.features[index + i].shape[0] for i in range(0, size)])
# Script fragment that builds one padded batch from ``audio_gen``.
# NOTE(review): ``max_length``, ``index`` and ``size`` are used below but
# their definitions are not visible here (the statement that assigns
# ``max_length`` appears to be cut off just above) -- confirm in the
# original script.

# Longest transcription (in characters) among the ``size`` training texts
# starting at ``index``
max_string_length = max(
    [len(audio_gen.train_texts[index + i]) for i in range(0, size)])

# initialize the arrays
X_data = np.zeros([size, max_length, audio_gen.feat_dim])
# Unused label slots keep the value 28 (presumably the CTC blank index --
# TODO confirm)
labels = np.ones([size, max_string_length]) * 28
input_length = np.zeros([size, 1])
label_length = np.zeros([size, 1])

for i in range(0, size):
    # X_data, input_length
    feat = audio_gen.features[index + i]
    feat = audio_gen.normalize(feat)
    # NOTE(review): every row gets the post-convolution length of the
    # padded ``max_length``, not of this utterance's own ``feat.shape[0]``
    # -- confirm this is intended.
    input_length[i] = conv_output_length(max_length,
                                         filter_size=11,
                                         border_mode='valid',
                                         stride=2)
    X_data[i, :feat.shape[0], :] = feat

    # y, label_length (the encoded characters are shifted down by 1)
    label = np.array(text_to_int_sequence(
        audio_gen.train_texts[index + i])) - 1
    labels[i, :len(label)] = label
    # NOTE(review): the label length is hard-coded to 133 rather than
    # ``len(label)`` -- verify whether this is a deliberate experiment.
    label_length[i] = 133


def decode_batch(test_func, audio):
    """Run ``test_func`` on ``audio`` and take the argmax of each output row.

    ``test_func`` is called with the single-element list ``[audio]`` and the
    first element of its result is used as the output array.

    NOTE(review): the function appears to be cut off in this snippet -- in
    the visible lines ``ret`` is never appended to or returned and
    ``out_best`` is computed but not used.
    """
    out = test_func([audio])[0]
    ret = []
    for j in range(out.shape[0]):
        # Index of the largest value along axis 1 of the j-th output
        out_best = list(np.argmax(out[j, :], 1))
Exemple #5
0
def test(model,
         test_fn,
         datagen,
         result_file,
         mb_size=16,
         conv_context=11,
         conv_border_mode='valid',
         conv_stride=2):
    """ Testing routine that scores predictions by edit distance
    For every test utterance, one JSON record is written (and printed, via
    ``__write_and_print``) holding the ground truth, the best
    language-model beam-search hypothesis, the plain argmax decoding and
    both edit-distance rates. The overall rate is printed at the end.
    Params:
        model (keras.model): Constructed keras model (not used directly
            here; kept for interface compatibility)
        test_fn (theano.function): Function returning predictions and the
            CTC cost for a batch
        datagen (DataGenerator)
        result_file (str): Path of the file the JSON records are written to
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        None. Rates are reported as -1 when the reference length is 0.
    """
    total_distance = 0
    total_length = 0
    # The context manager guarantees the result file is closed even when
    # decoding or scoring raises part-way through the test set.
    with open(result_file, 'w') as wf:
        for batch in datagen.iterate_test(mb_size):
            inputs = batch['x']
            labels = batch['y']
            input_lengths = batch['input_lengths']
            label_lengths = batch['label_lengths']
            ground_truth = batch['texts']

            # The convolution changes the number of timesteps, so convert
            # the input lengths to the corresponding output lengths
            output_lengths = [
                conv_output_length(l, conv_context, conv_border_mode,
                                   conv_stride) for l in input_lengths
            ]
            predictions, ctc_cost = test_fn(
                [inputs, output_lengths, labels, label_lengths, True])

            # Swap the first two axes so that iterating yields one
            # prediction per ground-truth text
            predictions = np.swapaxes(predictions, 0, 1)
            for truth, prediction in zip(ground_truth, predictions):
                # Best single result (argmax decoding, no language model)
                pre_prediction = argmax_decode(prediction)
                # Top-three beam-search results; ``lm_model`` is expected
                # to be defined at module level
                preds = prefix_beam_search(lm_model,
                                           matrix_same_delete(prediction),
                                           100, 3)
                # Keep the hypothesis string with the highest probability
                # (the first one wins on ties)
                best_pred_str = max(preds, key=lambda pred: pred[1])[0]

                # Edit distances between the label and both decodings
                lm_distance = edit_distance.SequenceMatcher(
                    a=truth, b=best_pred_str).distance()
                no_lm_distance = edit_distance.SequenceMatcher(
                    a=truth, b=pre_prediction).distance()
                truth_length = len(truth)
                total_distance += lm_distance
                total_length += truth_length

                # Same convention as the overall rate below: float
                # division, and -1 when there is nothing to divide by
                if truth_length == 0:
                    lm_rate = no_lm_rate = -1
                else:
                    lm_rate = float(lm_distance) / truth_length
                    no_lm_rate = float(no_lm_distance) / truth_length

                content = {}
                content['label'] = truth
                content['text'] = best_pred_str
                content['lm_distance'] = lm_rate
                content['no_lm'] = pre_prediction
                content['no_lm_distance'] = no_lm_rate
                __write_and_print(wf, json.dumps(content,
                                                 ensure_ascii=False))

    total_distance_rate = -1 if total_length == 0 else float(
        total_distance) / total_length
    print('total_distance_rate:%s' % total_distance_rate)