Example #1
        print('acc: %f%%' % acc)


evaluator = Evaluate()
# model.fit_generator(gen(64), samples_per_epoch=512, nb_epoch=15,
#                     callbacks=[evaluator],
#                     )

# Test the model
characters2 = characters + ' '
[X_test, y_test, _, _], _ = next(gen(1))
y_pred = base_model.predict(X_test)
y_pred = y_pred[:, 2:, :]
out = K.get_value(
    K.ctc_decode(
        y_pred,
        input_length=np.ones(y_pred.shape[0]) * y_pred.shape[1],
    )[0][0])[:, :n_len]
# out = ''.join([characters[x] for x in out[0]])
# y_true = ''.join([characters[x] for x in y_test[0]])
#
# import pylab
# plt.imshow(X_test[0].transpose(1, 0, 2))
# plt.title('pred:' + str(out) + '\ntrue: ' + str(y_true))
# pylab.show()

argmax = np.argmax(y_pred, axis=2)[0]
print(list(zip(argmax, ''.join([characters2[x] for x in argmax]))))

# Compute the model's overall accuracy
print(evaluate(base_model))
model.save('model.h5')
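# For orientation across the examples below: a minimal sketch (shapes invented,
# not taken from any example) of what K.ctc_decode returns -- a pair of
# (list of decoded label tensors, log-probability tensor), where each decoded
# row is padded with -1.
import numpy as np
from keras import backend as K

y_pred_demo = np.random.rand(2, 16, 10).astype('float32')               # (batch, timesteps, classes)
input_length_demo = np.full(y_pred_demo.shape[0], y_pred_demo.shape[1]) # decode the full time axis
decoded_list, log_probs = K.ctc_decode(y_pred_demo, input_length_demo, greedy=True)
best = K.get_value(decoded_list[0])                                     # (batch, max_len), -1-padded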
Example #2
def create_model(params, gpu=False, two_rnns=False):

    input_data = Input(name="input",
                       shape=params["input_shape"],
                       dtype="float32")
    conv1 = Conv2D(
        params["conv_filters"],
        params["kernel_size"],
        padding="same",
        activation=params["act"],
        kernel_initializer="he_normal",
        name="conv1",
    )(input_data)
    conv1 = MaxPooling2D(pool_size=(params["pool_size"], params["pool_size"]),
                         name="max1")(conv1)
    conv1 = Dropout(0.2)(conv1)
    conv2 = Conv2D(
        params["conv_filters"],
        params["kernel_size"],
        padding="same",
        activation=params["act"],
        kernel_initializer="he_normal",
        name="conv2",
    )(conv1)
    conv2 = MaxPooling2D(pool_size=(params["pool_size"], params["pool_size"]),
                         name="max2")(conv2)
    conv2 = Dropout(0.2)(conv2)

    # conv1shape = (img_w // (pool_size ** (num_convs - 1)),
    #                     (img_h // (pool_size ** (num_convs - 1))) * conv_filters)
    conv2shape = (
        params["img_w"] // (params["pool_size"]**params["num_convs"]),
        (params["img_h"] // (params["pool_size"]**params["num_convs"])) *
        params["conv_filters"],
    )

    # Failed attempt to do a skip connection

    # conv1 = Reshape(target_shape=conv1shape)(conv1)
    # conv2 = Reshape(target_shape=conv2shape)(conv2)
    # inner = concatenate([conv1, conv2], axis=2)

    inner = Reshape(target_shape=conv2shape, name="reshape")(conv2)

    # cuts down input size going into RNN:
    inner = Dense(params["time_dense_size"],
                  activation=params["act"],
                  name="dense1")(inner)

    if gpu:
        gru_1 = CuDNNGRU(
            params["rnn1_size"],
            return_sequences=True,
            kernel_initializer="he_normal",
            name="gru1",
        )(inner)
        gru_1b = CuDNNGRU(
            params["rnn1_size"],
            return_sequences=True,
            go_backwards=True,
            kernel_initializer="he_normal",
            name="gru1_b",
        )(inner)
    else:
        gru_1 = GRU(
            params["rnn1_size"],
            return_sequences=True,
            kernel_initializer="he_normal",
            name="gru1",
            reset_after=True,
            recurrent_activation="sigmoid",
        )(inner)
        gru_1b = GRU(
            params["rnn1_size"],
            return_sequences=True,
            go_backwards=True,
            kernel_initializer="he_normal",
            name="gru1_b",
            reset_after=True,
            recurrent_activation="sigmoid",
        )(inner)

    gru1_merged = add([gru_1, gru_1b])
    if two_rnns:
        if gpu:
            gru_2 = CuDNNGRU(
                params["rnn2_size"],
                return_sequences=True,
                kernel_initializer="he_normal",
                name="gru2",
            )(gru1_merged)
            gru_2b = CuDNNGRU(
                params["rnn2_size"],
                return_sequences=True,
                go_backwards=True,
                kernel_initializer="he_normal",
                name="gru2_b",
            )(gru1_merged)
        else:
            gru_2 = GRU(
                params["rnn2_size"],
                return_sequences=True,
                kernel_initializer="he_normal",
                name="gru2",
                reset_after=True,
                recurrent_activation="sigmoid",
            )(gru1_merged)
            gru_2b = GRU(
                params["rnn2_size"],
                return_sequences=True,
                go_backwards=True,
                kernel_initializer="he_normal",
                name="gru2_b",
                reset_after=True,
                recurrent_activation="sigmoid",
            )(gru1_merged)

    # transforms RNN output to character activations:
    if two_rnns:
        inner = Dense(params["output_size"],
                      kernel_initializer="he_normal",
                      name="dense2")(concatenate([gru_2, gru_2b]))
    else:
        inner = Dense(params["output_size"],
                      kernel_initializer="he_normal",
                      name="dense2")(gru1_merged)

    y_pred = Activation("softmax", name="softmax")(inner)
    output_labels = Input(name="the_labels",
                          shape=[params["max_string_len"]],
                          dtype="float32")
    input_lengths = Input(name="input_length", shape=[1], dtype="int64")
    label_lengths = Input(name="label_length", shape=[1], dtype="int64")

    # Keras doesn't currently support loss functions with extra parameters,
    # so CTC loss is implemented in a Lambda layer below.
    def ctc_lambda_func(args):
        y_pred, labels, input_length, label_length = args
        # the 2 is critical here since the first couple outputs of the RNN
        # tend to be garbage:
        y_pred = y_pred[:, params["ctc_cut"]:, :]
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

    loss_out = Lambda(ctc_lambda_func, output_shape=(1, ), name="ctc")(
        [y_pred, output_labels, input_lengths, label_lengths])

    train_model = Model(
        inputs=[input_data, output_labels, input_lengths, label_lengths],
        outputs=loss_out)

    top_k_dec_list, _ = K.ctc_decode(
        y_pred[:, params["ctc_cut"]:, :],
        K.squeeze(input_lengths, axis=1),
        greedy=False,
        top_paths=3,
    )
    decoder0 = K.function([input_data, input_lengths], [top_k_dec_list[0]])
    decoder1 = K.function([input_data, input_lengths], [top_k_dec_list[1]])
    decoder2 = K.function([input_data, input_lengths], [top_k_dec_list[2]])
    decoder_models = decoder0, decoder1, decoder2

    return train_model, decoder_models
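# Hypothetical usage sketch for create_model() above. Every value in `params`
# below is an invented placeholder chosen so the shapes line up; none of these
# settings come from the original project.
import numpy as np

params = {
    "input_shape": (128, 32, 1), "conv_filters": 16, "kernel_size": (3, 3),
    "pool_size": 2, "num_convs": 2, "act": "relu", "img_w": 128, "img_h": 32,
    "time_dense_size": 32, "rnn1_size": 256, "rnn2_size": 256,
    "output_size": 63, "max_string_len": 16, "ctc_cut": 2,
}
train_model, (dec_top1, dec_top2, dec_top3) = create_model(params)

X_batch = np.zeros((4,) + params["input_shape"], dtype="float32")   # dummy images
time_steps = params["img_w"] // params["pool_size"] ** params["num_convs"]
lengths = np.full((4, 1), time_steps - params["ctc_cut"])           # steps fed to CTC
best_paths = dec_top1([X_batch, lengths])[0]                        # (batch, max_len), -1-padded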
Example #3
def predict_text(model, recs_all, recs_len, img_all, img_name=None):
    texts = []
    img_list = []
    width_list = []
    img_index = 0

    # FIXME: turn recs_len into cumulative sums of the preceding lengths
    for i in range(len(recs_len)):
        if i > 0:
            recs_len[i] += recs_len[i - 1]

    for i in range(len(recs_all)):
        for j in range(len(recs_len)):
            if i < recs_len[j]:
                img_index = j
                break

        img_rec = dumpRotateImage(img_all[img_index], recs_all[i]).convert('L')

        scale = img_rec.size[1] * 1.0 / 32
        if scale <= 0:
            continue

        w = int(img_rec.size[0] / scale)

        # FIXME: scaled width can fall below 1 pixel
        if w <= 0:
            continue

        img_rec = img_rec.resize((w, 32), Image.BILINEAR)
        width_list.append(w)

        # FIXME: enhance image contrast to improve recognition
        img_in = np.array(img_rec).T
        img_out = np.zeros(img_in.shape, np.uint8)
        cv2.normalize(img_in, img_out, 255, 0, cv2.NORM_MINMAX, cv2.CV_8U)

        # FIXME: invert black/white so the text is black on a white background
        # TODO: invert based on comparing the black-pixel area
        # TODO: could try extracting the image's foreground color
        # black = 0
        # for m in range(32):
        #    for n in range(64 if w >= 64 else w):
        #        if img_out[m, n] < 100 :
        #            black += 1
        # if black > (32*(64 if w >= 64 else w)/2):
        #    img_out = 255 - img_out

        # TODO: invert based on comparing the lines along the top and left edges
        black = 0
        for m in range(32):
            if img_out[0, m] < 100:
                black += 1
        for n in range(64 if w >= 64 else w):
            if img_out[n, 0] < 100:
                black += 1
        if black > (32 + (64 if w >= 64 else w)) // 2:
            img_out = 255 - img_out

        # TODO: binarize by extracting the black text (results were poor)
        # for i in range(32):
        #     for j in range(w):
        #         if not (img_out[i, j] < 50):
        #             img_out[i, j] = 255
        #
        # ret, img_out = cv2.threshold(img_out, 180, 255, cv2.THRESH_BINARY)

        img_rec = img_out.astype(np.float32) / 255.0 - 0.5  # img_rec is array
        img_list.append(img_rec)

    width_max = max(width_list)
    X = np.zeros((len(width_list), width_max, 32, 1), dtype=np.float32)

    for i in range(len(width_list)):
        img_pad = np.zeros((width_max - width_list[i], 32), np.float32) + 0.5
        img_rec = np.concatenate((img_list[i], img_pad), axis=0)
        X[i] = np.expand_dims(img_rec, axis=2)

        # FIXME: save the cropped images
        if img_name is not None:
            img_out = (img_rec + 0.5) * 255
            img_sa = Image.fromarray(img_out.T.astype(np.int32))
            img_sa.convert('L').save(root_recs + '/' + img_name +
                                     '_%d_.jpg' % i)

    y_pred = model.predict(X)

    out = K.get_value(
        K.ctc_decode(y_pred,
                     input_length=np.ones(y_pred.shape[0]) *
                     y_pred.shape[1])[0][0])

    for i in range(len(out)):
        out_s = u''.join([char[x] for x in out[i] if x != -1])
        # texts_str += (out_s)
        texts.append(out_s)

    # return texts_str
    return texts
Example #4
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''

        #data = self.data
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = data.GetFrequencyFeature(wavsignal, fs)
        input_length = len(data_input)
        input_length = input_length // 4

        data_input = np.array(data_input, dtype=np.float32)
        in_len = np.zeros((1), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = input_length

        batch_size = 1
        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        #input_length = tf.squeeze(input_length)

        #decode_pred = self.model_decode(x=[x_in, in_len])
        #print(decode_pred)
        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=64,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = data.list_symbol  # get the pinyin list
        #arr_zero = np.zeros((1, 200), dtype=np.int16)  # a row vector of all zeros

        #import matplotlib.pyplot as plt
        #plt.subplot(111)
        #plt.imshow(data_input, cmap=plt.get_cmap('gray'))
        #plt.show()

        #while(len(data_input)<1600):  # pad to 1600 when too short
        #	data_input = np.row_stack((data_input,arr_zero))
        #print(len(data_input))

        #list_symbol = data.list_symbol  # get the pinyin list

        #labels = [ list_symbol[0] ]
        #while(len(labels) < 64):
        #	labels.append('')

        #labels_num = []
        #for i in labels:
        #	labels_num.append(data.SymbolToNum(i))

        #data_input = np.array(data_input, dtype=np.int16)
        #data_input = data_input.reshape(data_input.shape[0],data_input.shape[1])

        #labels_num = np.array(labels_num, dtype=np.int16)
        #labels_num = labels_num.reshape(labels_num.shape[0])

        #input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
        #input_length = np.array(input_length)
        #input_length = input_length.reshape(input_length.shape[0])

        #label_length = np.array([labels_num.shape[0]], dtype=np.int16)
        #label_length = np.array(label_length)
        #label_length = label_length.reshape(label_length.shape[0])

        #x = [data_input, labels_num, input_length, label_length]
        #x = next(data.data_genetator(1, self.AUDIO_LENGTH))
        #x = kr.utils.np_utils.to_categorical(x)

        #print(x)
        #x=np.array(x)

        #pred = self._model.predict(x=x)
        #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
        #return [labels,pred]
        return r1
        pass
Example #5
                                      train_data_labels_=train_data_labels,
                                      reshape_=False),
                            shuffle=False,
                            steps_per_epoch=data_size)
        model.save(
            "/home/tatras/Desktop/github-general/cmu-deep-learning-2018/"
            "hw3/models/2_layer_lstm_ctc_epoch_{}".format(_))


def testing_():
    # Load the dev set and run it through the trained model
    test_data_raw = np.load("/home/kiriteegak/Desktop/github-general/"
                            "cmu-deep-learning-2018/hw3/data/dev.npy")
    sizes = np.apply_along_axis(len, 0, test_data_raw)
    test_data_raw = np.apply_along_axis(np.expand_dims, 0, test_data_raw, 1)
    model = load_model(
        "/home/kiriteegak/Desktop/github-general/cmu-deep-learning-2018/"
        "hw3/models/2_layer_lstm_ctc_epoch_0",
        custom_objects={'tf': tf})
    print("here")
    model_changed = change_network_architecture(model)
    return model_changed.predict(x=test_data_raw), sizes


if __name__ == '__main__':
    test_data_labels = np.load(
        "/home/kiriteegak/Desktop/github-general/"
        "cmu-deep-learning-2018/hw3/data/dev_phonemes.npy")
    outputs, lengths_ = testing_()
    print(K.ctc_decode(outputs, lengths_, greedy=False))
Example #6
import time




start = time.perf_counter()
X_test_1 = np.zeros((1, width1, height1, 3), dtype=np.uint8)
X_test_2 = np.zeros((1, width2, height2, 3), dtype=np.uint8)
file = codecs.open("test1.txt","a","utf-8")
for i in range(0,100000):
    result=""
    X_test_1[0] = cv2.resize(cv2.imread('test/'+str(i)+'_1.png'), (width1, height1), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_1 = model1.predict(X_test_1)
    y_pred_1 = y_pred_1[:,2:,:]
    out1 = K.get_value(K.ctc_decode(y_pred_1, input_length=np.ones(y_pred_1.shape[0])*y_pred_1.shape[1], )[0][0])[:, :30]
    out1 = ''.join([characters[x] for x in out1[0]])
    result += out1 +";"
    if os.path.isfile('test/'+str(i)+'_2.png'):
        X_test_1[0] = cv2.resize(cv2.imread('test/'+str(i)+'_2.png'), (width1, height1), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
        y_pred_1 = model1.predict(X_test_1)
        y_pred_1 = y_pred_1[:,2:,:]
        out1 = K.get_value(K.ctc_decode(y_pred_1, input_length=np.ones(y_pred_1.shape[0])*y_pred_1.shape[1], )[0][0])[:, :30]
        out1 = ''.join([characters[x] for x in out1[0]])   
        result +=  out1 +";"
            
    X_test_2[0] = cv2.resize(cv2.imread('test/'+str(i)+'_0.png'), (width2, height2), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_2 = model2.predict(X_test_2)
    y_pred_2 = y_pred_2[:,2:,:]
    out2 = K.get_value(K.ctc_decode(y_pred_2, input_length=np.ones(y_pred_2.shape[0])*y_pred_2.shape[1], )[0][0])[:, :30]
    out2 = ''.join([characters2[x] for x in out2[0]])
Example #7
	def __keras_decode(y_pred: np.ndarray, input_lengths: np.ndarray, greedy: bool, beam_width: int, top_paths: int) -> list:
		decoded = k.ctc_decode(y_pred=y_pred, input_length=input_lengths, greedy=greedy, beam_width=beam_width, top_paths=top_paths)
		return [path.eval(session=k.get_session()) for path in decoded[0]]
Example #8
def call(self, y_pred):
    top_k_decoded, logs = K.ctc_decode(y_pred,
                                       K.reshape(self.input_length, (-1, )),
                                       greedy=True)
    return K.reshape(top_k_decoded, (-1, 1))
Example #9
def ctc_pred(model, x, batch_size, input_len):
    pred = model.predict(x, batch_size=batch_size)
    input_len = K.constant([input_len] * len(pred), dtype="int32")
    decoded = K.ctc_decode(pred, input_len, greedy=True, beam_width=100, top_paths=1)
    return K.get_value(decoded[0][0])
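# A usage sketch for ctc_pred() above; `model` and the shapes are stand-ins.
# input_len is the number of time steps the model emits, one value per sample.
import numpy as np

x = np.zeros((8, 128, 64, 1), dtype="float32")   # dummy batch
time_steps = model.output_shape[1]               # CTC input length per sample
labels = ctc_pred(model, x, batch_size=8, input_len=time_steps)
# labels: (8, max_decoded_len) int array, padded with -1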
Example #10
                                       sample_weight=sample_weight[i:i +
                                                                   batch_size])

        total_ctcloss += ctcloss * inputs_train["the_input"].shape[0] * 1.
    loss_train[epoch] = total_ctcloss / X_train.shape[0]

    inputs_train = {
        'the_input': X_train,
        'the_labels': y_train,
        'input_length': np.sum(X_train_mask, axis=1, dtype=np.int32),
        'label_length': np.squeeze(y_train_mask),
    }
    outputs_train = {'ctc': np.zeros([y_train.shape[0]])}
    preds = test_func([inputs_train["the_input"]])[0]
    decode_function = K.ctc_decode(preds[:, 2:, :],
                                   inputs_train["input_length"] - 2,
                                   greedy=False,
                                   top_paths=1)
    labellings = decode_function[0][0].eval(session=sess)

    #    print labellings, len(labellings), len(labellings[0]), shape(labellings)
    if labellings.shape[1] == 0:
        ua_train[epoch] = 0.0
        wa_train[epoch] = 0.0
    else:
        ua_train[epoch] = unweighted_accuracy(y_train.ravel(),
                                              labellings.T[0].ravel())
        wa_train[epoch] = weighted_accuracy(y_train.ravel(),
                                            labellings.T[0].ravel())

    inputs_test = {
        'the_input': X_test,
Example #11
                  optimizer=sgd)

    batch, lab, input_len, lab_len = tt.get_batch()

    size_training_set = int(.8 * len(batch))
    print('The training set is of size {}\n'.format(size_training_set))

    [x_train, x_test] = np.split(batch, [size_training_set])
    [y_train, y_test] = np.split(lab, [size_training_set])
    [input_len_train, input_len_test] = np.split(input_len,
                                                 [size_training_set])
    [lab_len_train, lab_len_test] = np.split(lab_len, [size_training_set])

    model.fit([x_train, y_train, input_len_train, lab_len_train],
              [y_train, x_train],
              batch_size=100,
              epochs=1)

    score = model.evaluate([x_test, y_test, input_len_test, lab_len_test],
                           [y_test, x_test])

    print('The final score is {}'.format(score))

batch, lab, input_len, lab_len = tt.get_sound_examples('examples')
out = K.ctc_decode(
    model.predict([batch, lab, input_len, lab_len])[1], input_len)

E = K.eval(out[0][0])
for k in range(len(E)):
    print(tt.int_list_to_text(E[k]))
Example #12
    model.output_length = lambda x: x
    print(model.summary())
    return model


model = bidirectional_rnn_model(
    input_dim=161,  # change to 13 if you would like to use MFCC features
    units=512 + 32)

print('load Model')
model.load_weights('results/model_20.h5')
data_gen = AudioGenerator()
print("Load file")
audio_path = 'output.wav'
data_point = data_gen.normalize(data_gen.featurize(audio_path))

print("Start prediction")

#input_to_softmax.load_weights(model_path)
prediction = model.predict(np.expand_dims(data_point, axis=0), batch_size=1)
output_length = [model.output_length(data_point.shape[0])]
pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
             1).flatten().tolist()

print(prediction)
print(output_length)

print(pred_ints)
print('Predicted transcription:\n' + '\n' +
      ''.join(int_sequence_to_text(pred_ints)))
Example #13
    def __init__(self, learning_rate=0.001):
        conv_filters = 16
        kernel_size = (3, 3)
        pool_size = 2
        time_dense_size = 32
        rnn_size = 512
        img_h = 32
        act = 'relu'

        self.width = K.placeholder(name='width', ndim=0, dtype='int32')
        self.input_data = Input(name='the_input',
                                shape=(None, img_h, 1),
                                dtype='float32')
        self.inner = Conv2D(conv_filters,
                            kernel_size,
                            padding='same',
                            activation=act,
                            kernel_initializer='he_normal',
                            name='conv1')(self.input_data)
        self.inner = MaxPooling2D(pool_size=(pool_size, pool_size),
                                  name='max1')(self.inner)
        self.inner = Conv2D(conv_filters,
                            kernel_size,
                            padding='same',
                            activation=act,
                            kernel_initializer='he_normal',
                            name='conv2')(self.inner)
        self.inner = MaxPooling2D(pool_size=(pool_size, pool_size),
                                  name='max2')(self.inner)

        self.inner = Lambda(self.res,
                            arguments={"last_dim": (img_h // (pool_size ** 2)) * conv_filters,
                                       "width": self.width // 4})(self.inner)

        # cuts down input size going into RNN:
        self.inp = Dense(time_dense_size, activation=act,
                         name='dense1')(self.inner)
        self.batch_norm = keras.layers.normalization.BatchNormalization()(
            self.inp)
        self.gru_1 = Bidirectional(GRU(rnn_size,
                                       return_sequences=True,
                                       kernel_initializer='he_normal',
                                       name='gru1'),
                                   merge_mode="sum")(self.batch_norm)
        self.gru_2 = Bidirectional(GRU(rnn_size,
                                       return_sequences=True,
                                       kernel_initializer='he_normal',
                                       name='gru2'),
                                   merge_mode="concat")(self.gru_1)
        self.y_pred = TimeDistributed(
            Dense(63,
                  kernel_initializer='he_normal',
                  name='dense2',
                  activation='linear'))(self.gru_2)
        self.model = Model(inputs=self.input_data, outputs=self.y_pred)
        self.model.summary()
        self.out = K.function(
            [self.input_data, self.width,
             K.learning_phase()], [self.y_pred])
        self.y_true = K.placeholder(name='y_true', ndim=1, dtype='int32')
        self.input_length = K.placeholder(name='input_length',
                                          ndim=1,
                                          dtype='int32')
        self.label_length = K.placeholder(name='label_length',
                                          ndim=1,
                                          dtype='int32')
        self.loss_out = K.mean(
            warpctc_tensorflow.ctc(tf.transpose(self.y_pred,
                                                perm=[1, 0, 2]), self.y_true,
                                   self.label_length, self.input_length))
        # self.optimizer = keras.optimizers.Adam(lr = learning_rate)
        self.optimizer = keras.optimizers.SGD(lr=learning_rate,
                                              decay=1e-6,
                                              momentum=0.9,
                                              nesterov=True,
                                              clipnorm=200)
        self.update = self.optimizer.get_updates(self.model.trainable_weights,
                                                 [],
                                                 loss=self.loss_out)
        self.network_output = K.ctc_decode(
            Activation('softmax')(self.y_pred), self.input_length, True)[0][0]
        self.train_step = K.function(
            [self.input_data, self.width, self.y_true, self.input_length,
             self.label_length, K.learning_phase()],
            [self.loss_out, self.y_pred], updates=self.update)
        self.test = K.argmax(self.y_pred, axis=2)
        self.predict_step = K.function([
            self.input_data, self.width, self.input_length,
            K.learning_phase()
        ], [self.network_output])
Example #14
def ctc_accuracy(y_true, y_pred, max_len=MAX_LEN):
    labels = y_true[:, 2:]
    input_length = y_true[:, 0]
    decoded = K.ctc_decode(y_pred, input_length)[0][0]
    cmp = K.cast(K.equal(labels, decoded), dtype='float32')
    return K.cast(K.equal(K.sum(cmp, axis=-1), max_len), dtype='float32')
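# A sketch of the y_true layout that ctc_accuracy() above assumes (inferred
# from the slicing, not documented in the source): column 0 carries the CTC
# input length, columns 2 onward carry the padded ground-truth labels.
import numpy as np

batch, time_steps = 4, 50
y_true = np.zeros((batch, MAX_LEN + 2), dtype='float32')
y_true[:, 0] = time_steps                                   # input_length for ctc_decode
y_true[:, 2:] = np.random.randint(0, 10, (batch, MAX_LEN))  # dummy label ids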
Example #15
    def evaluate2(self, ltm_images_ph, tcng, sess):
        db = self.db
        keys = list(db.keys())

        ler_dic = {}
        tler = 0.0

        for idx in range(len(keys)):
            if idx > 40000:
                break

            bnk = keys[idx].split('/')[-1].split('_')[-1].split('.')[0]
            if bnk not in list(ler_dic.keys()):
                ler_dic[bnk] = []

            image = cv2.imread(db[keys[idx]][3], 0)
            org_shape = image.shape

            add_to_bottom = int(self.hl - org_shape[0])
            add_to_right = int(self.wl - org_shape[1])

            if org_shape[0] > self.hl or org_shape[1] > self.wl:
                raise Exception("height or width is bigger than " +
                                str(self.hl) + " x " + str(self.wl) + " " +
                                org_shape)

            padded_image = cv2.copyMakeBorder(image, 0, add_to_bottom, 0,
                                              add_to_right,
                                              cv2.BORDER_CONSTANT, 0)
            padded_image = np.array(
                padded_image.reshape(1, self.hl, self.wl, 1))

            ls = np.array(
                sorted([int(line) for line in db[keys[idx]][2].split('-')
                        ])).reshape(-1, 3)
            height = np.array(org_shape[0]).reshape(-1, 1)
            width = np.array(org_shape[1]).reshape(-1, 1)

            label, seq_len = self.label_processor(db[keys[idx]][0])
            label = np.array(label)
            seq_len = np.array(seq_len).reshape(-1, 1)

            if True:
                image = np.concatenate([padded_image, padded_image], axis=0)
                height = np.concatenate([height, height], axis=0)
                width = np.concatenate([width, width], axis=0)
                ls = np.concatenate([ls, ls], axis=0)

            ltm_images, l_true = ltm_img_processor(image,
                                                   height,
                                                   width,
                                                   ls,
                                                   double=False)

            y_pred = sess.run(
                [tcng.fc_2],
                feed_dict={
                    ltm_images_ph: ltm_images,
                    tcng.images_ph: image,
                    tcng.heights_ph: height,
                    tcng.widths_ph: width
                })

            y_pred = y_pred[0]
            shape = y_pred[:, 2:, :].shape
            ctc_decode = bknd.ctc_decode(y_pred[:, 2:, :],
                                         input_length=np.ones(shape[0]) *
                                         shape[1])[0][0]
            out = bknd.get_value(ctc_decode)[:, :self.maxL]

            ler = compare1(out, label, self.Ivoc, show=2)
            ler_dic[bnk].append(float(ler))
            tler += ler

            logging.debug("processed %i out of %i", idx, len(keys))

        for bnk in list(ler_dic.keys()):
            ler_dic[bnk] = np.mean(ler_dic[bnk])
            logging.info("ler for bank %i is %f", int(bnk), ler_dic[bnk])
        return tler / len(keys)
Example #16
    def Predict(self, batch_size, data_input, in_len):
        '''
        Predict the result.
        Returns the list of pinyin symbols from speech recognition.
        '''
        batch_size = 1

        in_len = np.zeros((batch_size), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = in_len[0] - 2

        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        y_p = base_pred
        print('base_pred0:\n', base_pred[0][0].shape)

        #for j in range(200):
        #	mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
        #	print('max y_p:',np.max(y_p[0][j]),'min y_p:',np.min(y_p[0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][j][100])
        #	print('argmin:',np.argmin(y_p[0][j]),'argmax:',np.argmax(y_p[0][j]))
        #	count=0
        #	for i in range(y_p[0][j].shape[0]):
        #		if(y_p[0][j][i] < mean):
        #			count += 1
        #	print('count:',count)

        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=100,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = GetSymbolList(self.datapath)  # get the pinyin list

        r1 = r1[0]

        r_str = []
        for i in r1:
            r_str.append(list_symbol_dic[i])

        #print(r_str)

        return r_str
        pass
Example #17
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''

        #data = self.data
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = data.GetFrequencyFeature(wavsignal, fs)

        list_symbol_dic = data.list_symbol  # get the pinyin list

        labels = [
            'dong1', 'bei3', 'jun1', 'de5', 'yi4', 'xie1', 'ai4', 'guo2',
            'jiang4', 'shi4', 'ma3', 'zhan4', 'shan1', 'li3', 'du4', 'tang2',
            'ju4', 'wu3', 'su1', 'bing3', 'ai4', 'deng4', 'tie3', 'mei2',
            'deng3', 'ye3', 'fen4', 'qi3', 'kang4', 'zhan4'
        ]
        #labels = [ list_symbol_dic[-1] ]
        #labels = [ list_symbol_dic[-1] ]
        #while(len(labels) < 32):
        #	labels.append(list_symbol_dic[-1])

        feat_out = []
        #print("数据编号",n_start,filename)
        for i in labels:
            if ('' != i):
                n = data.SymbolToNum(i)
                feat_out.append(n)

        print(feat_out)
        labels = feat_out

        x = next(
            self.data_gen(data_input=np.array(data_input),
                          data_labels=np.array(labels),
                          input_length=len(data_input),
                          labels_length=len(labels),
                          batch_size=2))

        [test_input_data, y, test_input_length, label_length], labels = x
        xx = [test_input_data, y, test_input_length, label_length]

        pred = self._model.predict(x=xx)

        print(pred)

        shape = pred[:, :].shape
        print(shape)

        #print(test_input_data)
        y_p = self.test_func([test_input_data])
        print(type(y_p))
        print('y_p:', y_p)

        for j in range(0, 200):
            mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
            print('max y_p:', max(y_p[0][0][j]), 'min y_p:', min(y_p[0][0][j]),
                  'mean y_p:', mean, 'mid y_p:', y_p[0][0][j][100])
            print('argmin:', np.argmin(y_p[0][0][j]), 'argmax:',
                  np.argmax(y_p[0][0][j]))
            count = 0
            for i in y_p[0][0][j]:
                if (i < mean):
                    count += 1
            print('count:', count)

        print(K.is_sparse(y_p))
        y_p = K.to_dense(y_p)
        print(K.is_sparse(y_p))
        #y_p = tf.sparse_to_dense(y_p,(2,397),1417,0)
        print(test_input_length.T)
        test_input_length = test_input_length.reshape(2, 1)
        func_in_len = self.test_func_input_length([test_input_length])
        print(type(func_in_len))
        #in_len = np.ones(shape[0]) * shape[1]
        ctc_decoded = K.ctc_decode(y_p, input_length=func_in_len)

        print(ctc_decoded)
        #ctc_decoded = ctc_decoded[0][0]
        #out = K.get_value(ctc_decoded)[:,:64]
        #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
        return pred[0][0]

        pass
Example #18
# As the model predicts a probability for each class at each time step, we need a transcription function to turn those probabilities into actual text. Here we use the CTC decoder to get the output text. Let's see the code:



# load the saved best model weights
act_model.load_weights('best_model.hdf5')
 
num_val = 15000
# predict outputs on validation images
prediction = act_model.predict(valid_img[:num_val])
 
valid_img = np.array(valid_img)

# use CTC decoder
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1],
                         greedy=True)[0][0])
#print(out)
out_pred = ''
counter = 0

# see the results
i = 0
for x in out:
    print("original_text =  ", valid_orig_txt[i])
    print("predicted text = ", end = '')
    for p in x:  
        if int(p) != -1:
            c = char_list[int(p)]
            print(char_list[int(p)], end = '')
            out_pred= out_pred + c
    if valid_orig_txt[i] == out_pred:
Example #19
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''
        #data = self.data
        #data = DataSpeech('E:\\语音数据集')
        #data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = GetFrequencyFeature(wavsignal, fs)
        input_length = len(data_input)
        input_length = input_length // 4

        data_input = np.array(data_input, dtype=np.float32)
        in_len = np.zeros((1), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = input_length - 2

        batch_size = 1
        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        y_p = base_pred
        print('base_pred0:\n', base_pred[0][0].shape)

        for j in range(200):
            mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
            print('max y_p:', np.max(y_p[0][j]), 'min y_p:', np.min(y_p[0][j]),
                  'mean y_p:', mean, 'mid y_p:', y_p[0][j][100])
            print('argmin:', np.argmin(y_p[0][j]), 'argmax:',
                  np.argmax(y_p[0][j]))
            count = 0
            for i in range(y_p[0][j].shape[0]):
                if (y_p[0][j][i] < mean):
                    count += 1
            print('count:', count)
        #for j in range(0,200):
        #	mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
        #	print('max y_p:',max(y_p[0][0][j]),'min y_p:',min(y_p[0][0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][0][j][100])
        #	print('argmin:',np.argmin(y_p[0][0][j]),'argmax:',np.argmax(y_p[0][0][j]))
        #	count=0
        #	for i in y_p[0][0][j]:
        #		if(i < mean):
        #			count += 1
        #	print('count:',count)
        #decoded_sequences = self.decoder([base_pred, in_len])

        #print('decoded_sequences:\n', decoded_sequences)
        #input_length = tf.squeeze(input_length)

        #decode_pred = self.model_decode(x=[x_in, in_len])
        #print(decode_pred)
        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=100,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = GetSymbolList(self.datapath)  # get the pinyin list

        r1 = r1[0]

        r_str = []
        for i in r1:
            r_str.append(list_symbol_dic[i])

        #print(r_str)

        return r_str

        pass
Example #20
if opts.printmodel:
    plot_model(model, to_file="model.png", show_shapes=True)
    Image('model.png')

if not opts.testing:
    model.fit_generator(gen(opts.batch_size), steps_per_epoch=opts.steps, epochs=opts.epochs,
            callbacks=[EarlyStopping(patience=10), evaluator],
            validation_data=gen(), validation_steps=1280)
else:
    print("testing......")
    characters2 = characters + ' '
    [X_test, y_test, _, _], _  = next(gen(1))
    #cv2.imwrite("./save_image/test.jpg" , X_test)
    y_pred = base_model.predict(X_test)
    y_pred = y_pred[:,2:,:]
    out = K.get_value(K.ctc_decode(y_pred, input_length=np.ones(y_pred.shape[0])*y_pred.shape[1], )[0][0])[:, :7]
    out = ''.join([characters[x] for x in out[0]])
    y_true = ''.join([characters[x] for x in y_test[0]])
    print(out)
    print(y_true)

if opts.modelname is None and not opts.testing:
    run_name = datetime.datetime.now().strftime('%Y:%m:%d:%H:%M:%S')
    model.save(run_name+".h5")
    base_model.save("base_"+run_name+".h5")
elif opts.testing:
    print("Please input testing model name")
else:
    model.save(opts.modelname)
    base_model.save("base_"+opts.modelname)
del model
Example #21
def _dft_ctc_decode(y_pred, input_length, beam_width=100):
    assert False, "fixme"
    sm_y_pred = K.softmax(y_pred)
    return K.ctc_decode(
        sm_y_pred, K.flatten(input_length),
        beam_width=beam_width, greedy=False, top_paths=1)[0][0]
Example #22
def predict(wavs):
    # print("pppppppppppppppppp")
    # initialize the speech engine
    # speaker = win32com.client.Dispatch("SAPI.SpVoice")

    # my_record()

    # wavs = glob.glob('.//test_data/voice_test.wav')
    # wavs = ['/data/user/0/com.example.chaquopytest/files/chaquopy/AssetFinder/app/sjbf_speech2.wav']
    # print(wavs)
    a = join(dirname(__file__), 'asr_video_enhance_2.h5')
    print(type(a))
    graph = tf.compat.v1.get_default_graph()
    session = tf.compat.v1.Session()
    with graph.as_default():
        with session.as_default():
            model = load_model(join(dirname(__file__), 'asr_video_enhance_2.h5'))
    # model = load_model(join(dirname(__file__), 'asr_video_enhance_2.h5'))
    # load_model('/data/user/0/com.fangte.yjy.speechrecogni/files/chaquopy/AssetFinder/app/asr_video_enhance_2.h5')
    pk = join(dirname(__file__), 'dictionary_video_enhance_2.pkl')
    with open(pk, 'rb') as fr:
        [_, id2char, mfcc_mean, mfcc_std] = pickle.load(fr)
    #     # char2id = pd.DataFrame(char2id.items(), columns=['name', 'index'])
    #     # print(char2id)
    # wavs = join(dirname(__file__), l)
    # wavs = []
    # wavs.append(l)
    # mfcc_mean = np.array([-5.54817, 10.18685, -16.97834, 19.95623, -24.71567, 1.91108, -17.68871, 2.04288, -17.55804,
    #                       0.20271, -9.62210, -5.43127, -1.53957])
    # mfcc_std = np.array([4.11379, 16.58478, 15.80970, 18.87008, 18.04815, 21.30934, 19.47388, 18.76543, 16.85591,
    #                      16.07542, 13.90712, 13.12571, 12.20504])
    # id2char = {0: '倍', 1: '速', 2: '快', 3: '播', 4: '放', 5: '一', 6: '个', 7: '慢', 8: '0', 9: '.', 10: '5',
    #            11: '2', 12: '停', 13: '4', 14: '随', 15: '机', 16: '顺', 17: '序', 18: '上', 19: '1', 20: '进',
    #            21: '下', 22: '暂', 23: '开', 24: '始', 25: '止', 26: '退', 27: '循', 28: '环'
    #            }
    mfcc_dim = 13
    # index = np.random.randint(len(wavs))
    # print(wavs[index])
    # audio, sr = librosa.load(wavs[index])
    print(wavs)
    audio, sr = librosa.load(wavs)
    energy = librosa.feature.rms(audio)
    frames = np.nonzero(energy >= np.max(energy) / 5)
    indices = librosa.core.frames_to_samples(frames)[1]
    audio = audio[indices[0]:indices[-1]] if indices.size else audio[0:0]
    X_data = mfcc(audio, sr, numcep=mfcc_dim, nfft=551)
    X_data = (X_data - mfcc_mean) / (mfcc_std + 1e-14)
    # print(X_data.shape)
    tf.compat.v1.reset_default_graph()
    with graph.as_default():
        with session.as_default():
            pred = model.predict(np.expand_dims(X_data, axis=0))
    # pred = model.predict(np.expand_dims(X_data, axis=0))
    pred_ids = K.eval(K.ctc_decode(pred, [X_data.shape[0]], greedy=False, beam_width=10, top_paths=1)[0][0])
    pred_ids = pred_ids.flatten().tolist()
    text = ''.join([id2char[i] for i in pred_ids])
    # print(''.join([id2char[i] for i in pred_ids]))
    print(text)
    return text

# if __name__ == '__main__':
#     result = predict()
    # print(result)
Example #23
        batch_num = 1  # 264
        batch_acc = 0
        true_acc = 0
        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
        print(st)
        print(datetime.datetime.now())
        for i in range(batch_num):
            # [X_test, y_test, _, _], _  = next(generator)
            print(X_test[i])
            y_pred = sess.run(y, feed_dict={
                x: X_test[i][np.newaxis, :]
            })
            shape = y_pred[:, 2:, :].shape
            out = K.get_value(K.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :8]
            # if out.shape[1] == 8:
            # batch_acc += ((y_test[i] == out).sum(axis=1) == 8).mean()
            # argmax = np.argmax(y_pred, axis=2)[0]
            out = ''.join([characters[x] for x in out[0]]).replace(' ', '')
            y_true = ''.join([characters[x] for x in y_test[i]]).replace(' ', '')
            if out == y_true:
                true_acc += 1

            """
            else:
                print(out)
                print(y_true)
                print("-----------")
            """
        # print(true_acc / batch_num*100)
Example #24
def ctc_decode(softmax):
    return K.ctc_decode(
        softmax, K.tile([K.shape(softmax)[1]], [K.shape(softmax)[0]]))[0]
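# The K.tile call above builds the input-length vector symbolically: it repeats
# the time-axis size once per batch element, the graph-mode equivalent of
# np.full(batch_size, timesteps). The same decode with concrete values
# (shapes invented):
import numpy as np
from keras import backend as K

softmax_demo = np.random.rand(3, 20, 7).astype('float32')   # (batch, timesteps, classes)
lengths_demo = np.full(softmax_demo.shape[0], softmax_demo.shape[1])
decoded_demo = K.ctc_decode(softmax_demo, lengths_demo)[0]  # list of decoded tensors
print(K.get_value(decoded_demo[0]))                         # (3, max_len), -1-padded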
Example #25
        mat_ori = np.zeros(
            (height, width - int(31.0 / img_size[0] * img_size[1]), 3),
            dtype=np.uint8)
        out_img = np.concatenate([img_reshape, mat_ori],
                                 axis=1).transpose([1, 0, 2])
    else:
        out_img = cv2.resize(img, (width, height),
                             interpolation=cv2.INTER_CUBIC)
        out_img = np.asarray(out_img).transpose([1, 0, 2])

    img_list[ii] = np.asarray(out_img)
    ii += 1

model = load_model('PATH_TO_WEIGHT_FILE')
# If you want to load a model with STN, use:
# model = load_model('PATH_TO_WEIGHT_FILE', custom_objects={'SpatialTransformer': SpatialTransformer})

y_pred = model.predict(img_list)
shape = y_pred[:, 2:, :].shape
ctc_decode = bknd.ctc_decode(y_pred[:, 2:, :],
                             input_length=np.ones(shape[0]) * shape[1])[0][0]
out = bknd.get_value(ctc_decode)[:, :label_len]

out_list = []
for m in range(len(fileList)):
    result_str = ''.join([characters[k] for k in out[m]])
    out_list.append(result_str)

print(out_list)
Example #26
def ctc_decode(pred):
    c = K.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1], greedy=False, beam_width=10)[0][0]
    print(c)
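# Note that `c` above is still a symbolic tensor, so the print shows a Tensor
# rather than label ids. A sketch of a variant that materializes the ids
# (same decode call, wrapped in K.get_value):
import numpy as np
from keras import backend as K

def ctc_decode_values(pred):
    c = K.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1],
                     greedy=False, beam_width=10)[0][0]
    return K.get_value(c)  # (batch, max_len) int array, padded with -1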
Example #27
    },
                        optimizer=Adam(lr=0.0001))

    model_final.fit(
        x=[train_x, train_y, train_input_len, train_label_len],
        y=train_output,
        validation_data=([valid_x, valid_y, valid_input_len,
                          valid_label_len], valid_output),
        epochs=60,
        batch_size=128)

    # Check model performance on the validation set
    preds = model.predict(valid_x)
    decoded = K.get_value(
        K.ctc_decode(preds,
                     input_length=np.ones(preds.shape[0]) * preds.shape[1],
                     greedy=True)[0][0])

    prediction = []
    for i in range(valid_size):
        prediction.append(num_to_label(decoded[i]))

    y_true = validation_written_df.loc[0:valid_size, 'IDENTITY']
    correct_char = 0
    total_char = 0
    correct = 0

    for i in range(valid_size):
        pr = prediction[i]
        tr = y_true[i]
        total_char += len(tr)
Example #28
    print("predicting for:" + pathAndFilename)
    # predict outputs on validation images
    # img = Image.open(pathAndFilename)
    # img = img.resize((128, 32), Image.BICUBIC)

    # img = np.array(img) /255;
    # img = np.sum(img, axis=2,keepdims=True)
    img, _, _, _ = process_data(pathAndFilename, "1_1")
    img = img / 255.
    img = np.expand_dims(img, axis=0)
    prediction = act_model.predict(img)

    # use CTC decoder
    out = K.get_value(
        K.ctc_decode(prediction,
                     input_length=np.ones(prediction.shape[0]) *
                     prediction.shape[1],
                     greedy=False)[0][0])
    head, tail = ntpath.split(pathAndFilename)
    txt = tail.split('_')[1]
    # see the results
    i = 0
    le = min(10, out.shape[1])
    print(out.shape)
    for x in out:
        print(txt)
        for p in range(0, le):
            if int(x[p]) != -1:
                print(char_list[int(x[p])], end='')
        print('\n')
        i += 1
Example #29
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, len(split_true) - 1):
            if split_true[i - 1] == split_pred[i] or split_true[
                    i] == split_pred[i] or split_true[i + 1] == split_pred[i]:
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[len(split_true) - 1] + " ", end='')
    split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
    split_true = transcr.split(" ")
    displayAccuracy(split_true, split_pred, phn)
Example #30
    # print(np.shape(X))
    X = np.transpose(X, (0, 2, 3, 1))
    X = np.array(X)
    Y = np.array(Y)
    return X, Y


# the actual loss calc occurs here despite it not being
# an internal Keras loss function
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    # y_pred = y_pred[:, 2:, :]  # testing suggested it makes no difference
    y_pred = y_pred[:, :, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


if __name__ == '__main__':
    height = 150
    width = 50
    input_tensor = Input((height, width, 1))
    x = input_tensor
    for i in range(3):
        x = Convolution2D(32 * 2 ** i, (3, 3), activation='relu', padding='same')(x)
        # x = Convolution2D(32*2**i, (3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
    conv_shape = x.get_shape()
    # print(conv_shape)
    x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2] * conv_shape[3])))(x)
    x = Dense(32, activation='relu')(x)

    gru_1 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru1')(x)
    gru_1b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(x)
    gru1_merged = add([gru_1, gru_1b])

    gru_2 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
    x = concatenate([gru_2, gru_2b])

    x = Dropout(0.25)(x)
    x = Dense(label_count, kernel_initializer='he_normal', activation='softmax')(x)
    base_model = Model(inputs=input_tensor, outputs=x)

    labels = Input(name='the_labels', shape=[seq_len], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([x, labels, input_length, label_length])
    model = Model(inputs=[input_tensor, labels, input_length, label_length], outputs=[loss_out])
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adadelta')
    model.summary()

    def test(base_model):
        file_list = []
        X, Y = gen_image_data(r'data\test', file_list)
        y_pred = base_model.predict(X)
        shape = y_pred[:, :, :].shape  # 2:
        out = K.get_value(K.ctc_decode(y_pred[:, :, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :seq_len]  # 2:
        print()
        error_count = 0
        for i in range(len(X)):
            print(file_list[i])
            str_src = str(os.path.split(file_list[i])[-1]).split('.')[0].split('_')[-1]
            print(out[i])
            str_out = ''.join([str(x) for x in out[i] if x != -1])
            print(str_src, str_out)
            if str_src != str_out:
                error_count += 1
                print('################################', error_count)
            # img = cv2.imread(file_list[i])
            # cv2.imshow('image', img)
            # cv2.waitKey()

    class LossHistory(Callback):
        def on_train_begin(self, logs={}):
            self.losses = []

        def on_epoch_end(self, epoch, logs=None):
            model.save_weights('model_1018.w')
            base_model.save_weights('base_model_1018.w')
            test(base_model)

        def on_batch_end(self, batch, logs={}):
            self.losses.append(logs.get('loss'))

    # checkpointer = ModelCheckpoint(filepath="keras_seq2seq_1018.hdf5", verbose=1, save_best_only=True, )
    history = LossHistory()
    # base_model.load_weights('base_model_1018.w')
    # model.load_weights('model_1018.w')

    X, Y = gen_image_data()
    maxin = 4900
    subseq_size = 100
    batch_size = 10
    result = model.fit([X[:maxin], Y[:maxin],
                        np.array(np.ones(len(X)) * int(conv_shape[1]))[:maxin],
                        np.array(np.ones(len(X)) * seq_len)[:maxin]],
                       Y[:maxin],
                       batch_size=20,
                       epochs=1000,
                       callbacks=[history, plotter, EarlyStopping(patience=10)],  # checkpointer, history,
Example #31
def predict(self, X):
    y_pred = self.model.predict(X)
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    predicts = K.eval(K.ctc_decode(y_pred, input_length)[0][0])
    return predicts
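# The decoded matrix above is padded with -1. A common follow-up (sketch;
# `characters` is an assumed id-to-character lookup table, not part of the
# example) maps each row back to a string while skipping the padding:
def ids_to_text(decoded_row, characters):
    return ''.join(characters[int(i)] for i in decoded_row if i != -1)

# e.g.: texts = [ids_to_text(row, characters) for row in predicts]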