Example #1
        print('acc: %f%%' % acc)


evaluator = Evaluate()
# model.fit_generator(gen(64), samples_per_epoch=512, nb_epoch=15,
#                     callbacks=[evaluator],
#                     )

# Test the model
characters2 = characters + ' '
[X_test, y_test, _, _], _ = next(gen(1))
y_pred = base_model.predict(X_test)
y_pred = y_pred[:, 2:, :]
out = K.get_value(
    K.ctc_decode(
        y_pred,
        input_length=np.ones(y_pred.shape[0]) * y_pred.shape[1],
    )[0][0])[:, :n_len]
# out = ''.join([characters[x] for x in out[0]])
# y_true = ''.join([characters[x] for x in y_test[0]])
#
# import pylab
# plt.imshow(X_test[0].transpose(1, 0, 2))
# plt.title('pred:' + str(out) + '\ntrue: ' + str(y_true))
# pylab.show()

argmax = np.argmax(y_pred, axis=2)[0]
print(list(zip(argmax, ''.join([characters2[x] for x in argmax]))))

# Compute the model's overall accuracy
print(evaluate(base_model))
model.save('model.h5')
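# For orientation across the examples below: a minimal sketch (shapes invented,
# not taken from any example) of what K.ctc_decode returns -- a pair of
# (list of decoded label tensors, log-probability tensor), where each decoded
# row is padded with -1.
import numpy as np
from keras import backend as K

y_pred_demo = np.random.rand(2, 16, 10).astype('float32')               # (batch, timesteps, classes)
input_length_demo = np.full(y_pred_demo.shape[0], y_pred_demo.shape[1]) # decode the full time axis
decoded_list, log_probs = K.ctc_decode(y_pred_demo, input_length_demo, greedy=True)
best = K.get_value(decoded_list[0])                                     # (batch, max_len), -1-padded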
Example #2
def create_model(params, gpu=False, two_rnns=False):

    input_data = Input(name="input",
                       shape=params["input_shape"],
                       dtype="float32")
    conv1 = Conv2D(
        params["conv_filters"],
        params["kernel_size"],
        padding="same",
        activation=params["act"],
        kernel_initializer="he_normal",
        name="conv1",
    )(input_data)
    conv1 = MaxPooling2D(pool_size=(params["pool_size"], params["pool_size"]),
                         name="max1")(conv1)
    conv1 = Dropout(0.2)(conv1)
    conv2 = Conv2D(
        params["conv_filters"],
        params["kernel_size"],
        padding="same",
        activation=params["act"],
        kernel_initializer="he_normal",
        name="conv2",
    )(conv1)
    conv2 = MaxPooling2D(pool_size=(params["pool_size"], params["pool_size"]),
                         name="max2")(conv2)
    conv2 = Dropout(0.2)(conv2)

    # conv1shape = (img_w // (pool_size ** (num_convs - 1)),
    #                     (img_h // (pool_size ** (num_convs - 1))) * conv_filters)
    conv2shape = (
        params["img_w"] // (params["pool_size"]**params["num_convs"]),
        (params["img_h"] // (params["pool_size"]**params["num_convs"])) *
        params["conv_filters"],
    )

    # Failed attempt to do a skip connection

    # conv1 = Reshape(target_shape=conv1shape)(conv1)
    # conv2 = Reshape(target_shape=conv2shape)(conv2)
    # inner = concatenate([conv1, conv2], axis=2)

    inner = Reshape(target_shape=conv2shape, name="reshape")(conv2)

    # cuts down input size going into RNN:
    inner = Dense(params["time_dense_size"],
                  activation=params["act"],
                  name="dense1")(inner)

    if gpu:
        gru_1 = CuDNNGRU(
            params["rnn1_size"],
            return_sequences=True,
            kernel_initializer="he_normal",
            name="gru1",
        )(inner)
        gru_1b = CuDNNGRU(
            params["rnn1_size"],
            return_sequences=True,
            go_backwards=True,
            kernel_initializer="he_normal",
            name="gru1_b",
        )(inner)
    else:
        gru_1 = GRU(
            params["rnn1_size"],
            return_sequences=True,
            kernel_initializer="he_normal",
            name="gru1",
            reset_after=True,
            recurrent_activation="sigmoid",
        )(inner)
        gru_1b = GRU(
            params["rnn1_size"],
            return_sequences=True,
            go_backwards=True,
            kernel_initializer="he_normal",
            name="gru1_b",
            reset_after=True,
            recurrent_activation="sigmoid",
        )(inner)

    gru1_merged = add([gru_1, gru_1b])
    if two_rnns:
        if gpu:
            gru_2 = CuDNNGRU(
                params["rnn2_size"],
                return_sequences=True,
                kernel_initializer="he_normal",
                name="gru2",
            )(gru1_merged)
            gru_2b = CuDNNGRU(
                params["rnn2_size"],
                return_sequences=True,
                go_backwards=True,
                kernel_initializer="he_normal",
                name="gru2_b",
            )(gru1_merged)
        else:
            gru_2 = GRU(
                params["rnn2_size"],
                return_sequences=True,
                kernel_initializer="he_normal",
                name="gru2",
                reset_after=True,
                recurrent_activation="sigmoid",
            )(gru1_merged)
            gru_2b = GRU(
                params["rnn2_size"],
                return_sequences=True,
                go_backwards=True,
                kernel_initializer="he_normal",
                name="gru2_b",
                reset_after=True,
                recurrent_activation="sigmoid",
            )(gru1_merged)

    # transforms RNN output to character activations:
    if two_rnns:
        inner = Dense(params["output_size"],
                      kernel_initializer="he_normal",
                      name="dense2")(concatenate([gru_2, gru_2b]))
    else:
        inner = Dense(params["output_size"],
                      kernel_initializer="he_normal",
                      name="dense2")(gru1_merged)

    y_pred = Activation("softmax", name="softmax")(inner)
    output_labels = Input(name="the_labels",
                          shape=[params["max_string_len"]],
                          dtype="float32")
    input_lengths = Input(name="input_length", shape=[1], dtype="int64")
    label_lengths = Input(name="label_length", shape=[1], dtype="int64")

    # Keras doesn't currently support loss functions with extra parameters,
    # so CTC loss is implemented in a Lambda layer below.
    def ctc_lambda_func(args):
        y_pred, labels, input_length, label_length = args
        # the 2 is critical here since the first couple outputs of the RNN
        # tend to be garbage:
        y_pred = y_pred[:, params["ctc_cut"]:, :]
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

    loss_out = Lambda(ctc_lambda_func, output_shape=(1, ), name="ctc")(
        [y_pred, output_labels, input_lengths, label_lengths])

    train_model = Model(
        inputs=[input_data, output_labels, input_lengths, label_lengths],
        outputs=loss_out)

    top_k_dec_list, _ = K.ctc_decode(
        y_pred[:, params["ctc_cut"]:, :],
        K.squeeze(input_lengths, axis=1),
        greedy=False,
        top_paths=3,
    )
    decoder0 = K.function([input_data, input_lengths], [top_k_dec_list[0]])
    decoder1 = K.function([input_data, input_lengths], [top_k_dec_list[1]])
    decoder2 = K.function([input_data, input_lengths], [top_k_dec_list[2]])
    decoder_models = decoder0, decoder1, decoder2

    return train_model, decoder_models
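# Hypothetical usage sketch for create_model() above. Every value in `params`
# below is an invented placeholder chosen so the shapes line up; none of these
# settings come from the original project.
import numpy as np

params = {
    "input_shape": (128, 32, 1), "conv_filters": 16, "kernel_size": (3, 3),
    "pool_size": 2, "num_convs": 2, "act": "relu", "img_w": 128, "img_h": 32,
    "time_dense_size": 32, "rnn1_size": 256, "rnn2_size": 256,
    "output_size": 63, "max_string_len": 16, "ctc_cut": 2,
}
train_model, (dec_top1, dec_top2, dec_top3) = create_model(params)

X_batch = np.zeros((4,) + params["input_shape"], dtype="float32")   # dummy images
time_steps = params["img_w"] // params["pool_size"] ** params["num_convs"]
lengths = np.full((4, 1), time_steps - params["ctc_cut"])           # steps fed to CTC
best_paths = dec_top1([X_batch, lengths])[0]                        # (batch, max_len), -1-padded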
Example #3
def predict_text(model, recs_all, recs_len, img_all, img_name=None):
    texts = []
    img_list = []
    width_list = []
    img_index = 0

    # FIXME: turn recs_len into cumulative sums of the preceding lengths
    for i in range(len(recs_len)):
        if i > 0:
            recs_len[i] += recs_len[i - 1]

    for i in range(len(recs_all)):
        for j in range(len(recs_len)):
            if i < recs_len[j]:
                img_index = j
                break

        img_rec = dumpRotateImage(img_all[img_index], recs_all[i]).convert('L')

        scale = img_rec.size[1] * 1.0 / 32
        if scale <= 0:
            continue

        w = int(img_rec.size[0] / scale)

        # FIXME: scaled width can fall below 1 pixel
        if w <= 0:
            continue

        img_rec = img_rec.resize((w, 32), Image.BILINEAR)
        width_list.append(w)

        # FIXME: enhance image contrast to improve recognition
        img_in = np.array(img_rec).T
        img_out = np.zeros(img_in.shape, np.uint8)
        cv2.normalize(img_in, img_out, 255, 0, cv2.NORM_MINMAX, cv2.CV_8U)

        # FIXME: invert black/white so the text is black on a white background
        # TODO: invert based on comparing the black-pixel area
        # TODO: could try extracting the image's foreground color
        # black = 0
        # for m in range(32):
        #    for n in range(64 if w >= 64 else w):
        #        if img_out[m, n] < 100 :
        #            black += 1
        # if black > (32*(64 if w >= 64 else w)/2):
        #    img_out = 255 - img_out

        # TODO: invert based on comparing the lines along the top and left edges
        black = 0
        for m in range(32):
            if img_out[0, m] < 100:
                black += 1
        for n in range(64 if w >= 64 else w):
            if img_out[n, 0] < 100:
                black += 1
        if black > (32 + (64 if w >= 64 else w)) // 2:
            img_out = 255 - img_out

        # TODO: binarize by extracting the black text (results were poor)
        # for i in range(32):
        #     for j in range(w):
        #         if not (img_out[i, j] < 50):
        #             img_out[i, j] = 255
        #
        # ret, img_out = cv2.threshold(img_out, 180, 255, cv2.THRESH_BINARY)

        img_rec = img_out.astype(np.float32) / 255.0 - 0.5  # img_rec is array
        img_list.append(img_rec)

    width_max = max(width_list)
    X = np.zeros((len(width_list), width_max, 32, 1), dtype=np.float32)

    for i in range(len(width_list)):
        img_pad = np.zeros((width_max - width_list[i], 32), np.float32) + 0.5
        img_rec = np.concatenate((img_list[i], img_pad), axis=0)
        X[i] = np.expand_dims(img_rec, axis=2)

        # FIXME: save the cropped images
        if img_name is not None:
            img_out = (img_rec + 0.5) * 255
            img_sa = Image.fromarray(img_out.T.astype(np.int32))
            img_sa.convert('L').save(root_recs + '/' + img_name +
                                     '_%d_.jpg' % i)

    y_pred = model.predict(X)

    out = K.get_value(
        K.ctc_decode(y_pred,
                     input_length=np.ones(y_pred.shape[0]) *
                     y_pred.shape[1])[0][0])

    for i in range(len(out)):
        out_s = u''.join([char[x] for x in out[i] if x != -1])
        # texts_str += (out_s)
        texts.append(out_s)

    # return texts_str
    return texts
Example #4
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''

        #data = self.data
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = data.GetFrequencyFeature(wavsignal, fs)
        input_length = len(data_input)
        input_length = input_length // 4

        data_input = np.array(data_input, dtype=np.float32)
        in_len = np.zeros((1), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = input_length

        batch_size = 1
        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        #input_length = tf.squeeze(input_length)

        #decode_pred = self.model_decode(x=[x_in, in_len])
        #print(decode_pred)
        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=64,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = data.list_symbol  # get the pinyin list
        #arr_zero = np.zeros((1, 200), dtype=np.int16)  # a row vector of all zeros

        #import matplotlib.pyplot as plt
        #plt.subplot(111)
        #plt.imshow(data_input, cmap=plt.get_cmap('gray'))
        #plt.show()

        #while(len(data_input)<1600):  # pad to 1600 when too short
        #	data_input = np.row_stack((data_input,arr_zero))
        #print(len(data_input))

        #list_symbol = data.list_symbol  # get the pinyin list

        #labels = [ list_symbol[0] ]
        #while(len(labels) < 64):
        #	labels.append('')

        #labels_num = []
        #for i in labels:
        #	labels_num.append(data.SymbolToNum(i))

        #data_input = np.array(data_input, dtype=np.int16)
        #data_input = data_input.reshape(data_input.shape[0],data_input.shape[1])

        #labels_num = np.array(labels_num, dtype=np.int16)
        #labels_num = labels_num.reshape(labels_num.shape[0])

        #input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
        #input_length = np.array(input_length)
        #input_length = input_length.reshape(input_length.shape[0])

        #label_length = np.array([labels_num.shape[0]], dtype=np.int16)
        #label_length = np.array(label_length)
        #label_length = label_length.reshape(label_length.shape[0])

        #x = [data_input, labels_num, input_length, label_length]
        #x = next(data.data_genetator(1, self.AUDIO_LENGTH))
        #x = kr.utils.np_utils.to_categorical(x)

        #print(x)
        #x=np.array(x)

        #pred = self._model.predict(x=x)
        #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
        #return [labels,pred]
        return r1
        pass
Example #5
                                      train_data_labels_=train_data_labels,
                                      reshape_=False),
                            shuffle=False,
                            steps_per_epoch=data_size)
        model.save(
            "/home/tatras/Desktop/github-general/cmu-deep-learning-2018/"
            "hw3/models/2_layer_lstm_ctc_epoch_{}".format(_))


def testing_():
    # Load the dev set and run it through the trained model
    test_data_raw = np.load("/home/kiriteegak/Desktop/github-general/"
                            "cmu-deep-learning-2018/hw3/data/dev.npy")
    sizes = np.apply_along_axis(len, 0, test_data_raw)
    test_data_raw = np.apply_along_axis(np.expand_dims, 0, test_data_raw, 1)
    model = load_model(
        "/home/kiriteegak/Desktop/github-general/cmu-deep-learning-2018/"
        "hw3/models/2_layer_lstm_ctc_epoch_0",
        custom_objects={'tf': tf})
    print("here")
    model_changed = change_network_architecture(model)
    return model_changed.predict(x=test_data_raw), sizes


if __name__ == '__main__':
    test_data_labels = np.load(
        "/home/kiriteegak/Desktop/github-general/"
        "cmu-deep-learning-2018/hw3/data/dev_phonemes.npy")
    outputs, lengths_ = testing_()
    print(K.ctc_decode(outputs, lengths_, greedy=False))
Example #6
import time




start = time.perf_counter()
X_test_1 = np.zeros((1, width1, height1, 3), dtype=np.uint8)
X_test_2 = np.zeros((1, width2, height2, 3), dtype=np.uint8)
file = codecs.open("test1.txt","a","utf-8")
for i in range(0,100000):
    result=""
    X_test_1[0] = cv2.resize(cv2.imread('test/'+str(i)+'_1.png'), (width1, height1), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_1 = model1.predict(X_test_1)
    y_pred_1 = y_pred_1[:,2:,:]
    out1 = K.get_value(K.ctc_decode(y_pred_1, input_length=np.ones(y_pred_1.shape[0])*y_pred_1.shape[1], )[0][0])[:, :30]
    out1 = ''.join([characters[x] for x in out1[0]])
    result += out1 +";"
    if os.path.isfile('test/'+str(i)+'_2.png'):
        X_test_1[0] = cv2.resize(cv2.imread('test/'+str(i)+'_2.png'), (width1, height1), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
        y_pred_1 = model1.predict(X_test_1)
        y_pred_1 = y_pred_1[:,2:,:]
        out1 = K.get_value(K.ctc_decode(y_pred_1, input_length=np.ones(y_pred_1.shape[0])*y_pred_1.shape[1], )[0][0])[:, :30]
        out1 = ''.join([characters[x] for x in out1[0]])   
        result +=  out1 +";"
            
    X_test_2[0] = cv2.resize(cv2.imread('test/'+str(i)+'_0.png'), (width2, height2), interpolation=cv2.INTER_LINEAR).transpose(1, 0, 2)
    y_pred_2 = model2.predict(X_test_2)
    y_pred_2 = y_pred_2[:,2:,:]
    out2 = K.get_value(K.ctc_decode(y_pred_2, input_length=np.ones(y_pred_2.shape[0])*y_pred_2.shape[1], )[0][0])[:, :30]
    out2 = ''.join([characters2[x] for x in out2[0]])
Example #7
	def __keras_decode(y_pred: np.ndarray, input_lengths: np.ndarray, greedy: bool, beam_width: int, top_paths: int) -> list:
		decoded = k.ctc_decode(y_pred=y_pred, input_length=input_lengths, greedy=greedy, beam_width=beam_width, top_paths=top_paths)
		return [path.eval(session=k.get_session()) for path in decoded[0]]
Example #8
def call(self, y_pred):
    top_k_decoded, logs = K.ctc_decode(y_pred,
                                       K.reshape(self.input_length, (-1, )),
                                       greedy=True)
    return K.reshape(top_k_decoded, (-1, 1))
Example #9
def ctc_pred(model, x, batch_size, input_len):
    pred = model.predict(x, batch_size=batch_size)
    input_len = K.constant([input_len] * len(pred), dtype="int32")
    decoded = K.ctc_decode(pred, input_len, greedy=True, beam_width=100, top_paths=1)
    return K.get_value(decoded[0][0])
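# A usage sketch for ctc_pred() above; `model` and the shapes are stand-ins.
# input_len is the number of time steps the model emits, one value per sample.
import numpy as np

x = np.zeros((8, 128, 64, 1), dtype="float32")   # dummy batch
time_steps = model.output_shape[1]               # CTC input length per sample
labels = ctc_pred(model, x, batch_size=8, input_len=time_steps)
# labels: (8, max_decoded_len) int array, padded with -1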
Example #10
                                       sample_weight=sample_weight[i:i +
                                                                   batch_size])

        total_ctcloss += ctcloss * inputs_train["the_input"].shape[0] * 1.
    loss_train[epoch] = total_ctcloss / X_train.shape[0]

    inputs_train = {
        'the_input': X_train,
        'the_labels': y_train,
        'input_length': np.sum(X_train_mask, axis=1, dtype=np.int32),
        'label_length': np.squeeze(y_train_mask),
    }
    outputs_train = {'ctc': np.zeros([y_train.shape[0]])}
    preds = test_func([inputs_train["the_input"]])[0]
    decode_function = K.ctc_decode(preds[:, 2:, :],
                                   inputs_train["input_length"] - 2,
                                   greedy=False,
                                   top_paths=1)
    labellings = decode_function[0][0].eval(session=sess)

    #    print labellings, len(labellings), len(labellings[0]), shape(labellings)
    if labellings.shape[1] == 0:
        ua_train[epoch] = 0.0
        wa_train[epoch] = 0.0
    else:
        ua_train[epoch] = unweighted_accuracy(y_train.ravel(),
                                              labellings.T[0].ravel())
        wa_train[epoch] = weighted_accuracy(y_train.ravel(),
                                            labellings.T[0].ravel())

    inputs_test = {
        'the_input': X_test,
Example #11
                  optimizer=sgd)

    batch, lab, input_len, lab_len = tt.get_batch()

    size_training_set = int(.8 * len(batch))
    print('The training set is of size {}\n'.format(size_training_set))

    [x_train, x_test] = np.split(batch, [size_training_set])
    [y_train, y_test] = np.split(lab, [size_training_set])
    [input_len_train, input_len_test] = np.split(input_len,
                                                 [size_training_set])
    [lab_len_train, lab_len_test] = np.split(lab_len, [size_training_set])

    model.fit([x_train, y_train, input_len_train, lab_len_train],
              [y_train, x_train],
              batch_size=100,
              epochs=1)

    score = model.evaluate([x_test, y_test, input_len_test, lab_len_test],
                           [y_test, x_test])

    print('The final score is {}'.format(score))

batch, lab, input_len, lab_len = tt.get_sound_examples('examples')
out = K.ctc_decode(
    model.predict([batch, lab, input_len, lab_len])[1], input_len)

E = K.eval(out[0][0])
for k in range(len(E)):
    print(tt.int_list_to_text(E[k]))
Example #12
    model.output_length = lambda x: x
    print(model.summary())
    return model


model = bidirectional_rnn_model(
    input_dim=161,  # change to 13 if you would like to use MFCC features
    units=512 + 32)

print('load Model')
model.load_weights('results/model_20.h5')
data_gen = AudioGenerator()
print("Load file")
audio_path = 'output.wav'
data_point = data_gen.normalize(data_gen.featurize(audio_path))

print("Start prediction")

#input_to_softmax.load_weights(model_path)
prediction = model.predict(np.expand_dims(data_point, axis=0), batch_size=1)
output_length = [model.output_length(data_point.shape[0])]
pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
             1).flatten().tolist()

print(prediction)
print(output_length)

print(pred_ints)
print('Predicted transcription:\n' + '\n' +
      ''.join(int_sequence_to_text(pred_ints)))
Example #13
    def __init__(self, learning_rate=0.001):
        conv_filters = 16
        kernel_size = (3, 3)
        pool_size = 2
        time_dense_size = 32
        rnn_size = 512
        img_h = 32
        act = 'relu'

        self.width = K.placeholder(name='width', ndim=0, dtype='int32')
        self.input_data = Input(name='the_input',
                                shape=(None, img_h, 1),
                                dtype='float32')
        self.inner = Conv2D(conv_filters,
                            kernel_size,
                            padding='same',
                            activation=act,
                            kernel_initializer='he_normal',
                            name='conv1')(self.input_data)
        self.inner = MaxPooling2D(pool_size=(pool_size, pool_size),
                                  name='max1')(self.inner)
        self.inner = Conv2D(conv_filters,
                            kernel_size,
                            padding='same',
                            activation=act,
                            kernel_initializer='he_normal',
                            name='conv2')(self.inner)
        self.inner = MaxPooling2D(pool_size=(pool_size, pool_size),
                                  name='max2')(self.inner)

        self.inner = Lambda(self.res,
                            arguments={"last_dim": (img_h // (pool_size ** 2)) * conv_filters,
                                       "width": self.width // 4})(self.inner)

        # cuts down input size going into RNN:
        self.inp = Dense(time_dense_size, activation=act,
                         name='dense1')(self.inner)
        self.batch_norm = keras.layers.normalization.BatchNormalization()(
            self.inp)
        self.gru_1 = Bidirectional(GRU(rnn_size,
                                       return_sequences=True,
                                       kernel_initializer='he_normal',
                                       name='gru1'),
                                   merge_mode="sum")(self.batch_norm)
        self.gru_2 = Bidirectional(GRU(rnn_size,
                                       return_sequences=True,
                                       kernel_initializer='he_normal',
                                       name='gru2'),
                                   merge_mode="concat")(self.gru_1)
        self.y_pred = TimeDistributed(
            Dense(63,
                  kernel_initializer='he_normal',
                  name='dense2',
                  activation='linear'))(self.gru_2)
        self.model = Model(inputs=self.input_data, outputs=self.y_pred)
        self.model.summary()
        self.out = K.function(
            [self.input_data, self.width,
             K.learning_phase()], [self.y_pred])
        self.y_true = K.placeholder(name='y_true', ndim=1, dtype='int32')
        self.input_length = K.placeholder(name='input_length',
                                          ndim=1,
                                          dtype='int32')
        self.label_length = K.placeholder(name='label_length',
                                          ndim=1,
                                          dtype='int32')
        self.loss_out = K.mean(
            warpctc_tensorflow.ctc(tf.transpose(self.y_pred,
                                                perm=[1, 0, 2]), self.y_true,
                                   self.label_length, self.input_length))
        # self.optimizer = keras.optimizers.Adam(lr = learning_rate)
        self.optimizer = keras.optimizers.SGD(lr=learning_rate,
                                              decay=1e-6,
                                              momentum=0.9,
                                              nesterov=True,
                                              clipnorm=200)
        self.update = self.optimizer.get_updates(self.model.trainable_weights,
                                                 [],
                                                 loss=self.loss_out)
        self.network_output = K.ctc_decode(
            Activation('softmax')(self.y_pred), self.input_length, True)[0][0]
        self.train_step = K.function(
            [self.input_data, self.width, self.y_true, self.input_length,
             self.label_length, K.learning_phase()],
            [self.loss_out, self.y_pred], updates=self.update)
        self.test = K.argmax(self.y_pred, axis=2)
        self.predict_step = K.function([
            self.input_data, self.width, self.input_length,
            K.learning_phase()
        ], [self.network_output])
Example #14
def ctc_accuracy(y_true, y_pred, max_len=MAX_LEN):
    labels = y_true[:, 2:]
    input_length = y_true[:, 0]
    decoded = K.ctc_decode(y_pred, input_length)[0][0]
    cmp = K.cast(K.equal(labels, decoded), dtype='float32')
    return K.cast(K.equal(K.sum(cmp, axis=-1), max_len), dtype='float32')
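# A sketch of the y_true layout that ctc_accuracy() above assumes (inferred
# from the slicing, not documented in the source): column 0 carries the CTC
# input length, columns 2 onward carry the padded ground-truth labels.
import numpy as np

batch, time_steps = 4, 50
y_true = np.zeros((batch, MAX_LEN + 2), dtype='float32')
y_true[:, 0] = time_steps                                   # input_length for ctc_decode
y_true[:, 2:] = np.random.randint(0, 10, (batch, MAX_LEN))  # dummy label ids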
Example #15
    def evaluate2(self, ltm_images_ph, tcng, sess):
        db = self.db
        keys = list(db.keys())

        ler_dic = {}
        tler = 0.0

        for idx in range(len(keys)):
            if idx > 40000:
                break

            bnk = keys[idx].split('/')[-1].split('_')[-1].split('.')[0]
            if bnk not in list(ler_dic.keys()):
                ler_dic[bnk] = []

            image = cv2.imread(db[keys[idx]][3], 0)
            org_shape = image.shape

            add_to_bottom = int(self.hl - org_shape[0])
            add_to_right = int(self.wl - org_shape[1])

            if org_shape[0] > self.hl or org_shape[1] > self.wl:
                raise Exception("height or width is bigger than " +
                                str(self.hl) + " x " + str(self.wl) + " " +
                                org_shape)

            padded_image = cv2.copyMakeBorder(image, 0, add_to_bottom, 0,
                                              add_to_right,
                                              cv2.BORDER_CONSTANT, 0)
            padded_image = np.array(
                padded_image.reshape(1, self.hl, self.wl, 1))

            ls = np.array(
                sorted([int(line) for line in db[keys[idx]][2].split('-')
                        ])).reshape(-1, 3)
            height = np.array(org_shape[0]).reshape(-1, 1)
            width = np.array(org_shape[1]).reshape(-1, 1)

            label, seq_len = self.label_processor(db[keys[idx]][0])
            label = np.array(label)
            seq_len = np.array(seq_len).reshape(-1, 1)

            if True:
                image = np.concatenate([padded_image, padded_image], axis=0)
                height = np.concatenate([height, height], axis=0)
                width = np.concatenate([width, width], axis=0)
                ls = np.concatenate([ls, ls], axis=0)

            ltm_images, l_true = ltm_img_processor(image,
                                                   height,
                                                   width,
                                                   ls,
                                                   double=False)

            y_pred = sess.run(
                [tcng.fc_2],
                feed_dict={
                    ltm_images_ph: ltm_images,
                    tcng.images_ph: image,
                    tcng.heights_ph: height,
                    tcng.widths_ph: width
                })

            y_pred = y_pred[0]
            shape = y_pred[:, 2:, :].shape
            ctc_decode = bknd.ctc_decode(y_pred[:, 2:, :],
                                         input_length=np.ones(shape[0]) *
                                         shape[1])[0][0]
            out = bknd.get_value(ctc_decode)[:, :self.maxL]

            ler = compare1(out, label, self.Ivoc, show=2)
            ler_dic[bnk].append(float(ler))
            tler += ler

            logging.debug("processed %i out of %i", idx, len(keys))

        for bnk in list(ler_dic.keys()):
            ler_dic[bnk] = np.mean(ler_dic[bnk])
            logging.info("ler for bank %i is %f", int(bnk), ler_dic[bnk])
        return tler / len(keys)
Example #16
    def Predict(self, batch_size, data_input, in_len):
        '''
        Predict the result.
        Returns the list of pinyin symbols from speech recognition.
        '''
        batch_size = 1

        in_len = np.zeros((batch_size), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = in_len[0] - 2

        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        y_p = base_pred
        print('base_pred0:\n', base_pred[0][0].shape)

        #for j in range(200):
        #	mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
        #	print('max y_p:',np.max(y_p[0][j]),'min y_p:',np.min(y_p[0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][j][100])
        #	print('argmin:',np.argmin(y_p[0][j]),'argmax:',np.argmax(y_p[0][j]))
        #	count=0
        #	for i in range(y_p[0][j].shape[0]):
        #		if(y_p[0][j][i] < mean):
        #			count += 1
        #	print('count:',count)

        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=100,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = GetSymbolList(self.datapath)  # get the pinyin list

        r1 = r1[0]

        r_str = []
        for i in r1:
            r_str.append(list_symbol_dic[i])

        #print(r_str)

        return r_str
        pass
Example #17
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''

        #data = self.data
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = data.GetFrequencyFeature(wavsignal, fs)

        list_symbol_dic = data.list_symbol  # get the pinyin list

        labels = [
            'dong1', 'bei3', 'jun1', 'de5', 'yi4', 'xie1', 'ai4', 'guo2',
            'jiang4', 'shi4', 'ma3', 'zhan4', 'shan1', 'li3', 'du4', 'tang2',
            'ju4', 'wu3', 'su1', 'bing3', 'ai4', 'deng4', 'tie3', 'mei2',
            'deng3', 'ye3', 'fen4', 'qi3', 'kang4', 'zhan4'
        ]
        #labels = [ list_symbol_dic[-1] ]
        #labels = [ list_symbol_dic[-1] ]
        #while(len(labels) < 32):
        #	labels.append(list_symbol_dic[-1])

        feat_out = []
        #print("数据编号",n_start,filename)
        for i in labels:
            if ('' != i):
                n = data.SymbolToNum(i)
                feat_out.append(n)

        print(feat_out)
        labels = feat_out

        x = next(
            self.data_gen(data_input=np.array(data_input),
                          data_labels=np.array(labels),
                          input_length=len(data_input),
                          labels_length=len(labels),
                          batch_size=2))

        [test_input_data, y, test_input_length, label_length], labels = x
        xx = [test_input_data, y, test_input_length, label_length]

        pred = self._model.predict(x=xx)

        print(pred)

        shape = pred[:, :].shape
        print(shape)

        #print(test_input_data)
        y_p = self.test_func([test_input_data])
        print(type(y_p))
        print('y_p:', y_p)

        for j in range(0, 200):
            mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
            print('max y_p:', max(y_p[0][0][j]), 'min y_p:', min(y_p[0][0][j]),
                  'mean y_p:', mean, 'mid y_p:', y_p[0][0][j][100])
            print('argmin:', np.argmin(y_p[0][0][j]), 'argmax:',
                  np.argmax(y_p[0][0][j]))
            count = 0
            for i in y_p[0][0][j]:
                if (i < mean):
                    count += 1
            print('count:', count)

        print(K.is_sparse(y_p))
        y_p = K.to_dense(y_p)
        print(K.is_sparse(y_p))
        #y_p = tf.sparse_to_dense(y_p,(2,397),1417,0)
        print(test_input_length.T)
        test_input_length = test_input_length.reshape(2, 1)
        func_in_len = self.test_func_input_length([test_input_length])
        print(type(func_in_len))
        #in_len = np.ones(shape[0]) * shape[1]
        ctc_decoded = K.ctc_decode(y_p, input_length=func_in_len)

        print(ctc_decoded)
        #ctc_decoded = ctc_decoded[0][0]
        #out = K.get_value(ctc_decoded)[:,:64]
        #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
        return pred[0][0]

        pass
Example #18
# As the model predicts a probability for each class at each time step, we need a transcription function to turn those probabilities into actual text. Here we use the CTC decoder to get the output text. Let's see the code:



# load the saved best model weights
act_model.load_weights('best_model.hdf5')
 
num_val = 15000
# predict outputs on validation images
prediction = act_model.predict(valid_img[:num_val])
 
valid_img = np.array(valid_img)

# use CTC decoder
out = K.get_value(K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0])*prediction.shape[1],
                         greedy=True)[0][0])
#print(out)
out_pred = ''
counter = 0

# see the results
i = 0
for x in out:
    print("original_text =  ", valid_orig_txt[i])
    print("predicted text = ", end = '')
    for p in x:  
        if int(p) != -1:
            c = char_list[int(p)]
            print(char_list[int(p)], end = '')
            out_pred= out_pred + c
    if valid_orig_txt[i] == out_pred:
Example #19
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        The final speech-recognition function: recognizes one WAV sequence.
        Note: this still has bugs.
        '''
        #data = self.data
        #data = DataSpeech('E:\\语音数据集')
        #data.LoadDataList('dev')
        # extract input features
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = GetFrequencyFeature(wavsignal, fs)
        input_length = len(data_input)
        input_length = input_length // 4

        data_input = np.array(data_input, dtype=np.float32)
        in_len = np.zeros((1), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = input_length - 2

        batch_size = 1
        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float32)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        y_p = base_pred
        print('base_pred0:\n', base_pred[0][0].shape)

        for j in range(200):
            mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
            print('max y_p:', np.max(y_p[0][j]), 'min y_p:', np.min(y_p[0][j]),
                  'mean y_p:', mean, 'mid y_p:', y_p[0][j][100])
            print('argmin:', np.argmin(y_p[0][j]), 'argmax:',
                  np.argmax(y_p[0][j]))
            count = 0
            for i in range(y_p[0][j].shape[0]):
                if (y_p[0][j][i] < mean):
                    count += 1
            print('count:', count)
        #for j in range(0,200):
        #	mean = sum(y_p[0][0][j]) / len(y_p[0][0][j])
        #	print('max y_p:',max(y_p[0][0][j]),'min y_p:',min(y_p[0][0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][0][j][100])
        #	print('argmin:',np.argmin(y_p[0][0][j]),'argmax:',np.argmax(y_p[0][0][j]))
        #	count=0
        #	for i in y_p[0][0][j]:
        #		if(i < mean):
        #			count += 1
        #	print('count:',count)
        #decoded_sequences = self.decoder([base_pred, in_len])

        #print('decoded_sequences:\n', decoded_sequences)
        #input_length = tf.squeeze(input_length)

        #decode_pred = self.model_decode(x=[x_in, in_len])
        #print(decode_pred)
        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=100,
                         top_paths=1)
        print('r', r)
        #r = K.cast(r[0][0], dtype='float32')
        #print('r1', r)
        #print('decoding finished')

        r1 = K.get_value(r[0][0])
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])
        print(r2)
        print('decoding finished')
        list_symbol_dic = GetSymbolList(self.datapath)  # get the pinyin list

        r1 = r1[0]

        r_str = []
        for i in r1:
            r_str.append(list_symbol_dic[i])

        #print(r_str)

        return r_str

        pass
Example #20
if opts.printmodel:
    plot_model(model, to_file="model.png", show_shapes=True)
    Image('model.png')

if not opts.testing:
    model.fit_generator(gen(opts.batch_size), steps_per_epoch=opts.steps, epochs=opts.epochs,
            callbacks=[EarlyStopping(patience=10), evaluator],
            validation_data=gen(), validation_steps=1280)
else:
    print("testing......")
    characters2 = characters + ' '
    [X_test, y_test, _, _], _  = next(gen(1))
    #cv2.imwrite("./save_image/test.jpg" , X_test)
    y_pred = base_model.predict(X_test)
    y_pred = y_pred[:,2:,:]
    out = K.get_value(K.ctc_decode(y_pred, input_length=np.ones(y_pred.shape[0])*y_pred.shape[1], )[0][0])[:, :7]
    out = ''.join([characters[x] for x in out[0]])
    y_true = ''.join([characters[x] for x in y_test[0]])
    print(out)
    print(y_true)

if opts.modelname is None and not opts.testing:
    run_name = datetime.datetime.now().strftime('%Y:%m:%d:%H:%M:%S')
    model.save(run_name+".h5")
    base_model.save("base_"+run_name+".h5")
elif opts.testing:
    print("Please input testing model name")
else:
    model.save(opts.modelname)
    base_model.save("base_"+opts.modelname)
del model
Example #21
def _dft_ctc_decode(y_pred, input_length, beam_width=100):
    assert False, "fixme"
    sm_y_pred = K.softmax(y_pred)
    return K.ctc_decode(
        sm_y_pred, K.flatten(input_length),
        beam_width=beam_width, greedy=False, top_paths=1)[0][0]
Example #22
def predict(wavs):
    # print("pppppppppppppppppp")
    # initialize the speech engine
    # speaker = win32com.client.Dispatch("SAPI.SpVoice")

    # my_record()

    # wavs = glob.glob('.//test_data/voice_test.wav')
    # wavs = ['/data/user/0/com.example.chaquopytest/files/chaquopy/AssetFinder/app/sjbf_speech2.wav']
    # print(wavs)
    a = join(dirname(__file__), 'asr_video_enhance_2.h5')
    print(type(a))
    graph = tf.compat.v1.get_default_graph()
    session = tf.compat.v1.Session()
    with graph.as_default():
        with session.as_default():
            model = load_model(join(dirname(__file__), 'asr_video_enhance_2.h5'))
    # model = load_model(join(dirname(__file__), 'asr_video_enhance_2.h5'))
    # load_model('/data/user/0/com.fangte.yjy.speechrecogni/files/chaquopy/AssetFinder/app/asr_video_enhance_2.h5')
    pk = join(dirname(__file__), 'dictionary_video_enhance_2.pkl')
    with open(pk, 'rb') as fr:
        [_, id2char, mfcc_mean, mfcc_std] = pickle.load(fr)
    #     # char2id = pd.DataFrame(char2id.items(), columns=['name', 'index'])
    #     # print(char2id)
    # wavs = join(dirname(__file__), l)
    # wavs = []
    # wavs.append(l)
    # mfcc_mean = np.array([-5.54817, 10.18685, -16.97834, 19.95623, -24.71567, 1.91108, -17.68871, 2.04288, -17.55804,
    #                       0.20271, -9.62210, -5.43127, -1.53957])
    # mfcc_std = np.array([4.11379, 16.58478, 15.80970, 18.87008, 18.04815, 21.30934, 19.47388, 18.76543, 16.85591,
    #                      16.07542, 13.90712, 13.12571, 12.20504])
    # id2char = {0: '倍', 1: '速', 2: '快', 3: '播', 4: '放', 5: '一', 6: '个', 7: '慢', 8: '0', 9: '.', 10: '5',
    #            11: '2', 12: '停', 13: '4', 14: '随', 15: '机', 16: '顺', 17: '序', 18: '上', 19: '1', 20: '进',
    #            21: '下', 22: '暂', 23: '开', 24: '始', 25: '止', 26: '退', 27: '循', 28: '环'
    #            }
    mfcc_dim = 13
    # index = np.random.randint(len(wavs))
    # print(wavs[index])
    # audio, sr = librosa.load(wavs[index])
    print(wavs)
    audio, sr = librosa.load(wavs)
    energy = librosa.feature.rms(audio)
    frames = np.nonzero(energy >= np.max(energy) / 5)
    indices = librosa.core.frames_to_samples(frames)[1]
    audio = audio[indices[0]:indices[-1]] if indices.size else audio[0:0]
    X_data = mfcc(audio, sr, numcep=mfcc_dim, nfft=551)
    X_data = (X_data - mfcc_mean) / (mfcc_std + 1e-14)
    # print(X_data.shape)
    tf.compat.v1.reset_default_graph()
    with graph.as_default():
        with session.as_default():
            pred = model.predict(np.expand_dims(X_data, axis=0))
    # pred = model.predict(np.expand_dims(X_data, axis=0))
    pred_ids = K.eval(K.ctc_decode(pred, [X_data.shape[0]], greedy=False, beam_width=10, top_paths=1)[0][0])
    pred_ids = pred_ids.flatten().tolist()
    text = ''.join([id2char[i] for i in pred_ids])
    # print(''.join([id2char[i] for i in pred_ids]))
    print(text)
    return text

# if __name__ == '__main__':
#     result = predict()
    # print(result)
Example #23
        batch_num = 1  # 264
        batch_acc = 0
        true_acc = 0
        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
        print(st)
        print(datetime.datetime.now())
        for i in range(batch_num):
            # [X_test, y_test, _, _], _  = next(generator)
            print(X_test[i])
            y_pred = sess.run(y, feed_dict={
                x: X_test[i][np.newaxis, :]
            })
            shape = y_pred[:, 2:, :].shape
            out = K.get_value(K.ctc_decode(y_pred[:, 2:, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :8]
            # if out.shape[1] == 8:
            # batch_acc += ((y_test[i] == out).sum(axis=1) == 8).mean()
            # argmax = np.argmax(y_pred, axis=2)[0]
            out = ''.join([characters[x] for x in out[0]]).replace(' ', '')
            y_true = ''.join([characters[x] for x in y_test[i]]).replace(' ', '')
            if out == y_true:
                true_acc += 1

            """
            else:
                print(out)
                print(y_true)
                print("-----------")
            """
        # print(true_acc / batch_num*100)
Example #24
def ctc_decode(softmax):
    return K.ctc_decode(
        softmax, K.tile([K.shape(softmax)[1]], [K.shape(softmax)[0]]))[0]
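# The K.tile call above builds the input-length vector symbolically: it repeats
# the time-axis size once per batch element, the graph-mode equivalent of
# np.full(batch_size, timesteps). The same decode with concrete values
# (shapes invented):
import numpy as np
from keras import backend as K

softmax_demo = np.random.rand(3, 20, 7).astype('float32')   # (batch, timesteps, classes)
lengths_demo = np.full(softmax_demo.shape[0], softmax_demo.shape[1])
decoded_demo = K.ctc_decode(softmax_demo, lengths_demo)[0]  # list of decoded tensors
print(K.get_value(decoded_demo[0]))                         # (3, max_len), -1-padded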
Example #25
        mat_ori = np.zeros(
            (height, width - int(31.0 / img_size[0] * img_size[1]), 3),
            dtype=np.uint8)
        out_img = np.concatenate([img_reshape, mat_ori],
                                 axis=1).transpose([1, 0, 2])
    else:
        out_img = cv2.resize(img, (width, height),
                             interpolation=cv2.INTER_CUBIC)
        out_img = np.asarray(out_img).transpose([1, 0, 2])

    img_list[ii] = np.asarray(out_img)
    ii += 1

model = load_model('PATH_TO_WEIGHT_FILE')
# If you want to load a model with STN, use:
# model = load_model('PATH_TO_WEIGHT_FILE', custom_objects={'SpatialTransformer': SpatialTransformer})

y_pred = model.predict(img_list)
shape = y_pred[:, 2:, :].shape
ctc_decode = bknd.ctc_decode(y_pred[:, 2:, :],
                             input_length=np.ones(shape[0]) * shape[1])[0][0]
out = bknd.get_value(ctc_decode)[:, :label_len]

out_list = []
for m in range(len(fileList)):
    result_str = ''.join([characters[k] for k in out[m]])
    out_list.append(result_str)

print(out_list)
Example #26
def ctc_decode(pred):
    c = K.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1], greedy=False, beam_width=10)[0][0]
    print(c)
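# Note that `c` above is still a symbolic tensor, so the print shows a Tensor
# rather than label ids. A sketch of a variant that materializes the ids
# (same decode call, wrapped in K.get_value):
import numpy as np
from keras import backend as K

def ctc_decode_values(pred):
    c = K.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1],
                     greedy=False, beam_width=10)[0][0]
    return K.get_value(c)  # (batch, max_len) int array, padded with -1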
Example #27
    },
                        optimizer=Adam(lr=0.0001))

    model_final.fit(
        x=[train_x, train_y, train_input_len, train_label_len],
        y=train_output,
        validation_data=([valid_x, valid_y, valid_input_len,
                          valid_label_len], valid_output),
        epochs=60,
        batch_size=128)

    # Check model performance on the validation set
    preds = model.predict(valid_x)
    decoded = K.get_value(
        K.ctc_decode(preds,
                     input_length=np.ones(preds.shape[0]) * preds.shape[1],
                     greedy=True)[0][0])

    prediction = []
    for i in range(valid_size):
        prediction.append(num_to_label(decoded[i]))

    y_true = validation_written_df.loc[0:valid_size, 'IDENTITY']
    correct_char = 0
    total_char = 0
    correct = 0

    for i in range(valid_size):
        pr = prediction[i]
        tr = y_true[i]
        total_char += len(tr)
Example #28
    print("predicting for:" + pathAndFilename)
    # predict outputs on validation images
    # img = Image.open(pathAndFilename)
    # img = img.resize((128, 32), Image.BICUBIC)

    # img = np.array(img) /255;
    # img = np.sum(img, axis=2,keepdims=True)
    img, _, _, _ = process_data(pathAndFilename, "1_1")
    img = img / 255.
    img = np.expand_dims(img, axis=0)
    prediction = act_model.predict(img)

    # use CTC decoder
    out = K.get_value(
        K.ctc_decode(prediction,
                     input_length=np.ones(prediction.shape[0]) *
                     prediction.shape[1],
                     greedy=False)[0][0])
    head, tail = ntpath.split(pathAndFilename)
    txt = tail.split('_')[1]
    # see the results
    i = 0
    le = min(10, out.shape[1])
    print(out.shape)
    for x in out:
        print(txt)
        for p in range(0, le):
            if int(x[p]) != -1:
                print(char_list[int(x[p])], end='')
        print('\n')
        i += 1
Example #29
def get_predictions(index, partition, input_to_softmax, model_path, phn=False):
    """ Print a model's decoded predictions
	Params:
		index (int): The example you would like to visualize
		partition (str): One of 'train' or 'validation'
		input_to_softmax (Model): The acoustic model
		model_path (str): Path to saved acoustic model's weights
	"""
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_test_data()

    # obtain the true transcription and the audio features
    if partition == 'test':
        if phn:
            transcr = data_gen.test_phn_texts[index]
            audio_path = data_gen.test_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.test_wrd_texts[index]
            audio_path = data_gen.test_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        if phn:
            transcr = data_gen.train_phn_texts[index]
            audio_path = data_gen.train_phn_audio_paths[index]
        elif not phn:
            transcr = data_gen.train_wrd_texts[index]
            audio_path = data_gen.train_wrd_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition! Must be "train" or "test"')

    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])]
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) +
                 1).flatten().tolist()

    # play the audio file, and display the true and predicted transcriptions
    if not phn:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n' +
              ''.join(int_sequence_to_text(pred_ints, phn)))
        print('-' * 80)
    else:
        print('-' * 80)
        Audio(audio_path)
        print('True transcription:\n' + '\n' + transcr)
        print('-' * 80)
        print('Predicted transcription:\n' + '\n')
        split_true = transcr.split(" ")
        split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
        print("\033[1;32m" + split_pred[0] + " ", end='')
        for i in range(1, len(split_true) - 1):
            if split_true[i - 1] == split_pred[i] or split_true[
                    i] == split_pred[i] or split_true[i + 1] == split_pred[i]:
                print("\033[1;32m" + split_pred[i] + " ", end='')
            else:
                print("\033[1;31m" + split_pred[i] + " ", end='')
        print(split_pred[len(split_true) - 1] + " ", end='')
    split_pred = (''.join(int_sequence_to_text(pred_ints, phn))).split(" ")
    split_true = transcr.split(" ")
    displayAccuracy(split_true, split_pred, phn)
Example #30
    # print(np.shape(X))
    X = np.transpose(X, (0, 2, 3, 1))
    X = np.array(X)
    Y = np.array(Y)
    return X, Y


# the actual loss calc occurs here despite it not being
# an internal Keras loss function
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    # y_pred = y_pred[:, 2:, :]  # testing suggested it makes no difference
    y_pred = y_pred[:, :, :]
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


if __name__ == '__main__':
    height = 150
    width = 50
    input_tensor = Input((height, width, 1))
    x = input_tensor
    for i in range(3):
        x = Convolution2D(32 * 2 ** i, (3, 3), activation='relu', padding='same')(x)
        # x = Convolution2D(32*2**i, (3, 3), activation='relu')(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)
    conv_shape = x.get_shape()
    # print(conv_shape)
    x = Reshape(target_shape=(int(conv_shape[1]), int(conv_shape[2] * conv_shape[3])))(x)
    x = Dense(32, activation='relu')(x)

    gru_1 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru1')(x)
    gru_1b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(x)
    gru1_merged = add([gru_1, gru_1b])

    gru_2 = GRU(32, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
    gru_2b = GRU(32, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(gru1_merged)
    x = concatenate([gru_2, gru_2b])

    x = Dropout(0.25)(x)
    x = Dense(label_count, kernel_initializer='he_normal', activation='softmax')(x)
    base_model = Model(inputs=input_tensor, outputs=x)

    labels = Input(name='the_labels', shape=[seq_len], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([x, labels, input_length, label_length])
    model = Model(inputs=[input_tensor, labels, input_length, label_length], outputs=[loss_out])
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adadelta')
    model.summary()

    def test(base_model):
        file_list = []
        X, Y = gen_image_data(r'data\test', file_list)
        y_pred = base_model.predict(X)
        shape = y_pred[:, :, :].shape  # 2:
        out = K.get_value(K.ctc_decode(y_pred[:, :, :], input_length=np.ones(shape[0]) * shape[1])[0][0])[:, :seq_len]  # 2:
        print()
        error_count = 0
        for i in range(len(X)):
            print(file_list[i])
            str_src = str(os.path.split(file_list[i])[-1]).split('.')[0].split('_')[-1]
            print(out[i])
            str_out = ''.join([str(x) for x in out[i] if x != -1])
            print(str_src, str_out)
            if str_src != str_out:
                error_count += 1
                print('################################', error_count)
            # img = cv2.imread(file_list[i])
            # cv2.imshow('image', img)
            # cv2.waitKey()

    class LossHistory(Callback):
        def on_train_begin(self, logs={}):
            self.losses = []

        def on_epoch_end(self, epoch, logs=None):
            model.save_weights('model_1018.w')
            base_model.save_weights('base_model_1018.w')
            test(base_model)

        def on_batch_end(self, batch, logs={}):
            self.losses.append(logs.get('loss'))

    # checkpointer = ModelCheckpoint(filepath="keras_seq2seq_1018.hdf5", verbose=1, save_best_only=True, )
    history = LossHistory()
    # base_model.load_weights('base_model_1018.w')
    # model.load_weights('model_1018.w')

    X, Y = gen_image_data()
    maxin = 4900
    subseq_size = 100
    batch_size = 10
    result = model.fit([X[:maxin], Y[:maxin],
                        np.array(np.ones(len(X)) * int(conv_shape[1]))[:maxin],
                        np.array(np.ones(len(X)) * seq_len)[:maxin]],
                       Y[:maxin],
                       batch_size=20,
                       epochs=1000,
                       callbacks=[history, plotter, EarlyStopping(patience=10)],  # checkpointer, history,
Example #31
def predict(self, X):
    y_pred = self.model.predict(X)
    input_length = np.ones(y_pred.shape[0]) * y_pred.shape[1]
    predicts = K.eval(K.ctc_decode(y_pred, input_length)[0][0])
    return predicts
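# The decoded matrix above is padded with -1. A common follow-up (sketch;
# `characters` is an assumed id-to-character lookup table, not part of the
# example) maps each row back to a string while skipping the padding:
def ids_to_text(decoded_row, characters):
    return ''.join(characters[int(i)] for i in decoded_row if i != -1)

# e.g.: texts = [ids_to_text(row, characters) for row in predicts]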