Example #1
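All of the snippets below are fragments of the same Keras speech-recognition project and omit their imports. A plausible import header is shown here as an assumption; the project-local names (DataSpeech, GetEditDistance, MyCallback, FEATURE_TYPE, ModelName, NUM_GPU, ParallelModel) come from the project's own modules, which are not part of these snippets.

import os
import random
import time

import numpy as np
from keras import backend as K
from keras.layers import (GRU, Activation, BatchNormalization, Conv2D, Dense,
                          Dropout, Input, Lambda, MaxPooling2D, Reshape, add,
                          concatenate)
from keras.models import Model
from keras.optimizers import Adadelta, Adam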
 def __init__(self, corpus):
     MS_OUTPUT_SIZE = 1424
     self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE
     self.label_max_string_length = 64
     self.AUDIO_LENGTH = 1600
     self.AUDIO_FEATURE_LENGTH = 80
     self.datapath = 'feature'
     self._model, self.base_model = self.CreateModel()
     self.corpus = corpus
     self.data_test = DataSpeech('../feature', FEATURE_TYPE, 'thchs30', 'test', self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
Example #2
    def TrainModel(self, epoch=10, save_step=1000, batch_size=32):

        self.data = DataSpeech('../feature',
                               feature_type=FEATURE_TYPE,
                               corpus='thchs30',
                               type='train',
                               audio_length=self.AUDIO_LENGTH,
                               feature_size=self.AUDIO_FEATURE_LENGTH)
        trainGen = self.data.data_generator(batch_size)
        trainNum = self.data.getDataNum()

        dev_data = DataSpeech('../feature',
                              feature_type=FEATURE_TYPE,
                              corpus='thchs30',
                              type='dev',
                              audio_length=self.AUDIO_LENGTH,
                              feature_size=self.AUDIO_FEATURE_LENGTH)
        devGen = dev_data.data_generator(batch_size)
        devNum = dev_data.getDataNum()
        # g = devGen
        # x, y = next(g)

        cb = MyCallback()

        print('[*INFO] Training the Model')
        H = self._model.fit_generator(trainGen,
                                      steps_per_epoch=trainNum // batch_size,
                                      validation_data=devGen,
                                      validation_steps=devNum // batch_size,
                                      epochs=epoch,
                                      callbacks=[cb])
Example #3
    def TrainModel(self, epoch=10, save_step=1000, batch_size=32):

        self.data = DataSpeech('../feature', 'thchs30', 'train',
                               self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        trainGen = self.data.data_generator(batch_size)
        trainNum = self.data.getDataNum()

        dev_data = DataSpeech('../feature', 'thchs30', 'dev',
                              self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        devGen = dev_data.data_generator(batch_size)
        devNum = dev_data.getDataNum()
        # g = devGen
        # x, y = next(g)

        cb = MyCallback()

        print('[*INFO] Training the Model')
        H = self._model.fit_generator(trainGen,
                                      steps_per_epoch=trainNum // batch_size,
                                      validation_data=devGen,
                                      validation_steps=devNum // batch_size,
                                      epochs=epoch,
                                      callbacks=[cb])

        print('[*INFO] Evaluating the Model')
        '''
        predIdex = self._model.predict_generator(
            devGen,
            steps=((devNum // batch_size)+1)
        )
        '''
        # print(classification_report(x[1], predIdex, target_names=lb.classes_))
        self.TestModel(self.datapath, dataset='dev', data_count=4)
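Both TrainModel variants above pass the DataSpeech generators straight to fit_generator on a model whose single output is the CTC loss tensor. A minimal sketch of the batch layout such a generator presumably has to yield (the shapes are inferred from the CreateModel code in the next example; the real DataSpeech implementation is not shown here):

import numpy as np

def make_dummy_ctc_batch(batch_size, audio_length=1600, feature_size=80,
                         label_max_len=64):
    # Four inputs, in the same order as the Input layers of the training model.
    x = np.zeros((batch_size, audio_length, feature_size, 1), dtype=np.float32)
    labels = np.zeros((batch_size, label_max_len), dtype=np.float32)  # padded label indices
    # Usable time steps of the network output: three 2x2 poolings shrink the
    # time axis by a factor of 8 (see Predict below, which uses length // 8).
    input_length = np.full((batch_size, 1), audio_length // 8, dtype=np.int64)
    label_length = np.ones((batch_size, 1), dtype=np.int64)  # true label lengths
    # The 'ctc' Lambda layer already outputs the loss, so the target is a dummy.
    dummy_y = np.zeros((batch_size,), dtype=np.float32)
    return [x, labels, input_length, label_length], dummy_y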
Example #4
class ModelSpeech():
    def __init__(self, corpus):
        MS_OUTPUT_SIZE = 1424
        self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE
        self.label_max_string_length = 64
        self.AUDIO_LENGTH = 1600
        self.AUDIO_FEATURE_LENGTH = 80
        self.datapath = 'feature'
        self._model, self.base_model = self.CreateModel()
        self.corpus = corpus
        #self.data_test = DataSpeech('../feature', FEATURE_TYPE, 'thchs30', 'test', self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)

    def CreateModel(self):
        '''
        Define the CNN/DFCNN/CTC model using the Keras functional API.
        Input layer: sequence of 80-dimensional feature vectors; one utterance is padded to at most 1600 frames (about 16 s).
        Hidden layers: convolution + pooling blocks with 3x3 kernels and 2x2 pooling windows.
        Hidden layer: fully connected layer.
        Output layer: fully connected layer with self.MS_OUTPUT_SIZE units and a softmax activation.
        CTC layer: uses the CTC loss as the loss function for the connectionist temporal classification outputs.
        '''

        input_data = Input(name='input',
                           shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH,
                                  1))
        x = Conv2D(32, (3, 3),
                   use_bias=False,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(input_data)
        x = BatchNormalization()(x)  # Keras 2 removed the old 'mode' argument

        x = Conv2D(32, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D(pool_size=2, strides=None, padding='valid')(x)
        x = Dropout(0.1)(x)

        x = Conv2D(64, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = Conv2D(64, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D(pool_size=2, strides=None, padding='valid')(x)
        x = Dropout(0.1)(x)

        x = Conv2D(128, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = Conv2D(128, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D(pool_size=2, strides=None, padding='valid')(x)
        x = Dropout(0.1)(x)

        x = Conv2D(128, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)
        x = Conv2D(128, (3, 3),
                   use_bias=True,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(x)
        x = BatchNormalization(axis=-1)(x)
        x = MaxPooling2D(pool_size=1, strides=None, padding='valid')(x)

        #x = Reshape((x.shape[0], x.shape[1]*x.shape[2]))(x)
        #print(x.shape)
        #exit(0)
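        # 1600x80x1 input -> three 2x2 poolings and one 1x1 pooling leave a
        # 200x10x128 feature map, i.e. 200 time steps of 10 * 128 = 1280 values,
        # which is what the Reshape below flattens each frame into.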
        x = Reshape((200, 1280))(x)
        x = Dense(128,
                  activation='relu',
                  use_bias=True,
                  kernel_initializer='he_normal')(x)
        x = BatchNormalization(axis=-1)(x)
        x = Dense(self.MS_OUTPUT_SIZE,
                  use_bias=True,
                  kernel_initializer='he_normal')(x)
        x = BatchNormalization(axis=-1)(x)

        y_pred = Activation('softmax', name='Activation0')(x)
        model_data = Model(inputs=input_data, outputs=y_pred)

        labels = Input(name='the_labels',
                       shape=[self.label_max_string_length],
                       dtype='float32')
        input_length = Input(name='input_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')

        loss_out = Lambda(self.ctc_lambda_func, output_shape=(1, ),
                          name='ctc')(
                              [y_pred, labels, input_length, label_length])

        model = Model(inputs=[input_data, labels, input_length, label_length],
                      outputs=loss_out)
        model.summary()

        # clipnorm seems to speed up convergence
        # sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
        opt = Adam(lr=0.001,
                   beta_1=0.9,
                   beta_2=0.999,
                   decay=0.0,
                   epsilon=10e-8)

        model.build((self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1))
        #model = ParallelModel(model, NUM_GPU)
        #model = multi_gpu_model(model, gpus=2)

        # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)
        model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                      optimizer=opt)

        # captures output of softmax so we can decode the output during visualization
        test_func = K.function([input_data], [y_pred])

        print('[*Info] Model created and compiled successfully.')
        return model, model_data

    def ctc_lambda_func(self, args):
        y_pred, labels, input_length, label_length = args
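        # K.ctc_batch_cost expects labels of shape (batch, max_label_len),
        # y_pred of shape (batch, time_steps, num_classes) and the two length
        # tensors of shape (batch, 1). The first two time steps are dropped
        # here, presumably because the earliest outputs are unreliable (the
        # same trick appears in the Keras image_ocr example).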
        y_pred = y_pred[:, 2:, :]
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

    def TrainModel(self, epoch=10, save_step=1000, batch_size=32):

        self.data = DataSpeech('../feature',
                               feature_type=FEATURE_TYPE,
                               corpus='thchs30',
                               type='train',
                               audio_length=self.AUDIO_LENGTH,
                               feature_size=self.AUDIO_FEATURE_LENGTH)
        trainGen = self.data.data_generator(batch_size)
        trainNum = self.data.getDataNum()

        dev_data = DataSpeech('../feature',
                              feature_type=FEATURE_TYPE,
                              corpus='thchs30',
                              type='dev',
                              audio_length=self.AUDIO_LENGTH,
                              feature_size=self.AUDIO_FEATURE_LENGTH)
        devGen = dev_data.data_generator(batch_size)
        devNum = dev_data.getDataNum()
        # g = devGen
        # x, y = next(g)

        cb = MyCallback()

        print('[*INFO] Training the Model')
        H = self._model.fit_generator(trainGen,
                                      steps_per_epoch=trainNum // batch_size,
                                      validation_data=devGen,
                                      validation_steps=devNum // batch_size,
                                      epochs=epoch,
                                      callbacks=[cb])

    def LoadModel(self,
                  filename='model_speech/' + ModelName + '/speech_model' +
                  ModelName + '.model'):

        # Load the model weights

        self._model.load_weights(filename)
        self.base_model.load_weights(filename + '.base')

    def SaveModel(self,
                  filename='model_speech/' + FEATURE_TYPE + '_' + ModelName +
                  '/' + ModelName,
                  comment=''):
        '''
        Save the model weights.
        '''
        if not os.path.exists('model_speech/' + FEATURE_TYPE + '_' +
                              ModelName):
            os.makedirs('model_speech/' + FEATURE_TYPE + '_' + ModelName)
        self._model.save_weights(filename + comment + '.model')
        self.base_model.save_weights(filename + comment + '.model.base')
        f = open(
            'model_speech/' + FEATURE_TYPE + '_' + ModelName + '/step_' +
            ModelName + '.txt', 'w')
        f.write(filename + comment)
        f.close()

    def TestModel(self,
                  datapath='../feature',
                  dataset='dev',
                  data_count=32,
                  out_report=False,
                  show_ratio=True):
        # Evaluate the model on the given dataset

        self.data = DataSpeech('../feature', FEATURE_TYPE, 'thchs30', dataset,
                               self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        # data = DataSpeech(datapath, self.corpus, dataset)
        # data.LoadDataList(str_dataset)
        num_data = self.data.getDataNum()  # number of samples in the dataset
        if (data_count <= 0 or data_count >
                num_data):  # if data_count is <= 0 or larger than the dataset size, test on all of the data
            data_count = num_data

        try:
            ran_num = random.randint(0, num_data - 1)  # pick a random starting index

            words_num = 0
            word_error_num = 0

            nowtime = time.strftime('%Y%m%d_%H%M%S',
                                    time.localtime(time.time()))
            if (out_report == True):
                txt_obj = open('Test_Report_' + dataset + '_' + nowtime +
                               '.txt',
                               'w',
                               encoding='UTF-8')  # open the report file for writing

            txt = ''
            for i in range(data_count):
                # data_input, data_labels = data.GetData((ran_num + i) % num_data)  # take consecutive samples starting from the random index
                #data_input = self.data.data[(ran_num + i) % num_data]
                data_input, data_labels = self.data.get_feature_label()
                data_input = data_input.reshape(data_input.shape[0],
                                                data_input.shape[1], 1)
                #data_labels = self.data.label[(ran_num + i) % num_data]
                # Handle malformed data: begin
                # If an input wav file is too long, skip it and use the next file instead.
                num_bias = 0
                while (data_input.shape[0] > self.AUDIO_LENGTH):
                    print('*[Error]', 'wave data length of sample',
                          (ran_num + i) % num_data, 'is too long.',
                          '\n An exception was raised while testing the speech model.')
                    num_bias += 1
                    # data_input, data_labels = data.GetData((ran_num + i + num_bias) % num_data)  # take consecutive samples starting from the random index
                    data_input, data_labels = self.data.get_feature_label()
                    #data_input = self.data.data[(ran_num + i + num_bias) % num_data]
                    #data_labels = self.data.label[(ran_num + i + num_bias) % num_data]
                # Handle malformed data: end

                pre = self.Predict(data_input, data_input.shape[0] // 8)

                words_n = len(data_labels)  # number of symbols in this sentence
                words_num += words_n  # running total of symbols
                edit_distance = GetEditDistance(data_labels, pre)  # edit distance between reference and prediction
                if (edit_distance <= words_n):  # edit distance is at most the sentence length
                    word_error_num += edit_distance  # count the edit distance as the number of wrong symbols
                else:  # otherwise the prediction added a pile of spurious symbols
                    word_error_num += words_n  # so just count the whole sentence as wrong

                if (i % 10 == 0 and show_ratio == True):
                    print('Test Count: ', i, '/', data_count)

                txt = ''
                if (out_report == True):
                    txt += str(i) + '\n'
                    txt += 'True:\t' + str(data_labels) + '\n'
                    txt += 'Pred:\t' + str(pre) + '\n'
                    txt += '\n'
                    txt_obj.write(txt)

            # print('*[Test Result] speech recognition ' + str_dataset + ' set word error ratio:', word_error_num / words_num * 100, '%')
            print(
                '*[Test Result] Speech Recognition ' + dataset +
                ' set word error ratio: ', word_error_num / words_num * 100,
                '%')
            if (out_report == True):
                txt = '*[Test Result] Speech Recognition ' + dataset + ' set word error ratio: ' + str(
                    word_error_num / words_num * 100) + ' %'
                txt_obj.write(txt)
                txt_obj.close()

            return word_error_num / words_num * 100

        except StopIteration:
            print('[Error] Model Test Error. Please check the data format.')

    def Predict(self, data_input, input_len):
        '''
        Run recognition on one utterance.
        Returns the recognized sequence of pinyin symbols (as label indices).
        '''
        data_input = data_input.reshape(data_input.shape[0],
                                        data_input.shape[1], 1)
        batch_size = 1
        in_len = np.zeros((batch_size), dtype=np.int32)

        in_len[0] = input_len

        x_in = np.zeros(
            (batch_size, self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1),
            dtype=np.float32)  # np.float is gone from recent NumPy; float32 matches the model input

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)

        # print('base_pred:\n', base_pred)

        # y_p = base_pred
        # for j in range(200):
        #	mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
        #	print('max y_p:',np.max(y_p[0][j]),'min y_p:',np.min(y_p[0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][j][100])
        #	print('argmin:',np.argmin(y_p[0][j]),'argmax:',np.argmax(y_p[0][j]))
        #	count=0
        #	for i in range(y_p[0][j].shape[0]):
        #		if(y_p[0][j][i] < mean):
        #			count += 1
        #	print('count:',count)

        #base_pred = base_pred[:, :, :]
        base_pred = base_pred[:, 2:, :]

        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=100,
                         top_paths=1)
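        # K.ctc_decode returns (decoded_sequences, log_probabilities);
        # r[0][0] is the tensor of label indices for the single best path.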

        # print('r', r)

        r1 = K.get_value(r[0][0])
        # print('r1', r1)

        # r2 = K.get_value(r[1])
        # print(r2)

        r1 = r1[0]

        return r1
        pass
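
GetEditDistance, used by TestModel above, is not part of these snippets. A minimal sketch of a compatible helper, assuming it simply returns the Levenshtein distance between the reference label sequence and the decoded sequence (the actual project may compute it differently, e.g. via the python-Levenshtein package):

def GetEditDistance(reference, hypothesis):
    # Classic dynamic-programming Levenshtein distance over symbol sequences.
    m, n = len(reference), len(hypothesis)
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dist[i][0] = i
    for j in range(n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return dist[m][n]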

Example #6
class ModelSpeech():
    def __init__(self, corpus):
        MS_OUTPUT_SIZE = 1424
        self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE
        self.label_max_string_length = 64
        self.AUDIO_LENGTH = 1700
        self.AUDIO_FEATURE_LENGTH = 80
        self.datapath = 'feature'
        self._model, self.base_model = self.CreateModel()
        self.corpus = corpus

    def CreateModel(self):
        '''
        Define the CNN/LSTM/CTC model using the Keras functional API
        (the recurrent variant is kept below as a commented-out block).
        Input layer: sequence of 80-dimensional feature vectors; one utterance is padded to at most 1700 frames (about 17 s).
        Hidden layers: convolution + pooling blocks with 3x3 kernels and 2x2 pooling windows.
        Hidden layer: fully connected layer.
        Output layer: fully connected layer with self.MS_OUTPUT_SIZE units and a softmax activation.
        CTC layer: uses the CTC loss as the loss function for the connectionist temporal classification outputs.
        '''

        '''

        input_data = Input(name='the_input', shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1))

        layer_h1 = Conv2D(32, (3, 3), use_bias=False, activation='relu', padding='same',
                          kernel_initializer='he_normal')(input_data)  # convolution layer
        # layer_h1 = Dropout(0.05)(layer_h1)
        layer_h2 = Conv2D(32, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h1)  # convolution layer
        layer_h3 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h2)  # pooling layer

        # layer_h3 = Dropout(0.05)(layer_h3) # randomly drop some connections to prevent overfitting
        layer_h4 = Conv2D(64, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h3)  # convolution layer
        # layer_h4 = Dropout(0.1)(layer_h4)
        layer_h5 = Conv2D(64, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h4)  # convolution layer
        layer_h6 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h5)  # pooling layer

        # layer_h6 = Dropout(0.1)(layer_h6)
        layer_h7 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                          kernel_initializer='he_normal')(layer_h6)  # convolution layer
        # layer_h7 = Dropout(0.15)(layer_h7)
        layer_h8 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                          kernel_initializer='he_normal')(layer_h7)  # convolution layer
        layer_h9 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h8)  # pooling layer

        # layer_h9 = Dropout(0.15)(layer_h9)
        layer_h10 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h9)  # convolution layer
        # layer_h10 = Dropout(0.2)(layer_h10)
        layer_h11 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h10)  # convolution layer
        layer_h12 = MaxPooling2D(pool_size=1, strides=None, padding="valid")(layer_h11)  # pooling layer

        # layer_h12 = Dropout(0.2)(layer_h12)
        layer_h13 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h12)  # convolution layer
        # layer_h13 = Dropout(0.3)(layer_h13)
        layer_h14 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h13)  # convolution layer
        layer_h15 = MaxPooling2D(pool_size=1, strides=None, padding="valid")(layer_h14)  # pooling layer

        # test=Model(inputs = input_data, outputs = layer_h12)
        # test.summary()
        # print(layer_h15.shape)

        layer_h16 = Reshape((212, 1280))(layer_h15)  # reshape layer

        # layer_h16 = Dropout(0.3)(layer_h16) # randomly drop some connections to prevent overfitting
        layer_h17 = Dense(128, activation="relu", use_bias=True, kernel_initializer='he_normal')(layer_h16)  # fully connected layer

        inner = layer_h17
        # layer_h5 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h4) # LSTM layer

        rnn_size = 128
        gru_1 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru1')(inner)
        gru_1b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru1_b')(
            inner)
        gru1_merged = add([gru_1, gru_1b])
        gru_2 = GRU(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='gru2')(gru1_merged)
        gru_2b = GRU(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='gru2_b')(
            gru1_merged)

        gru2 = concatenate([gru_2, gru_2b])

        layer_h20 = gru2
        # layer_h20 = Dropout(0.4)(gru2)
        layer_h21 = Dense(128, activation="relu", use_bias=True, kernel_initializer='he_normal')(layer_h20)  # fully connected layer

        # layer_h17 = Dropout(0.3)(layer_h17)
        layer_h22 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, kernel_initializer='he_normal')(layer_h21)  # fully connected layer

        y_pred = Activation('softmax', name='Activation0')(layer_h22)
        model_data = Model(inputs=input_data, outputs=y_pred)
        # model_data.summary()

        
        labels = Input(name='the_labels', shape=[self.label_max_string_length], dtype='float32')
        input_length = Input(name='input_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')
        # Keras doesn't currently support loss funcs with extra parameters
        # so CTC loss is implemented in a lambda layer

        # layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')([y_pred, labels, input_length, label_length])#(layer_h6) # CTC
        loss_out = Lambda(self.ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])

        model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

        model.summary()

        # clipnorm seems to speed up convergence
        # sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
        # ada_d = Adadelta(lr = 0.01, rho = 0.95, epsilon = 1e-06)
        opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, decay=0.0, epsilon=10e-8)
        # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

        model.build((self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1))
        model = ParallelModel(model, NUM_GPU)

        model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=opt)

        # captures output of softmax so we can decode the output during visualization
        test_func = K.function([input_data], [y_pred])

        print('[*Info] Model created and compiled successfully.')
        return model, model_data
        
        '''

        input_data = Input(name='the_input', shape=(self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1))

        layer_h1 = Conv2D(32, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            input_data)  # convolution layer
        layer_h1 = Dropout(0.1)(layer_h1)
        layer_h2 = Conv2D(32, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h1)  # convolution layer
        layer_h3 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h2)  # pooling layer
        # layer_h3 = Dropout(0.2)(layer_h2) # randomly drop some connections to prevent overfitting
        layer_h3 = Dropout(0.1)(layer_h3)
        layer_h4 = Conv2D(64, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h3)  # convolution layer
        layer_h4 = Dropout(0.2)(layer_h4)
        layer_h5 = Conv2D(64, (3, 3), use_bias=True, activation='relu', padding='same', kernel_initializer='he_normal')(
            layer_h4)  # convolution layer
        layer_h6 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h5)  # pooling layer

        layer_h6 = Dropout(0.2)(layer_h6)
        layer_h7 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                          kernel_initializer='he_normal')(layer_h6)  # convolution layer
        layer_h7 = Dropout(0.3)(layer_h7)
        layer_h8 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                          kernel_initializer='he_normal')(layer_h7)  # convolution layer
        layer_h9 = MaxPooling2D(pool_size=2, strides=None, padding="valid")(layer_h8)  # pooling layer

        layer_h9 = Dropout(0.3)(layer_h9)
        layer_h10 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h9)  # convolution layer
        layer_h10 = Dropout(0.4)(layer_h10)
        layer_h11 = Conv2D(128, (3, 3), use_bias=True, activation='relu', padding='same',
                           kernel_initializer='he_normal')(layer_h10)  # convolution layer
        layer_h12 = MaxPooling2D(pool_size=1, strides=None, padding="valid")(layer_h11)  # pooling layer

        # test=Model(inputs = input_data, outputs = layer_h12)
        ## test.summary()

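        # 1700x80x1 input -> three 2x2 poolings and one 1x1 pooling leave a
        # 212x10x128 feature map, i.e. 212 time steps of 10 * 128 = 1280 values,
        # which is what the Reshape below flattens each frame into.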
        layer_h10 = Reshape((212, 1280))(layer_h12)  # reshape layer
        # layer_h5 = LSTM(256, activation='relu', use_bias=True, return_sequences=True)(layer_h4) # LSTM layer
        # layer_h6 = Dropout(0.2)(layer_h5) # randomly drop some connections to prevent overfitting
        layer_h10 = Dropout(0.4)(layer_h10)
        layer_h11 = Dense(128, activation="relu", use_bias=True, kernel_initializer='he_normal')(layer_h10)  # fully connected layer
        layer_h11 = Dropout(0.5)(layer_h11)
        layer_h12 = Dense(self.MS_OUTPUT_SIZE, use_bias=True, kernel_initializer='he_normal')(layer_h11)  # fully connected layer

        y_pred = Activation('softmax', name='Activation0')(layer_h12)
        model_data = Model(inputs=input_data, outputs=y_pred)
        # model_data.summary()

        labels = Input(name='the_labels', shape=[self.label_max_string_length], dtype='float32')
        input_length = Input(name='input_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')
        # Keras doesn't currently support loss funcs with extra parameters
        # so CTC loss is implemented in a lambda layer

        # layer_out = Lambda(ctc_lambda_func,output_shape=(self.MS_OUTPUT_SIZE, ), name='ctc')([y_pred, labels, input_length, label_length])#(layer_h6) # CTC
        loss_out = Lambda(self.ctc_lambda_func, output_shape=(1,), name='ctc')(
            [y_pred, labels, input_length, label_length])

        model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

        # model.summary()

        # clipnorm seems to speed up convergence
        # sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
        ada_d = Adadelta(lr=0.01, rho=0.95, epsilon=1e-06)

        # model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)
        model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=ada_d)

        # captures output of softmax so we can decode the output during visualization
        test_func = K.function([input_data], [y_pred])

        print('[*Info] Model created and compiled successfully.')
        return model, model_data



    def ctc_lambda_func(self, args):
        y_pred, labels, input_length, label_length = args
        y_pred = y_pred[:, :, :]
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


    def TrainModel(self, epoch=10, save_step=1000, batch_size=32):

        self.data = DataSpeech('../feature', 'thchs30', 'train', self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        trainGen = self.data.data_generator(batch_size)
        trainNum = self.data.getDataNum()


        dev_data = DataSpeech('../feature', 'thchs30', 'dev', self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        devGen = dev_data.data_generator(batch_size)
        devNum = dev_data.getDataNum()
        #g = devGen
        #x, y = next(g)

        cb = MyCallback()

        print('[*INFO] Training the Model')
        H = self._model.fit_generator(
            trainGen,
            steps_per_epoch=trainNum // batch_size,
            validation_data=devGen,
            validation_steps=devNum // batch_size,
            epochs=epoch,
            callbacks=[cb]
            )

        print('[*INFO] Evaluating the Model')
        '''
        predIdex = self._model.predict_generator(
            devGen,
            steps=((devNum // batch_size)+1)
        )
        '''
        #print(classification_report(x[1], predIdex, target_names=lb.classes_))
        self.TestModel(self.datapath, dataset='dev', data_count=4)


    '''
    def TrainModel(self, epoch=10, save_step=1000, batch_size=32):
        self.data = DataSpeech('../feature', 'thchs30', 'train',  self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        yielddatas = self.data.data_generator(batch_size)
        for epoch in range(epoch):  # number of training epochs
            print('[running] train epoch %d .' % epoch)
            n_step = 0  # number of save_step-sized chunks trained so far
            while True:
                try:
                    print('[message] epoch %d . Have trained %d+ steps' % (epoch, n_step * save_step))
                    # data_generator is a generator function

                    # self._model.fit_generator(yielddatas, save_step, nb_worker=2)
                    self._model.fit_generator(yielddatas, save_step)
                    n_step += 1
                except StopIteration:
                    print('[error] generator error. please check data format.')
                    break

                self.SaveModel(comment='_e_' + str(epoch) + '_step_' + str(n_step * save_step))
                self.TestModel(self.datapath, dataset='train', data_count=4)
                self.TestModel(self.datapath, dataset='dev', data_count=4)
            end = time.time()
            print(str(epoch) + "epochs total running time(min):", (end - start) // 60)
    '''

    '''
    def LoadModel(self, filename=abspath + 'model_speech/m' + ModelName + '/speech_model' + ModelName + '.model'):
    
        # Load the model weights
        
        self._model.load_weights(filename)
        self.base_model.load_weights(filename + '.base')
    '''

    def SaveModel(self, filename='model_speech/' + ModelName + '/' + ModelName, comment=''):
        '''
        Save the model weights.
        '''
        if not os.path.exists('model_speech/' + str(ModelName)):
            os.makedirs('model_speech/' + str(ModelName))
        self._model.save_weights(filename + comment + '.model')
        self.base_model.save_weights(filename + comment + '.model.base')
        f = open('step' + ModelName + '.txt', 'w')
        f.write(filename + comment)
        f.close()

    def TestModel(self, datapath='../feature', dataset='dev', data_count=32, out_report=False, show_ratio=True):
        '''
        Evaluate the model on the given dataset.
        '''
        self.data = DataSpeech('../feature', 'thchs30', dataset, self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH)
        #data = DataSpeech(datapath, self.corpus, dataset)
        # data.LoadDataList(str_dataset)
        num_data = self.data.getDataNum()  # number of samples in the dataset
        if (data_count <= 0 or data_count > num_data):  # if data_count is <= 0 or larger than the dataset size, test on all of the data
            data_count = num_data

        try:
            ran_num = random.randint(0, num_data - 1)  # pick a random starting index

            words_num = 0
            word_error_num = 0

            nowtime = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time()))
            if (out_report == True):
                txt_obj = open('Test_Report_' + dataset + '_' + nowtime + '.txt', 'w', encoding='UTF-8')  # open the report file for writing

            txt = ''
            for i in range(data_count):
                #data_input, data_labels = data.GetData((ran_num + i) % num_data)  # take consecutive samples starting from the random index
                data_input = self.data.data[(ran_num + i) % num_data]
                data_input = data_input.reshape(data_input.shape[0], data_input.shape[1], 1)
                data_labels = self.data.label[(ran_num + i) % num_data]
                # Handle malformed data: begin
                # If an input wav file is too long, skip it and use the next file instead.
                num_bias = 0
                while (data_input.shape[0] > self.AUDIO_LENGTH):
                    print('*[Error]', 'wave data length of sample', (ran_num + i) % num_data, 'is too long.',
                          '\n An exception was raised while testing the speech model.')
                    num_bias += 1
                    #data_input, data_labels = data.GetData((ran_num + i + num_bias) % num_data)  # take consecutive samples starting from the random index
                    data_input = self.data.data[(ran_num + i + num_bias) % num_data]
                    data_labels = self.data.label[(ran_num + i + num_bias) % num_data]
                # Handle malformed data: end

                pre = self.Predict(data_input, data_input.shape[0] // 8)

                words_n = len(data_labels)  # number of symbols in this sentence
                words_num += words_n  # running total of symbols
                edit_distance = GetEditDistance(data_labels, pre)  # edit distance between reference and prediction
                if (edit_distance <= words_n):  # edit distance is at most the sentence length
                    word_error_num += edit_distance  # count the edit distance as the number of wrong symbols
                else:  # otherwise the prediction added a pile of spurious symbols
                    word_error_num += words_n  # so just count the whole sentence as wrong

                if (i % 10 == 0 and show_ratio == True):
                    print('Test Count: ', i, '/', data_count)

                txt = ''
                if (out_report == True):
                    txt += str(i) + '\n'
                    txt += 'True:\t' + str(data_labels) + '\n'
                    txt += 'Pred:\t' + str(pre) + '\n'
                    txt += '\n'
                    txt_obj.write(txt)

            # print('*[Test Result] speech recognition ' + str_dataset + ' set word error ratio:', word_error_num / words_num * 100, '%')
            print('*[Test Result] Speech Recognition ' + dataset + ' set word error ratio: ',
                  word_error_num / words_num * 100, '%')
            if (out_report == True):
                txt = '*[Test Result] Speech Recognition ' + dataset + ' set word error ratio: ' + str(word_error_num / words_num * 100) + ' %'
                txt_obj.write(txt)
                txt_obj.close()
            return word_error_num / words_num * 100

        except StopIteration:
            print('[Error] Model Test Error. Please check the data format.')


    def Predict(self, data_input, input_len):
        '''
        Run recognition on one utterance.
        Returns the recognized sequence of pinyin symbols (as label indices).
        '''

        batch_size = 1
        in_len = np.zeros((batch_size), dtype=np.int32)

        in_len[0] = input_len

        x_in = np.zeros((batch_size, self.AUDIO_LENGTH, self.AUDIO_FEATURE_LENGTH, 1), dtype=np.float32)  # np.float is gone from recent NumPy; float32 matches the model input

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)

        # print('base_pred:\n', base_pred)

        # y_p = base_pred
        # for j in range(200):
        #	mean = np.sum(y_p[0][j]) / y_p[0][j].shape[0]
        #	print('max y_p:',np.max(y_p[0][j]),'min y_p:',np.min(y_p[0][j]),'mean y_p:',mean,'mid y_p:',y_p[0][j][100])
        #	print('argmin:',np.argmin(y_p[0][j]),'argmax:',np.argmax(y_p[0][j]))
        #	count=0
        #	for i in range(y_p[0][j].shape[0]):
        #		if(y_p[0][j][i] < mean):
        #			count += 1
        #	print('count:',count)

        base_pred = base_pred[:, :, :]
        # base_pred =base_pred[:, 2:, :]

        r = K.ctc_decode(base_pred, in_len, greedy=True, beam_width=100, top_paths=1)

        # print('r', r)

        r1 = K.get_value(r[0][0])
        # print('r1', r1)

        # r2 = K.get_value(r[1])
        # print(r2)

        r1 = r1[0]

        return r1
        pass
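
A short usage sketch tying the pieces together. The corpus name, paths and training settings are placeholders, and it assumes the feature files and the project-local helpers (DataSpeech, MyCallback, GetEditDistance) are available as in the rest of the project:

if __name__ == '__main__':
    ms = ModelSpeech('thchs30')          # builds and compiles the CTC model
    ms.TrainModel(epoch=10, batch_size=32)
    ms.SaveModel(comment='_demo')
    wer = ms.TestModel(datapath='../feature', dataset='dev', data_count=32)
    print('dev word error ratio: %s %%' % wer)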