def TrainModel(self,
                   datapath,
                   epoch=2,
                   save_step=1000,
                   filename='model_speech/LSTM_CNN_model'):
        '''
        Train the model.

        Parameters:
            datapath: path where the data is stored
            epoch: number of training epochs
            save_step: how many steps between model saves
            filename: default save file name, without extension
        '''
        data = DataSpeech(datapath)
        data.LoadDataList('train')
        # BUG FIX: GetDataNum is an instance method; call it on the loaded
        # `data` object, not on the DataSpeech class.
        num_data = data.GetDataNum()  # number of available samples
        for epoch_idx in range(epoch):  # renamed to avoid shadowing the `epoch` parameter
            print('[running] train epoch %d .' % epoch_idx)
            n_step = 0  # number of save_step-sized chunks trained so far
            while True:
                try:
                    print('[message] epoch %d . Have train datas %d+' %
                          (epoch_idx, n_step * save_step))
                    # data_genetator is a generator function
                    yielddatas = data.data_genetator(self.BATCH_SIZE)
                    self._model.fit_generator(yielddatas,
                                              save_step,
                                              nb_worker=2)
                    n_step += 1
                except StopIteration:
                    print('[error] generator error. please check data format.')
                    break

                self.SaveModel(comment='_e_' + str(epoch_idx) + '_step_' +
                               str(n_step))
    def TestModel(self, datapath='', str_dataset='dev', data_count=32):
        '''
        Evaluate the model: compute the per-symbol error rate on a dataset.

        Parameters:
            datapath: path to the data; falls back to self.datapath when empty
            str_dataset: which data list to use ('train' / 'dev' / 'test')
            data_count: number of samples to test; <=0 or more than available
                        means use the whole dataset
        '''
        # BUG FIX: the datapath argument was silently ignored; honour it and
        # fall back to self.datapath when empty (backward compatible, since
        # the default '' is falsy).
        data = DataSpeech(datapath if datapath else self.datapath, str_dataset)
        num_data = data.GetDataNum()  # number of available samples
        if data_count <= 0 or data_count > num_data:
            # out-of-range request: test on the full dataset
            data_count = num_data

        try:
            ran_num = random.randint(0, num_data - 1)  # random start index

            words_num = 0
            word_error_num = 0
            for i in range(data_count):
                # take data_count consecutive samples starting at the random
                # index, wrapping around the dataset
                data_input, data_labels = data.GetData(
                    (ran_num + i) % num_data)
                pre = self.Predict(data_input, data_input.shape[0] // 4)

                words_num += max(data_labels.shape[0], pre.shape[0])
                word_error_num += GetEditDistance(data_labels, pre)

            print('*[测试结果] 语音识别语音单字错误率:', word_error_num / words_num * 100,
                  '%')
        except StopIteration:
            print('[Error] Model Test Error. please check data format.')
Beispiel #3
0
    def TestModel(self, datapath='', str_dataset='dev', data_count=32):
        '''
        Evaluate the model with Keras' evaluate_generator on one batch.

        Parameters:
            datapath: path to the data; falls back to self.datapath when empty
            str_dataset: which data list to use ('train' / 'dev' / 'test')
            data_count: batch size for the single evaluation step; <=0 or more
                        than available means use the whole dataset
        '''
        # BUG FIX: the datapath argument was silently ignored; honour it and
        # fall back to self.datapath when empty (backward compatible, since
        # the default '' is falsy).
        data = DataSpeech(datapath if datapath else self.datapath, str_dataset)
        num_data = data.GetDataNum()  # number of available samples
        if data_count <= 0 or data_count > num_data:
            # out-of-range request: test on the full dataset
            data_count = num_data

        try:
            gen = data.data_genetator(data_count)
            r = self._model.evaluate_generator(generator=gen,
                                               steps=1,
                                               max_queue_size=data_count,
                                               workers=1,
                                               use_multiprocessing=False)
            print(r)
        except StopIteration:
            print('[Error] Model Test Error. please check data format.')
    def TestModel(self, datapath, str_dataset='dev'):
        '''
        Evaluate the model on one batch covering the whole dataset.

        Parameters:
            datapath: path where the data is stored
            str_dataset: which data list to use ('train' / 'dev' / 'test')
        '''
        data = DataSpeech(datapath)
        data.LoadDataList(str_dataset)
        # BUG FIX: GetDataNum is an instance method; call it on `data`,
        # not on the DataSpeech class.
        num_data = data.GetDataNum()  # number of available samples
        try:
            gen = data.data_genetator(num_data)
            # BUG FIX: a generator must be advanced with next(); the original
            # `X, y = gen` tried to unpack the generator object itself.
            X, y = next(gen)
            r = self._model.test_on_batch(X, y)
            print(r)
        except StopIteration:
            print('[Error] Model Test Error. please check data format.')
Beispiel #5
0
    def TrainModel(self,
                   datapath='',
                   epoch=2,
                   batch_size=32,
                   save_step=1000,
                   filename='model_speech/speech_model'):
        '''
        Train the model, periodically saving and evaluating it.

        Parameters:
            datapath: path where the data is stored (NOTE(review): currently
                      unused — self.datapath is used instead; confirm intent)
            epoch: number of training epochs
            batch_size: samples per training step
            save_step: how many steps between model saves
            filename: default save file name, without extension (unused here)
        '''
        data = DataSpeech(self.datapath, 'train', LoadToMem=False)
        num_data = data.DataNum  # number of training samples
        for epoch_idx in range(epoch):  # renamed to avoid shadowing the `epoch` parameter
            print('[running] train epoch %d .' % epoch_idx)
            n_step = 0  # number of save_step-sized chunks trained in this epoch
            while (n_step * save_step * batch_size < num_data):
                try:
                    print('[message] epoch %d . Have train datas %d * %d+' %
                          (epoch_idx, batch_size, n_step * save_step))
                    # data_genetator is a generator function
                    yielddatas = data.data_genetator(batch_size,
                                                     self.AUDIO_LENGTH)
                    self._model.fit_generator(yielddatas, save_step)
                    n_step += 1
                except StopIteration:
                    print('[error] generator error. please check data format.')
                    break

                self.SaveModel(comment='_e_' + str(epoch_idx) + '_step_' +
                               str(n_step * save_step))
                # quick sanity evaluation on a few train and dev samples
                self.TestModel(self.datapath,
                               str_dataset='train',
                               data_count=16)
                self.TestModel(self.datapath, str_dataset='dev', data_count=16)
    def __init__(self, datapath):
        '''
        Initialize the speech model and load the data helper.

        NOTE(review): the original docstring claimed an output size of 1283
        (1282 pinyin + 1 blank), but the code sets 1417 — presumably the
        symbol table grew; confirm against the data's symbol list.

        Parameters:
            datapath: path where the data is stored
        '''
        MS_OUTPUT_SIZE = 1417
        self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE  # dimension of each output character vector
        #self.BATCH_SIZE = BATCH_SIZE # batch size for one training step
        self.label_max_string_length = 64  # maximum label length in symbols
        self.AUDIO_LENGTH = 1600  # input time steps fed to the network
        self.AUDIO_FEATURE_LENGTH = 200  # features per time step
        self._model = self.CreateModel()

        self.data = DataSpeech(datapath)
Beispiel #7
0
    def TestModel(self,filename,test_size=-1,type='both'):
        '''
        Compute the overall pinyin (token) error rate of the model.

        Parameters:
            filename: base name of the saved weights; directory and the
                      '_weights_data_120.h5' suffix are added here
            test_size: samples to evaluate per dataset; -1 (or any
                       out-of-range value) means evaluate everything
            type: one of 'train', 'test', 'dev', 'both' (both = dev + test)
        Raises:
            TypeError: when `type` is not one of the accepted values
        '''
        deal_batch=16  # evaluation batch size
        if type not in ['both','test','dev','train']:
            raise TypeError('type 不对 应为 both 或 test 或 dev 或 train')
        path=os.path.join(self.save_path,filename+'_weights_data_120.h5')
        self.model_data.load_weights(path)
        speech_datas=[]
        if type=='both':
            speech_datas.append(DataSpeech(self.relpath,'dev'))
            speech_datas.append(DataSpeech(self.relpath,'test'))
        else:
            speech_datas.append(DataSpeech(self.relpath,type))

        def _eval_chunk(speech_data,start,end,batch):
            # Predict one batch of samples [start, end) and return
            # (total edit distance, total reference pinyin count).
            X=np.zeros((batch,1600,200,1),dtype=np.float64)
            input_length=np.zeros((batch),dtype=np.int32)
            data_output_list=[]
            for j,k in zip(range(0,batch),range(start,end)):
                data_input,data_output=speech_data.GetData(k)
                data_output_list.append(data_output)
                X[j,0:len(data_input)]=data_input
                # time axis is downsampled by 8; cap at the 200-step output
                input_length[j]=min(data_input.shape[0]//8+data_input.shape[0]%8,200)
            temp=self.model_data.predict(X)
            # greedy=False might decode slightly better but is far too slow
            pinyin_pre_decode=K.ctc_decode(temp,input_length,greedy=True,beam_width=100,top_paths=1,merge_repeated=False)
            pinyin_pre=K.eval(pinyin_pre_decode[0][0])
            distance=0
            pinyin_count=0
            for j in range(0,batch):
                pinyin_pre_temp=speech_data.num2symbol(pinyin_pre[j])
                pinyin_real=speech_data.num2symbol(data_output_list[j])
                distance+=GetEditDistance_pinyin(pinyin_pre_temp,pinyin_real)
                pinyin_count+=len(pinyin_real)
            return distance,pinyin_count

        total_distance=0
        total_pinyin=0
        for speech_data in speech_datas:
            all_test=speech_data.DataNum_Total
            # BUG FIX: the original used bitwise '&' here; because '&' binds
            # tighter than comparisons, the upper bound (test_size <= all_test)
            # was never actually enforced. Use a chained comparison instead.
            test_size=test_size if 0<test_size<=all_test else all_test
            total_count=test_size//deal_batch
            remainder=test_size%deal_batch
            from tqdm import tqdm
            print('评估计算中~')
            for i in tqdm(range(0,total_count)):
                d,c=_eval_chunk(speech_data,i*deal_batch,(i+1)*deal_batch,deal_batch)
                total_distance+=d
                total_pinyin+=c
                if i%10==0:
                    # periodically rebuild the session to keep TF memory in check
                    K.clear_session()
                    self.CreateModel()
                    self.model_data.load_weights(path)

            if remainder!=0:
                # handle the final partial batch with a fresh session
                K.clear_session()
                self.CreateModel()
                self.model_data.load_weights(path)
                d,c=_eval_chunk(speech_data,total_count*deal_batch,test_size,remainder)
                total_distance+=d
                total_pinyin+=c

        print('total_WER:'+str(total_distance/total_pinyin))
Beispiel #8
0
    def TrainModel(self,filename,batch_size=32,epochs=50,save_epoch=1):
        '''
        Train the CTC model, resuming from previously saved state if present.

        Parameters:
            filename: base name for saved weights/state files, e.g. model_lz
            batch_size: samples per training step
            epochs: target total epoch count (Keras resumes at initial_epoch)
            save_epoch: how often (in epochs) the LossHistory callback saves
        Raises:
            LZ_Error: if the data generator stops unexpectedly
        '''
        # Restore the epoch counter from a previous run, if any.
        self.epoch_all=0
        if os.path.exists(os.path.join(self.save_path,filename+'_epochs.pkl')):
            with open(os.path.join(self.save_path,filename+'_epochs.pkl'),'rb') as f:
                self.epoch_all=pickle.load(f)
            print('加载epochs数',self.epoch_all)
        else:
            print('未加载epochs数,设为',self.epoch_all)

        # Load existing model weights matching the restored epoch, if present.
        if os.path.exists(os.path.join(self.save_path,filename+'_weights_ctc_'+str(self.epoch_all)+'.h5')):
            self.model_ctc.load_weights(os.path.join(self.save_path,filename+'_weights_ctc_'+str(self.epoch_all)+'.h5'))

        # Learning rate tuned by hand over several runs (earlier values tried:
        # 1e-4, 5e-5, 2e-5, 1e-5, 5e-6); the dead reassignment chain is gone.
        # https://machinelearningmastery.com/using-learning-rate-schedules-deep-learning-models-python-keras/
        lr=0.000004
        # Adam's built-in `decay` felt unreliable here; if decay is wanted,
        # prefer the callback-based schedule below (`reduce_lr`).
        opt=Adam(lr=lr)
        #https://blog.csdn.net/zzc15806/article/details/79711114
        from keras.callbacks import LearningRateScheduler

        def scheduler(epoch):
            # Halve the learning rate every 2 epochs.
            return lr*0.5**(epoch//2)

        reduce_lr = LearningRateScheduler(scheduler,verbose=1)  # defined but not enabled yet

        def ctc_loss(y_true,y_pred):
            # The model's 'ctc' output already *is* the loss value.
            return y_pred
        # NOTE(review): 'accuracy' is meaningless on the ctc-loss output.
        self.model_ctc.compile(loss={'ctc' : ctc_loss},optimizer=opt,metrics = ['accuracy'])

        speech_datas=DataSpeech(self.relpath,'train')
        data_nums=speech_datas.DataNum_Total
        yield_datas=speech_datas.speechmodel_generator(batch_size,self.AUDIO_LENGTH,self.STRING_LENGTH)

        print("[提示QAQ]一个epoch的数据量为%d"%data_nums)
        try:
            # LossHistory saves weights/state; its filename must match ours,
            # otherwise saving breaks.
            hist=LossHistory(save_filename=filename,model_ctc=self.model_ctc,model_data=self.model_data,save_epoch=save_epoch)
            self.model_ctc.fit_generator(generator=yield_datas,
                                            steps_per_epoch=data_nums//batch_size+1 if data_nums%batch_size!=0 else data_nums//batch_size,  # last step may be smaller than batch_size
                                            epochs=epochs,
                                            verbose=1,
                                            callbacks=[hist],
                                            initial_epoch=self.epoch_all
                                            )

        except StopIteration:
            raise LZ_Error("[错误QAQ]貌似生成的数据格式有点问题??")

        self.SaveModel(filename)  # persist everything
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        Recognize one wav signal (final speech-recognition entry point).

        NOTE(review): the original author marked this as still buggy; the
        hard-coded data path below and the np.array() over a ragged list
        near the end are the likely culprits — verify before relying on it.

        Parameters:
            wavsignal: raw waveform samples
            fs: sampling rate of the waveform
        Returns:
            [labels, pred]: placeholder label list and raw model output
        '''

        #data = self.data
        # NOTE(review): hard-coded dataset path — presumably a leftover from
        # the author's machine; should use self.data / self.datapath instead.
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # Compute the input feature
        #data_input = data.GetMfccFeature(wavsignal, fs)
        data_input = data.GetFrequencyFeature(wavsignal, fs)

        arr_zero = np.zeros((1, 200), dtype=np.int16)  # an all-zero row vector (padding; only used by the commented-out code)

        #import matplotlib.pyplot as plt
        #plt.subplot(111)
        #plt.imshow(data_input, cmap=plt.get_cmap('gray'))
        #plt.show()

        #while(len(data_input)<1600): # pad the input up to 1600 frames
        #	data_input = np.row_stack((data_input,arr_zero))
        #print(len(data_input))

        list_symbol = data.list_symbol  # the pinyin symbol list

        labels = [list_symbol[0]]  # placeholder label: just the first symbol
        #while(len(labels) < 64):
        #	labels.append('')

        labels_num = []
        for i in labels:
            labels_num.append(data.SymbolToNum(i))

        data_input = np.array(data_input, dtype=np.int16)
        data_input = data_input.reshape(data_input.shape[0],
                                        data_input.shape[1])

        labels_num = np.array(labels_num, dtype=np.int16)
        labels_num = labels_num.reshape(labels_num.shape[0])

        # CTC input length: feature frames downsampled by the conv stack
        input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
        input_length = np.array(input_length)
        input_length = input_length.reshape(input_length.shape[0])

        label_length = np.array([labels_num.shape[0]], dtype=np.int16)
        label_length = np.array(label_length)
        label_length = label_length.reshape(label_length.shape[0])

        # The four CTC-model inputs, in the order the training model expects
        x = [data_input, labels_num, input_length, label_length]
        #x = next(data.data_genetator(1, self.AUDIO_LENGTH))
        #x = kr.utils.np_utils.to_categorical(x)

        print(x)
        # NOTE(review): np.array over a ragged list of differently-shaped
        # arrays — this is likely where the acknowledged bug lives.
        x = np.array(x)

        pred = self._model.predict(x=x)
        #pred = self._model.predict_on_batch([data_input, labels_num, input_length, label_length])
        return [labels, pred]

        pass
    def RecognizeSpeech(self, wavsignal, fs):
        '''
        Recognize one wav signal and return the decoded symbol indices.

        Parameters:
            wavsignal: raw waveform samples
            fs: sampling rate of the waveform
        Returns:
            decoded symbol-index array from CTC greedy decoding
        '''

        # NOTE(review): hard-coded dataset path — presumably a leftover from
        # the author's machine; should use self.data / self.datapath instead.
        data = DataSpeech('E:\\语音数据集')
        data.LoadDataList('dev')
        # Compute the input feature
        data_input = data.GetFrequencyFeature(wavsignal, fs)
        # CTC input length: feature frames downsampled by 4 by the conv stack
        input_length = len(data_input)
        input_length = input_length // 4

        data_input = np.array(data_input, dtype=np.float)
        in_len = np.zeros((1), dtype=np.int32)
        print(in_len.shape)
        in_len[0] = input_length

        # Pad the feature matrix into a fixed-size (1, 1600, 200) batch
        batch_size = 1
        x_in = np.zeros((batch_size, 1600, 200), dtype=np.float)

        for i in range(batch_size):
            x_in[i, 0:len(data_input)] = data_input

        base_pred = self.base_model.predict(x=x_in)
        print('base_pred:\n', base_pred)

        # Drop the first two time steps before decoding — presumably to skip
        # leading convolution artefacts; confirm against the model definition.
        base_pred = base_pred[:, 2:, :]
        r = K.ctc_decode(base_pred,
                         in_len,
                         greedy=True,
                         beam_width=64,
                         top_paths=1)
        print('r', r)

        r1 = K.get_value(r[0][0])  # decoded index sequence
        print('r1', r1)

        print('r0', r[1])
        r2 = K.get_value(r[1])  # decoding log-probabilities (debug only)
        print(r2)
        print('解码完成')
        list_symbol_dic = data.list_symbol  # the pinyin symbol list (currently unused here)
        return r1