def TrainModel(self, datapath, epoch=2, save_step=1000, filename='model_speech/LSTM_CNN_model'):
    '''
    Train the model.

    Parameters:
        datapath: path where the training data is stored
        epoch: number of training epochs
        save_step: number of generator steps per fit_generator call / save cycle
        filename: default save file name, without extension
    '''
    data = DataSpeech(datapath)
    data.LoadDataList('train')
    # BUG FIX: GetDataNum is an instance method; the original called it on the
    # class (DataSpeech.GetDataNum()), which raises TypeError at runtime.
    num_data = data.GetDataNum()  # number of available samples
    # FIX: loop variable renamed so it no longer shadows the `epoch` parameter.
    for epoch_index in range(epoch):
        print('[running] train epoch %d .' % epoch_index)
        n_step = 0  # number of save_step-sized training rounds completed
        while True:
            try:
                print('[message] epoch %d . Have train datas %d+' % (epoch_index, n_step * save_step))
                # data_genetator is a generator function
                yielddatas = data.data_genetator(self.BATCH_SIZE)
                self._model.fit_generator(yielddatas, save_step, nb_worker=2)
                n_step += 1
            except StopIteration:
                print('[error] generator error. please check data format.')
                break
        self.SaveModel(comment='_e_' + str(epoch_index) + '_step_' + str(n_step))
def TestModel(self, datapath='', str_dataset='dev', data_count=32):
    '''
    Evaluate the model: compute the per-character word error rate over
    `data_count` samples drawn from `str_dataset`.

    Parameters:
        datapath: unused (self.datapath is used instead) — kept for interface
                  compatibility with callers
        str_dataset: which dataset split to evaluate ('dev', 'train', ...)
        data_count: number of samples; <=0 or oversized means "use all"
    '''
    data = DataSpeech(self.datapath, str_dataset)
    num_data = data.GetDataNum()  # number of available samples
    # When data_count is non-positive or larger than the dataset,
    # fall back to evaluating on everything.
    if data_count <= 0 or data_count > num_data:
        data_count = num_data
    try:
        ran_num = random.randint(0, num_data - 1)  # random start index
        words_num = 0
        word_error_num = 0
        for i in range(data_count):
            # take consecutive samples starting at the random index,
            # wrapping around the end of the dataset
            data_input, data_labels = data.GetData((ran_num + i) % num_data)
            pre = self.Predict(data_input, data_input.shape[0] // 4)
            words_num += max(data_labels.shape[0], pre.shape[0])
            word_error_num += GetEditDistance(data_labels, pre)
        # BUG FIX: guard against ZeroDivisionError when no words were counted
        # (e.g. data_count clamped to 0 on an empty dataset).
        if words_num > 0:
            print('*[测试结果] 语音识别语音单字错误率:', word_error_num / words_num * 100, '%')
        else:
            print('*[测试结果] 语音识别语音单字错误率:', 0.0, '%')
    except StopIteration:
        print('[Error] Model Test Error. please check data format.')
def TestModel(self, datapath='', str_dataset='dev', data_count=32):
    ''' Evaluate the model on a single generator batch and print the scores. '''
    speech_data = DataSpeech(self.datapath, str_dataset)
    total_available = speech_data.GetDataNum()  # samples in the chosen split
    # Clamp: a non-positive or oversized request means "use everything".
    if not (0 < data_count <= total_available):
        data_count = total_available
    try:
        batch_source = speech_data.data_genetator(data_count)
        scores = self._model.evaluate_generator(
            generator=batch_source,
            steps=1,
            max_queue_size=data_count,
            workers=1,
            use_multiprocessing=False)
        print(scores)
    except StopIteration:
        print('[Error] Model Test Error. please check data format.')
def TestModel(self, datapath, str_dataset='dev'):
    '''
    Evaluate the model with test_on_batch on one generated batch.

    Parameters:
        datapath: path where the data is stored
        str_dataset: which dataset split to evaluate
    '''
    data = DataSpeech(datapath)
    data.LoadDataList(str_dataset)
    # BUG FIX: GetDataNum is an instance method; the original called it on the
    # class (DataSpeech.GetDataNum()), which raises TypeError at runtime.
    num_data = data.GetDataNum()  # number of available samples
    try:
        gen = data.data_genetator(num_data)
        # BUG FIX: a generator must be advanced with next(); the original
        # `X, y = gen` tried to unpack the generator object itself and
        # would raise TypeError.
        X, y = next(gen)
        r = self._model.test_on_batch(X, y)
        print(r)
    except StopIteration:
        print('[Error] Model Test Error. please check data format.')
def TrainModel(self, datapath='', epoch=2, batch_size=32, save_step=1000, filename='model_speech/speech_model'):
    '''
    Train the model.

    Parameters:
        datapath: unused (self.datapath is used instead) — kept for interface
                  compatibility with callers
        epoch: number of training epochs
        batch_size: training batch size
        save_step: number of generator steps per fit_generator call / save cycle
        filename: default save file name, without extension
    '''
    data = DataSpeech(self.datapath, 'train', LoadToMem=False)
    num_data = data.DataNum  # number of available samples
    # FIX: loop variable renamed so it no longer shadows the `epoch` parameter.
    for epoch_index in range(epoch):
        print('[running] train epoch %d .' % epoch_index)
        n_step = 0  # number of save_step-sized training rounds completed
        # Stop once roughly one full pass over the data has been made.
        while n_step * save_step * batch_size < num_data:
            try:
                print('[message] epoch %d . Have train datas %d * %d+' % (epoch_index, batch_size, n_step * save_step))
                # data_genetator is a generator function
                yielddatas = data.data_genetator(batch_size, self.AUDIO_LENGTH)
                self._model.fit_generator(yielddatas, save_step)
                n_step += 1
            except StopIteration:
                print('[error] generator error. please check data format.')
                break
        self.SaveModel(comment='_e_' + str(epoch_index) + '_step_' + str(n_step * save_step))
        # Quick sanity evaluation on both splits after each epoch.
        self.TestModel(self.datapath, str_dataset='train', data_count=16)
        self.TestModel(self.datapath, str_dataset='dev', data_count=16)
def __init__(self, datapath):
    '''
    Initialize the model.

    The network's final output is a character vector of size MS_OUTPUT_SIZE
    (number of pinyin classes + 1 blank label for CTC).
    NOTE(review): the original docstring claimed 1283 (1282 pinyin + 1 blank),
    but MS_OUTPUT_SIZE is actually set to 1417 below — the stale number has
    been removed from this docstring; the code is unchanged.
    '''
    MS_OUTPUT_SIZE = 1417
    self.MS_OUTPUT_SIZE = MS_OUTPUT_SIZE  # size of each output character vector
    #self.BATCH_SIZE = BATCH_SIZE  # batch size for one training step
    self.label_max_string_length = 64  # maximum label (pinyin sequence) length
    self.AUDIO_LENGTH = 1600  # maximum number of audio frames per sample
    self.AUDIO_FEATURE_LENGTH = 200  # number of features per frame
    self._model = self.CreateModel()
    self.data = DataSpeech(datapath)
def TestModel(self, filename, test_size=-1, type='both'):
    '''
    Compute the overall pinyin error rate (edit distance / total pinyin count).

    Parameters:
        filename: name of the saved weights file (path and suffix added here)
        test_size: -1 (or any invalid value) means evaluate on all samples;
                   otherwise evaluate that many samples per selected split
        type: which split(s) to evaluate — 'train', 'test', 'dev', or
              'both' (test + dev)
    '''
    deal_batch = 16  # prediction batch size
    if type not in ['both', 'test', 'dev', 'train']:
        raise TypeError('type 不对 应为 both 或 test 或 dev 或 train')
    import tensorflow as tf  # NOTE(review): unused here, kept for side effects — confirm and remove
    path = os.path.join(self.save_path, filename + '_weights_data_120.h5')
    self.model_data.load_weights(path)
    speech_datas = []
    if type == 'both':
        speech_datas.append(DataSpeech(self.relpath, 'dev'))
        speech_datas.append(DataSpeech(self.relpath, 'test'))
    else:
        speech_datas.append(DataSpeech(self.relpath, type))
    total_distance = 0  # accumulated edit distance over all samples
    total_pinyin = 0    # accumulated reference pinyin count
    for speech_data in speech_datas:
        all_test = speech_data.DataNum_Total
        X = np.zeros((deal_batch, 1600, 200, 1), dtype=np.float64)
        input_length = np.zeros((deal_batch), dtype=np.int32)
        data_output_list = []
        # BUG FIX: the original used bitwise `&`, which binds tighter than the
        # comparisons — `test_size>0 & test_size<=all_test` parses as
        # `test_size > (0 & test_size) <= all_test`, so an oversized test_size
        # was never clamped to the dataset size and GetData would go out of range.
        test_size = test_size if test_size > 0 and test_size <= all_test else all_test
        total_count = test_size // deal_batch
        remainder = test_size % deal_batch
        from tqdm import tqdm
        print('评估计算中~')
        for i in tqdm(range(0, total_count)):
            start = i * deal_batch
            end = (i + 1) * deal_batch
            # Fill one prediction batch.
            for j, k in zip(range(0, deal_batch), range(start, end)):
                data_input, data_output = speech_data.GetData(k)
                data_output_list.append(data_output)
                X[j, 0:len(data_input)] = data_input
                input_length[j] = (min(data_input.shape[0] // 8 + data_input.shape[0] % 8, 200))
            temp = self.model_data.predict(X)
            # greedy=False might decode slightly better but is too slow
            pinyin_pre_decode = K.ctc_decode(temp, input_length, greedy=True, beam_width=100, top_paths=1, merge_repeated=False)
            pinyin_pre = K.eval(pinyin_pre_decode[0][0])
            for j in range(0, deal_batch):
                pinyin_pre_temp = speech_data.num2symbol(pinyin_pre[j])
                pinyin_real = speech_data.num2symbol(data_output_list[j])
                total_distance += GetEditDistance_pinyin(pinyin_pre_temp, pinyin_real)
                total_pinyin += len(pinyin_real)
            if i % 10 == 0:
                # Periodically rebuild the graph and reload weights to work
                # around TensorFlow memory growth during repeated decoding.
                K.clear_session()
                self.CreateModel()
                self.model_data.load_weights(path)
            data_output_list = []
        if remainder != 0:
            # Final partial batch.
            K.clear_session()
            self.CreateModel()
            self.model_data.load_weights(path)
            X = np.zeros((remainder, 1600, 200, 1), dtype=np.float64)
            input_length = np.zeros((remainder), dtype=np.int32)
            data_output_list = []
            start = total_count * deal_batch
            end = test_size
            for j, k in zip(range(0, remainder), range(start, end)):
                data_input, data_output = speech_data.GetData(k)
                data_output_list.append(data_output)
                X[j, 0:len(data_input)] = data_input
                input_length[j] = (min(data_input.shape[0] // 8 + data_input.shape[0] % 8, 200))
            temp = self.model_data.predict(X)
            pinyin_pre_decode = K.ctc_decode(temp, input_length, greedy=True, beam_width=100, top_paths=1, merge_repeated=False)
            pinyin_pre = K.eval(pinyin_pre_decode[0][0])
            for j in range(0, remainder):
                pinyin_pre_temp = speech_data.num2symbol(pinyin_pre[j])
                pinyin_real = speech_data.num2symbol(data_output_list[j])
                total_distance += GetEditDistance_pinyin(pinyin_pre_temp, pinyin_real)
                total_pinyin += len(pinyin_real)
    print('total_WER:' + str(total_distance / total_pinyin))
def TrainModel(self, filename, batch_size=32, epochs=50, save_epoch=1):
    '''
    Train the CTC model, resuming from a previous run when checkpoints exist.

    Parameters:
        filename: save file name (path and suffix are handled here), e.g. model_lz
        batch_size: training batch size
        epochs: target total number of epochs
        save_epoch: save interval (in epochs), forwarded to LossHistory
    '''
    # Restore the epoch counter from a previous run, if any.
    self.epoch_all = 0
    epochs_pkl = os.path.join(self.save_path, filename + '_epochs.pkl')
    if os.path.exists(epochs_pkl):
        with open(epochs_pkl, 'rb') as f:
            self.epoch_all = pickle.load(f)
        print('加载epochs数', self.epoch_all)
    else:
        print('未加载epochs数,设为', self.epoch_all)
    # Load existing model weights matching the restored epoch counter.
    weights_path = os.path.join(self.save_path, filename + '_weights_ctc_' + str(self.epoch_all) + '.h5')
    if os.path.exists(weights_path):
        self.model_ctc.load_weights(weights_path)
    # CLEANUP: the original assigned lr six times in a row
    # (1e-4, 5e-5, 2e-5, 1e-5, 5e-6, 4e-6); only the last value took effect.
    lr = 0.000004
    # Decay is better handled via a callback than Adam's decay argument.
    opt = Adam(lr=lr)
    from keras.callbacks import LearningRateScheduler

    def scheduler(epoch):
        # Halve the learning rate every 2 epochs.
        return lr * 0.5 ** (epoch // 2)

    reduce_lr = LearningRateScheduler(scheduler, verbose=1)  # currently unused

    def ctc_loss(y_true, y_pred):
        # The model graph already computes the CTC loss as its output.
        return y_pred

    # NOTE: the 'accuracy' metric is meaningless on the CTC-loss output.
    self.model_ctc.compile(loss={'ctc': ctc_loss}, optimizer=opt, metrics=['accuracy'])
    speech_datas = DataSpeech(self.relpath, 'train')
    data_nums = speech_datas.DataNum_Total
    yield_datas = speech_datas.speechmodel_generator(batch_size, self.AUDIO_LENGTH, self.STRING_LENGTH)
    print("[提示QAQ]一个epoch的数据量为%d" % data_nums)
    try:
        # LossHistory's save_filename must match `filename`, otherwise
        # checkpoints are written under a different name.
        hist = LossHistory(save_filename=filename, model_ctc=self.model_ctc, model_data=self.model_data, save_epoch=save_epoch)
        # ceil(data_nums / batch_size): the last step may be smaller than batch_size.
        steps = data_nums // batch_size + 1 if data_nums % batch_size != 0 else data_nums // batch_size
        self.model_ctc.fit_generator(generator=yield_datas,
                                     steps_per_epoch=steps,
                                     epochs=epochs,
                                     verbose=1,
                                     callbacks=[hist],
                                     initial_epoch=self.epoch_all)
    except StopIteration:
        raise LZ_Error("[错误QAQ]貌似生成的数据格式有点问题??")
    self.SaveModel(filename)  # persist weights and epoch counter
def RecognizeSpeech(self, wavsignal, fs, datapath='E:\\语音数据集'):
    '''
    Recognize speech from a wav sample sequence. (Original author noted
    this version still has bugs.)

    Parameters:
        wavsignal: waveform samples
        fs: sampling rate
        datapath: dataset root used to load the symbol list — generalized
                  from the previously hard-coded path; the default preserves
                  the original behavior
    '''
    data = DataSpeech(datapath)
    data.LoadDataList('dev')
    # Extract input features (frequency-domain features).
    data_input = data.GetFrequencyFeature(wavsignal, fs)
    list_symbol = data.list_symbol  # pinyin symbol list
    labels = [list_symbol[0]]
    labels_num = [data.SymbolToNum(i) for i in labels]
    data_input = np.array(data_input, dtype=np.int16)
    data_input = data_input.reshape(data_input.shape[0], data_input.shape[1])
    labels_num = np.array(labels_num, dtype=np.int16)
    labels_num = labels_num.reshape(labels_num.shape[0])
    # Model time-step count after downsampling (the -3 mirrors conv padding loss).
    input_length = np.array([data_input.shape[0] // 4 - 3], dtype=np.int16)
    input_length = input_length.reshape(input_length.shape[0])
    label_length = np.array([labels_num.shape[0]], dtype=np.int16)
    label_length = label_length.reshape(label_length.shape[0])
    # BUG FIX: the original wrapped these four differently-shaped arrays in
    # np.array(x), producing a ragged object array (an error on NumPy >= 1.24).
    # A Keras multi-input model accepts a plain list of arrays instead.
    # NOTE(review): the arrays still lack an explicit batch axis — confirm
    # against the model's declared input shapes.
    x = [data_input, labels_num, input_length, label_length]
    print(x)
    pred = self._model.predict(x=x)
    return [labels, pred]
def RecognizeSpeech(self, wavsignal, fs, datapath='E:\\语音数据集'):
    '''
    Recognize a wav sample sequence and return the decoded label indices.

    Parameters:
        wavsignal: waveform samples
        fs: sampling rate
        datapath: dataset root used to load the symbol list — generalized
                  from the previously hard-coded path; the default preserves
                  the original behavior

    Returns:
        Decoded label index sequence from CTC greedy decoding.
    '''
    data = DataSpeech(datapath)
    data.LoadDataList('dev')
    # Extract the input features.
    data_input = data.GetFrequencyFeature(wavsignal, fs)
    # Effective time-step count after the network's 4x downsampling.
    input_length = len(data_input) // 4
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # np.float64 is what the old alias meant.
    data_input = np.array(data_input, dtype=np.float64)
    in_len = np.zeros((1), dtype=np.int32)
    print(in_len.shape)
    in_len[0] = input_length
    batch_size = 1
    # Zero-pad the features into a fixed (batch, 1600, 200) input tensor.
    x_in = np.zeros((batch_size, 1600, 200), dtype=np.float64)
    for i in range(batch_size):
        x_in[i, 0:len(data_input)] = data_input
    base_pred = self.base_model.predict(x=x_in)
    print('base_pred:\n', base_pred)
    # Drop the first two time steps before CTC decoding.
    base_pred = base_pred[:, 2:, :]
    r = K.ctc_decode(base_pred, in_len, greedy=True, beam_width=64, top_paths=1)
    print('r', r)
    r1 = K.get_value(r[0][0])  # decoded index sequences
    print('r1', r1)
    print('r0', r[1])
    r2 = K.get_value(r[1])  # log probabilities of the decoded paths
    print(r2)
    print('解码完成')
    return r1