def train_model(n_epochs, model, traindir, model_name, n_classes, totalsamples,
                dict_name, results_dir, batch_size=32, testing=False):
    """Fit ``model`` from a batch generator and record the training history.

    Args:
        n_epochs: Maximum number of epochs; early stopping on the training
            loss (patience 3) may end the run sooner.
        model: Object exposing ``fit_generator`` (presumably a Keras model).
        traindir: Passed to ``generator_train_flatbatch`` as ``train_dir``.
        model_name: Base name; ``_E<number of epochs actually run>`` is
            appended before the record is written.
        n_classes: Forwarded to the generator.
        totalsamples: Used as ``steps_per_epoch``.
        dict_name: Forwarded to ``cdnn_records_add`` as ``nn_records_name``.
        results_dir: Forwarded to ``cdnn_records_add`` as ``results_address``.
        batch_size: Forwarded to the generator.
        testing: When true, shrink the run to 10 steps per epoch and 3 epochs.

    Returns:
        ``(model, model_name)`` where ``model_name`` carries the epoch suffix.

    Raises:
        KeyError: If the history holds neither ``'acc'`` nor ``'accuracy'``.
    """
    print('...Training...')
    if testing:
        totalsamples = 10
        n_epochs = 3

    # NOTE(review): an older comment here said the batch size must be 1
    # because MFCC files hold varying numbers of samples; ``batch_size`` is
    # now forwarded as given -- confirm the generator copes with that.
    gen = generator_train_flatbatch(train_dir=traindir,
                                    batch_size=batch_size,
                                    n_classes=n_classes)
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=3,
                              verbose=1)
    history = model.fit_generator(generator=gen,
                                  steps_per_epoch=totalsamples,
                                  epochs=n_epochs,
                                  verbose=1,
                                  callbacks=[earlystop])

    metrics = history.history
    loss = metrics['loss']
    # The accuracy metric may be reported as 'acc' or 'accuracy' depending on
    # how the model was compiled; prefer the original key, fall back to the
    # long form instead of failing after a complete training run.
    acc = metrics['acc'] if 'acc' in metrics else metrics['accuracy']

    print(f'Model fit history:{history.history}')
    print(f'Trained on {totalsamples} files for over {n_epochs} epochs.')
    print(f'Results directory: {results_dir}')

    # len(loss) is the number of epochs that actually ran.
    model_name = model_name + f'_E{len(loss)}'
    cdnn_records_add(loss=loss, accuracy=acc, model_name=model_name,
                     nn_records_name=dict_name, results_address=results_dir)
    return model, model_name
def evaluate_model(model, testdir, n_classes, totalsamples, model_name,
                   dict_name, results_dir, batch_size=32, testing=False):
    """Evaluate ``model`` from a batch generator and record loss/accuracy.

    Args:
        model: Object exposing ``evaluate_generator`` (presumably a Keras
            model); its result is unpacked as ``(loss, acc)``.
        testdir: Passed to ``generator_train_flatbatch`` as ``train_dir``.
        n_classes: Forwarded to the generator.
        totalsamples: Used as the number of evaluation ``steps``.
        model_name: Name under which the result is recorded.
        dict_name: Forwarded to ``cdnn_records_add`` as ``nn_records_name``.
        results_dir: Forwarded to ``cdnn_records_add`` as ``results_address``.
        batch_size: Forwarded to the generator.
        testing: When true, evaluate on 100 steps only.

    Returns:
        None. The outcome is printed and stored via ``cdnn_records_add``.
    """
    print('...Evaluating...')
    if testing:
        totalsamples = 100

    gen = generator_train_flatbatch(train_dir=testdir,
                                    batch_size=batch_size,
                                    n_classes=n_classes)
    results = model.evaluate_generator(generator=gen, steps=totalsamples)
    # Exactly two values are expected: the loss and a single accuracy metric.
    loss, acc = results

    # These messages previously read "Model fit history" / "Trained on",
    # copied from train_model, which mislabelled the evaluation output.
    print(f'Model evaluation results:{results}')
    print(f'Evaluated on {totalsamples} files.')
    print(f'Results directory: {results_dir}')
    cdnn_records_add(loss=loss, accuracy=acc, model_name=model_name,
                     nn_records_name=dict_name, results_address=results_dir)
def main(testing=False):
    """Grid-search CNN+BLSTM phone classifiers and score word-level output.

    For every (sequence size, conv layout, dropout) combination the model is
    built, trained, evaluated, and then scored word by word against the
    gold-standard labels, both with the plain arg-max over all classes
    ("softmax") and with the arg-max restricted to the phones listed for the
    word ("forced").

    NOTE(review): a later ``def main`` in this file rebinds the name, so this
    definition is only reachable if that one is removed or renamed.
    NOTE(review): ``w2pdict`` is only assigned on the word-level branch
    (``FramelevelORword`` false); the scoring loop would raise ``NameError``
    on the frame-level branch.

    Args:
        testing: When true, use tiny step counts, score only 30 words and
            write to ``testing_records.pk``. The argument used to be
            overwritten with ``True`` inside the function; it is now honoured.
    """

    def _percent(part, whole):
        # Avoid ZeroDivisionError when nothing was scored.
        return part / whole * 100 if whole else 0.0

    # Config Values[DNN params]
    Frame_length = 0.025
    Frame_step = 0.01
    overwrite_MFCCs = False
    TrainAll = False
    FramelevelORword = False
    cwd = os.getcwd()
    SysPath = cwd.split('GOP-LSTM')[0]
    Wavdir = SysPath + 'corpus/dat/speakers/'
    Dbdir = SysPath + 'GOP-LSTM/PhoneInfo/speakers_db_correct/'
    Holddir = SysPath + 'HoldDir/'
    Traindir = Holddir + 'Train/'
    Testdir = Holddir + 'Test/'
    PhoInfDir = SysPath + 'GOP-LSTM/PhoneInfo/'
    N_context = 2
    N_ceps = 26
    wordcount = 10

    # Training & Test Data
    if FramelevelORword:
        speakers_trainNtest(db_corpus=Dbdir, wav_corpus=Wavdir,
                            n_ceps=N_ceps, n_context=N_context,
                            frame_length=Frame_length, frame_step=Frame_step,
                            inmat=True, holddir=Holddir,
                            overwrite=overwrite_MFCCs)
        cdnn_dict_name = 'crnn_gridsearch_records_ALL.pk'
        if not TrainAll:
            ByCount = 4000
            Traindir = Holddir + 'Train_Correct/'
            Testdir = Holddir + 'Test_Correct/'
            selected_phones, totalcount = select_trainNtest(
                bycount=ByCount, holddir=Holddir, train_corpus=Traindir,
                test_corpus=Testdir, overwrite=False)
            Traindir = Holddir + f'Train_Select_{ByCount}/'
            Testdir = Holddir + f'Test_Select_{ByCount}/'
            N_classes = len(selected_phones)
            print(f'N selected classes: {N_classes}')
            cdnn_dict_name = f'crnn_gridsearch_records_{ByCount}.pk'
    else:
        selected_phones, totalcount, w2pdict = createNcount_trainNtest(
            frame_length=Frame_length, frame_step=Frame_step, n_ceps=N_ceps,
            n_context=N_context, dbdir=Dbdir, datdir=Wavdir, holddir=Holddir,
            wordcount=wordcount, phoinfdir=PhoInfDir)
        N_classes = len(selected_phones)
        Traindir = Holddir + f'FLP_Train_{wordcount}/'
        Testdir = Holddir + f'FLP_Test_{wordcount}/'
        cdnn_dict_name = f'ddcp_blstm_gridsearch_records_wl_{wordcount}.pk'
    print(f'Selected phones: {selected_phones}')
    print(f'Train count & test count: {totalcount}')
    if testing:
        cdnn_dict_name = 'testing_records.pk'
    cdnn_address = SysPath + 'GOP-LSTM/Results/CDNN_phones/'

    # Iterate over gridsearch
    N_epochs = 70
    Input_tuple = (5, 26, 1)
    ConvLayerList = [[32 for _ in range(15)]]
    DropoutList = [0.8]
    # add one for sil
    N_classes += 1
    selected_phones.append('_')
    seq_sizelist = [64]
    for seq_size in seq_sizelist:
        totaltrain = nseqsofnsize(Traindir, seq_size=seq_size)
        totaltest = nseqsofnsize(Testdir, seq_size=seq_size)
        for cl in ConvLayerList:
            for dl in DropoutList:
                # Compile Params
                cname = f'{cl[0]}_x{len(cl)}'
                Model_name = f'BLSTM_CP{cname}_FBN_SS{seq_size}_DL{dl}_V3'
                model = make_CNNLSTM_classifier(
                    input_tuple=Input_tuple, conv_layers=cl,
                    n_classes=N_classes, seq_size=seq_size, dropout_rate=dl,
                    channel_order='channels_last')
                model, Model_name = train_model(
                    n_epochs=N_epochs, model=model, traindir=Traindir,
                    model_name=Model_name, n_classes=N_classes,
                    totalsamples=totaltrain, dict_name=cdnn_dict_name,
                    results_dir=cdnn_address, batch_size=seq_size,
                    testing=testing)
                print('...Evaluating...')
                evaluate_model(
                    model=model, testdir=Testdir, n_classes=N_classes,
                    totalsamples=totaltest, model_name=Model_name,
                    dict_name=cdnn_dict_name, results_dir=cdnn_address,
                    batch_size=seq_size, testing=testing)

                # Forced Accuracy
                print('...Predicting...')
                n_words = 30 if testing else totaltest
                gen = generator_test_bufferedseq_wfname(
                    train_dir=Testdir, batch_size=seq_size,
                    n_classes=N_classes, wfname=True)
                # Reports word accuracy (softmax and forced-max) and max-seg
                # accuracy against the gold-standard segmentation.
                # '_' is already in selected_phones (appended once above);
                # re-appending it for every configuration only grew the list.
                diagnose = False
                s_correct = 0
                f_correct = 0
                total = 0
                s_IDS = 0
                f_IDS = 0
                maxsegtotal = 0
                s_seg = 0
                f_seg = 0
                print(f'Total Test size:{n_words}\n')
                # The generator is always one batch ahead: (x, y, file) holds
                # the first batch of the word about to be scored.
                x, y, file = next(gen)
                cfile = file
                for _ in range(n_words):  # amount of words to be judged
                    gwordphones = []  # gold standard word segments
                    swordphones = []  # softmax word segments
                    fwordphones = []  # forced word segments
                    fname = file.split('.')[0]
                    # Copy before adding silence so the shared w2pdict entry
                    # does not gain another '_' on every visit.
                    potphones = list(w2pdict[fname]) + ['_']
                    pind = [selected_phones.index(sp) for sp in potphones]
                    if diagnose:
                        print(f'Current file:{file}')
                        print(f"Word's phones{potphones}")
                        print(file)
                    # Consume batches until the generator moves to a new file.
                    while True:
                        predictions = model.predict(x=x)
                        gwordphones += [selected_phones[sp]
                                        for sp in np.argmax(y, axis=2)[0]]
                        swordphones += [
                            selected_phones[sp]
                            for sp in np.argmax(predictions, axis=2)[0]]
                        # Arg-max restricted to the word's own phones.
                        fwordphones += [
                            selected_phones[pind[sp]]
                            for sp in np.argmax(predictions[:, :, pind][0],
                                                axis=1)]
                        x, y, file = next(gen)
                        if file != cfile:
                            cfile = file
                            break

                    # got word segs, process them
                    gseg = segmentphonelist(gwordphones)
                    sseg = segmentphonelist(swordphones)
                    fseg = segmentphonelist(fwordphones)
                    sLD = uttLD(gseg, sseg)
                    fLD = uttLD(gseg, fseg)
                    s_IDS += sLD
                    f_IDS += fLD
                    if diagnose:
                        print('\n')
                        print(gseg)
                        print(sseg)
                        print(fseg)
                        print('\n')
                        print(sLD)
                        print(fLD)
                        print('\n')

                    # accuracy
                    startsil = gseg[-1][1]  # Index of Silence
                    gold = gwordphones[:startsil]
                    s_correct += segCorrect(gold, swordphones[:startsil])
                    f_correct += segCorrect(gold, fwordphones[:startsil])
                    total += len(gold)

                    # max-seg-score with known boundaries
                    for seg in gseg[:-1]:  # last phone is silence '_'
                        maxsegtotal += 1
                        cphone = seg[0]
                        sboundedlist = swordphones[seg[1]:seg[2]]
                        fboundedlist = fwordphones[seg[1]:seg[2]]
                        # Majority label inside the gold segment boundaries.
                        smaxphone = max(sboundedlist, key=sboundedlist.count)
                        fmaxphone = max(fboundedlist, key=fboundedlist.count)
                        if smaxphone == cphone:
                            s_seg += 1
                        if fmaxphone == cphone:
                            f_seg += 1
                        if diagnose:
                            print(seg)
                            print(smaxphone, fmaxphone, cphone)

                # Report the accumulated edit distances; the last word's
                # sLD/fLD used to be printed against the cumulative total.
                sLDpercent = _percent(s_IDS, total)
                fLDpercent = _percent(f_IDS, total)
                print(f'Insertions, Deletions, Substitions (SM):{s_IDS} out of {total}: {sLDpercent}%')
                print(f'Insertions, Deletions, Substitions (FM):{f_IDS} out of {total}: {fLDpercent}%')
                Spercent = _percent(s_correct, total)
                Fpercent = _percent(f_correct, total)
                print('\n')
                print(f'Softmax: {s_correct} out of {total}, {Spercent}%')
                print(f'Forced: {f_correct} out of {total}, {Fpercent}%')
                Spercent = _percent(s_seg, maxsegtotal)
                Fpercent = _percent(f_seg, maxsegtotal)
                print(f'Softmax (seg): {s_seg} out of {maxsegtotal}, {Spercent}%')
                print(f'Forced (seg): {f_seg} out of {maxsegtotal}, {Fpercent}%')
                # The segment-level percentages are what gets recorded.
                cdnn_records_add(loss=Spercent, accuracy=Fpercent,
                                 model_name=Model_name,
                                 nn_records_name=cdnn_dict_name,
                                 results_address=cdnn_address)
                del gen
                del model
                k.clear_session()
def main(testing=False):
    """Grid-search CNN+BLSTM phone classifiers and report frame accuracy.

    For every (sequence size, conv layout, dropout) combination the model is
    built, trained and evaluated, then scored batch by batch against the
    gold-standard labels: once with the arg-max over all classes
    ("predicted") and once with the arg-max restricted to the phones listed
    for the batch's word ("forced"). Finally the stored records are ranked
    and printed.

    NOTE(review): ``w2pdict`` is only assigned on the word-level branch
    (``FramelevelORword`` false); the scoring loop would raise ``NameError``
    on the frame-level branch.

    Args:
        testing: When true, use tiny step counts, score only 30 batches and
            write to ``testing_records.pk``.
    """

    def _percent(part, whole):
        # Avoid ZeroDivisionError when nothing was scored.
        return part / whole * 100 if whole else 0.0

    # Config Values[DNN params]
    Frame_length = 0.025
    Frame_step = 0.01
    overwrite_MFCCs = False
    TrainAll = False
    FramelevelORword = False
    cwd = os.getcwd()
    SysPath = cwd.split('GOP-LSTM')[0]
    Wavdir = SysPath + 'corpus/dat/speakers/'
    Dbdir = SysPath + 'GOP-LSTM/PhoneInfo/speakers_db_correct/'
    Holddir = SysPath + 'HoldDir/'
    Traindir = Holddir + 'Train/'
    Testdir = Holddir + 'Test/'
    N_context = 2
    N_ceps = 26
    wordcount = 30
    ByCount = 4000

    # Training & Test Data
    if FramelevelORword:
        speakers_trainNtest(db_corpus=Dbdir, wav_corpus=Wavdir,
                            n_ceps=N_ceps, n_context=N_context,
                            frame_length=Frame_length, frame_step=Frame_step,
                            inmat=True, holddir=Holddir,
                            overwrite=overwrite_MFCCs)
        cdnn_dict_name = 'crnn_gridsearch_records_ALL.pk'
        if not TrainAll:
            Traindir = Holddir + 'Train_Correct/'
            Testdir = Holddir + 'Test_Correct/'
            selected_phones, totalcount = select_trainNtest(
                bycount=ByCount, holddir=Holddir, train_corpus=Traindir,
                test_corpus=Testdir, overwrite=False)
            Traindir = Holddir + f'Train_Select_{ByCount}/'
            Testdir = Holddir + f'Test_Select_{ByCount}/'
            N_classes = len(selected_phones)
            print(f'N selected classes: {N_classes}')
            cdnn_dict_name = f'crnn_gridsearch_records_{ByCount}.pk'
    else:
        selected_phones, totalcount, w2pdict = createNcount_trainNtest(
            frame_length=Frame_length, frame_step=Frame_step, n_ceps=N_ceps,
            n_context=N_context, dbdir=Dbdir, datdir=Wavdir, holddir=Holddir,
            wordcount=wordcount)
        N_classes = len(selected_phones)
        Traindir = Holddir + f'FLP_Train_{wordcount}/'
        Testdir = Holddir + f'FLP_Test_{wordcount}/'
        print(f'Selected phones: {selected_phones}')
        cdnn_dict_name = f'cp_blstm_gridsearch_records_wl_{wordcount}.pk'
    if testing:
        cdnn_dict_name = 'testing_records.pk'
    cdnn_address = SysPath + 'GOP-LSTM/Results/CDNN_phones/'
    # The message used to contain '\&', an invalid escape sequence that
    # printed a stray backslash and triggers a syntax warning.
    print(f'Train count & test count: {totalcount}')

    # Iterate over gridsearch
    N_epochs = 70
    Input_tuple = (5, 26, 1)
    ConvLayerList = [[64, 64, 64, 64], [32, 32, 32, 32]]
    DropoutList = [0.8]
    seq_sizelist = [16, 32]
    for seq_size in seq_sizelist:
        totaltrain = nseqsofnsize(Traindir, seq_size=seq_size)
        totaltest = nseqsofnsize(Testdir, seq_size=seq_size)
        for cl in ConvLayerList:
            for dl in DropoutList:
                # Compile Params
                cname = '_'.join(str(width) for width in cl)
                Model_name = f'BLSTM_CP{cname}_FBN_SS{seq_size}_DL{dl}'
                model = make_CNNLSTM_classifier(
                    input_tuple=Input_tuple, conv_layers=cl,
                    n_classes=N_classes, seq_size=seq_size, dropout_rate=dl,
                    channel_order='channels_last')
                model, Model_name = train_model(
                    n_epochs=N_epochs, model=model, traindir=Traindir,
                    model_name=Model_name, n_classes=N_classes,
                    totalsamples=totaltrain, dict_name=cdnn_dict_name,
                    results_dir=cdnn_address, batch_size=seq_size,
                    testing=testing)
                evaluate_model(
                    model=model, testdir=Testdir, n_classes=N_classes,
                    totalsamples=totaltest, model_name=Model_name,
                    dict_name=cdnn_dict_name, results_dir=cdnn_address,
                    batch_size=seq_size, testing=testing)

                print('...Predicting...')
                n_samples = 30 if testing else totaltest
                gen = generator_train_bufferedseq_wfname(
                    train_dir=Testdir, batch_size=seq_size,
                    n_classes=N_classes)
                p_correct = 0
                f_correct = 0
                total = 0
                for _ in range(n_samples):
                    x, y, file = next(gen)
                    fname = file.split('.')[0]
                    potphones = w2pdict[fname]
                    pind = [selected_phones.index(sp) for sp in potphones]
                    # predict() replaces predict_proba(), which only exists
                    # on some model classes/versions; the other main in this
                    # file already calls predict().
                    predictions = model.predict(x=x)
                    TrueY = [selected_phones[sp]
                             for sp in np.argmax(y, axis=2)[0]]
                    PredY = [selected_phones[sp]
                             for sp in np.argmax(predictions, axis=2)[0]]
                    # Arg-max restricted to the phones listed for this word.
                    ForcY = [selected_phones[pind[sp]]
                             for sp in np.argmax(predictions[:, :, pind][0],
                                                 axis=1)]
                    p_correct += sum(t == p for t, p in zip(TrueY, PredY))
                    f_correct += sum(t == p for t, p in zip(TrueY, ForcY))
                    total += len(TrueY)

                # One record and one report per configuration.
                p_percent = _percent(p_correct, total)
                f_percent = _percent(f_correct, total)
                cdnn_records_add(loss=p_percent, accuracy=f_percent,
                                 model_name=Model_name,
                                 nn_records_name=cdnn_dict_name,
                                 results_address=cdnn_address)
                print(f'Predicted correct:{p_correct} out of {total}, {p_percent}')
                print(f'Forced correct:{f_correct} out of {total}, {f_percent}')
                # Flip to True to dump the labels of the last scored batch.
                diagnosis = False
                if diagnosis and n_samples > 0:
                    print(potphones)
                    print(file)
                    print(f'Goldstd:{[selected_phones[sp] for sp in np.argmax(y, axis=2)[0]]}')
                    print(f'Max All:{[selected_phones[sp] for sp in np.argmax(predictions,axis=2)[0]]}')
                    print(f'ForcedA:{[selected_phones[pind[sp]] for sp in np.argmax(predictions[:, :, pind][0], axis=1)]}')
                del gen
                del model
                k.clear_session()
    cdnn_records_rankNprint(nn_record_name=cdnn_dict_name,
                            results_address=cdnn_address)