def feature_extraction_and_force_alignment(filepath, nstates, phoneHMMs):
    """Extract features and forced-alignment targets for one .wav file.

    Parameters
    ----------
    filepath : str
        Path to the .wav utterance file.
    nstates : dict
        Maps phone name -> number of HMM states for that phone.
    phoneHMMs : dict
        Phone-level HMM models used by forcedAlignment.

    Returns
    -------
    tuple
        (lmfcc features, mspec features, list of per-frame state names).
    """
    samples, samplingrate = loadAudio(filepath)
    # The word-level transcription is encoded in the file path
    # (e.g. '.../z43a.wav' -> ['z', '4', '3']).
    wordTrans = list(path2info(filepath)[2])
    # NOTE(review): relies on a module-level `prondict`; consider passing
    # it in explicitly like `nstates`/`phoneHMMs`.
    phoneTrans = words2phones(wordTrans, prondict)
    # Expand each phone into its HMM state names:
    # e.g. 'ah' with 3 states -> 'ah_0', 'ah_1', 'ah_2'.
    stateTrans = [phone + '_' + str(stateid)
                  for phone in phoneTrans
                  for stateid in range(nstates[phone])]
    lmfcc_result = mfcc(samples)
    mspec_result = mspec(samples)
    _, viterbi_path = forcedAlignment(lmfcc_result, phoneHMMs, phoneTrans)
    # Bug fix: the original cast the path to np.int16, which could overflow
    # for long state sequences, and pre-initialized `targets = []` only to
    # overwrite it. Cast to the platform int instead.
    targets = [stateTrans[idx] for idx in viterbi_path.astype(int)]
    return lmfcc_result, mspec_result, targets
# Create the global state list: each phone contributes one entry per HMM
# state, e.g. 'ah' with 3 states -> 'ah_0', 'ah_1', 'ah_2'.
stateList = ['%s_%d' % (ph, i)
             for ph in phoneHMMs.keys()
             for i in range(nstates[ph])]
# Perf fix: precompute name -> index once. The original called
# stateList.index(t) per target frame, which is O(len(stateList)) each
# time (quadratic over the dataset); a dict lookup is O(1).
stateIndex = {name: i for i, name in enumerate(stateList)}
# --------------------------------------------------------------
data = list()
for root, dirs, files in walk(folder_to_extract):
    for f in tqdm(files):
        if not f.endswith('.wav'):
            continue
        # Extract both feature types for this utterance.
        filename = os.path.join(root, f)
        sample, srate = loadAudio(filename)
        mspec_x = mspec(sample, samplingrate=srate)
        lmfcc_x = mfcc(sample, samplingrate=srate)
        # The word transcription is encoded in the file name; expand it
        # to a phone-level transcription for forced alignment.
        wordTrans = list(path2info(filename)[2])
        phoneTrans = words2phones(wordTrans, prondict)
        targets = forcedAlignment(lmfcc_x, phoneHMMs, phoneTrans)
        # Convert targets from state names (str) to integer indices.
        idx_targets = [stateIndex[t] for t in targets]
        data.append({
            'filename': filename,
            'lmfcc': lmfcc_x,
            'mspec': mspec_x,
            'targets': idx_targets
        })
kwargs = {data_type: data}
# --- Sanity checks on the concatenated word HMM ------------------------
print("check that the intit proba is equal to 1")
# Bug fix: the original used exact float equality (`np.sum(...) == 1`),
# which can report False for a correctly normalised distribution because
# of rounding; compare with a tolerance instead.
print(np.isclose(np.sum(wordTest['startprob']), 1.0))
# Every row of the transition matrix should sum to 1, so the total over
# all N rows should equal N.
sumTot = 0.0
N = len(wordTest['transmat'])
for i in range(N):
    sumTot += np.sum(wordTest['transmat'][i])
# print("the following two numbers must be equal")
print(sumTot)
print(N)
# print(wordTest['startprob'])
# print(wordTest['transmat'])

# ###########
# Walk one known utterance through the full pipeline and eyeball the
# intermediate transcriptions against the expected values.
filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = lab3_tools.loadAudio(filename)
lmfcc = lab1.mfcc(samples)
wordTrans = list(lab3_tools.path2info(filename)[2])
print(wordTrans)  # should be ['z', '4', '3']
phoneTrans = words2phones(wordTrans, prondict, addShortPause=True)
print(phoneTrans)  # should be ['sil', 'z', 'iy', 'r', 'ow', 'f', 'ao', 'r', 'th', 'r', 'iy', 'sil']
# Expand phones into HMM state names, e.g. 'ah' -> 'ah_0', 'ah_1', 'ah_2'.
stateTrans = [
    phone + '_' + str(stateid)
    for phone in phoneTrans
    for stateid in range(nstates[phone])
]
print(stateTrans)
""" from lab1_proto import mfcc from lab3_tools import loadAudio from lab3_tools import path2info from lab3_proto import words2phones from prondict import prondict from lab2_proto import concatHMMs from lab3_tools import frames2trans import numpy as np # ============== from lab3_proto import forcedAlignment if __name__ == "__main__": filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav' # Get the mfcc feature vectors samples, samplingrate = loadAudio(filename) lmfcc = mfcc(samples) # ================================================= # Get the word level transcription wordTrans = list(path2info(filename)[2]) # get the phone level transcription phoneTrans = words2phones(wordTrans, prondict) # print(phoneTrans) # ================================================================== # combine the HMMs according to the phone level transcription phoneHMMs = np.load('lab2_models_all.npz',allow_pickle=True)['phoneHMMs'].item() utteranceHMM = concatHMMs(phoneHMMs, phoneTrans) # translate the state (idx) to a name phones = sorted(phoneHMMs.keys()) nstates = dict() for ph in phones: