def feature_extraction_and_force_alignment(filepath, nstates, phoneHMMs):
    """
    Extract features and forced-alignment targets for one .wav file.

    The word-level transcription is taken from the file path (third element
    returned by ``path2info``) and expanded to phones with the module-level
    ``prondict``.

    Args:
        filepath: path of the .wav file to process.
        nstates: mapping from phone name to the number of states of its HMM.
        phoneHMMs: phone models handed to ``forcedAlignment``.

    Returns:
        A tuple ``(lmfcc_result, mspec_result, targets)`` where ``targets``
        contains one state name of the form ``'<phone>_<stateid>'`` for each
        entry of the Viterbi path returned by ``forcedAlignment``.
    """
    # The sampling rate is not needed here: mfcc/mspec are called with their
    # default settings.
    samples, _ = loadAudio(filepath)
    wordTrans = list(path2info(filepath)[2])
    phoneTrans = words2phones(wordTrans, prondict)

    # Expand every phone of the utterance into its per-state names, in order,
    # so that an index of the Viterbi path can be mapped to a state name.
    stateTrans = [phone + '_' + str(stateid)
                  for phone in phoneTrans
                  for stateid in range(nstates[phone])]

    lmfcc_result = mfcc(samples)
    mspec_result = mspec(samples)

    _, viterbi_path = forcedAlignment(lmfcc_result, phoneHMMs, phoneTrans)
    # The path is cast to an integer type before being used as list indices.
    targets = [stateTrans[idx] for idx in viterbi_path.astype(np.int16)]

    return lmfcc_result, mspec_result, targets
Ejemplo n.º 2
0
    # Build the list of all state names.
    # Each phone is split into as many entries as its HMM has states,
    # e.g. if 'ah' has 3 states: ah -> ['ah_0', 'ah_1', 'ah_2'].
    # The position of a name in this list is later used as its integer id.
    stateList = list()
    for ph in phoneHMMs.keys():
        for i in range(nstates[ph]):
            stateList.append('%s_%d' % (ph, i))
    # --------------------------------------------------------------
    # Walk the folder and collect one record per .wav file.
    data = list()
    for root, dirs, files in walk(folder_to_extract):
        for f in tqdm(files):
            # skip everything that is not a .wav file
            if not f.endswith('.wav'):
                continue
            # load the audio and compute both feature types at the file's
            # own sampling rate
            filename = os.path.join(root, f)
            sample, srate = loadAudio(filename)
            mspec_x = mspec(sample, samplingrate=srate)
            lmfcc_x = mfcc(sample, samplingrate=srate)
            # the word transcription is derived from the file path, then
            # expanded to phones with the pronunciation dictionary
            wordTrans = list(path2info(filename)[2])
            phoneTrans = words2phones(wordTrans, prondict)
            # NOTE(review): forcedAlignment is used here as returning the
            # sequence of state names directly -- confirm against its definition
            targets = forcedAlignment(lmfcc_x, phoneHMMs, phoneTrans)
            # convert the targets from state names (str) to their index in
            # stateList (int); list.index is a linear search per target
            idx_targets = [stateList.index(t) for t in targets]
            data.append({
                'filename': filename,
                'lmfcc': lmfcc_x,
                'mspec': mspec_x,
                'targets': idx_targets
            })

    # key the collected records by data_type (defined outside this fragment)
    kwargs = {data_type: data}
Ejemplo n.º 3
0
# --- Sanity checks on the HMM stored in wordTest ----------------------------
print("check that the intit proba is equal to 1")
# Compare with a tolerance: the sum of floating-point probabilities can differ
# from 1 by rounding error, which would make an exact `== 1` print False.
print(np.isclose(np.sum(wordTest['startprob']), 1.0))
sumTot = 0.0
N = len(wordTest['transmat'])
for i in range(N):
    sumTot += np.sum(wordTest['transmat'][i])
# Original author's note: the following two numbers should be equal.
print(sumTot)
print(N)

#print(wordTest['startprob'])
#print(wordTest['transmat'])

# --- Transcriptions for a single example utterance --------------------------
filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = lab3_tools.loadAudio(filename)
lmfcc = lab1.mfcc(samples)
# word-level transcription, taken from the file path
wordTrans = list(lab3_tools.path2info(filename)[2])
print(wordTrans)
#should be ['z', '4', '3']

# phone-level transcription
phoneTrans = words2phones(wordTrans, prondict, addShortPause=True)
print(phoneTrans)
#should be ['sil', 'z', 'iy', 'r', 'ow', 'f', 'ao', 'r', 'th', 'r', 'iy', 'sil']

# state-level transcription: every phone expanded to '<phone>_<stateid>'
stateTrans = [
    phone + '_' + str(stateid) for phone in phoneTrans
    for stateid in range(nstates[phone])
]
print(stateTrans)
Ejemplo n.º 4
0
"""
from lab1_proto import mfcc
from lab3_tools import loadAudio
from lab3_tools import path2info
from lab3_proto import words2phones
from prondict import prondict
from lab2_proto import concatHMMs
from lab3_tools import frames2trans
import numpy as np
# ==============
from lab3_proto import forcedAlignment

if __name__ == "__main__":
    filename = 'tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
    # Get the mfcc feature vectors
    samples, samplingrate = loadAudio(filename)
    lmfcc = mfcc(samples)
    # =================================================
    # Get the word level transcription
    wordTrans = list(path2info(filename)[2])
    # get the phone level transcription
    phoneTrans = words2phones(wordTrans, prondict)
    # print(phoneTrans)
    # ==================================================================
    # combine the HMMs according to the phone level transcription
    phoneHMMs = np.load('lab2_models_all.npz',allow_pickle=True)['phoneHMMs'].item()
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)
    # translate the state (idx) to a name
    phones = sorted(phoneHMMs.keys())
    nstates = dict()
    for ph in phones: