def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """forcedAlignment: aligns a phonetic transcription at the state level

    Args:
       lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
              computed the same way as for the training of phoneHMMs
       phoneHMMs: set of phonetic Gaussian HMM models
       phoneTrans: list of phonetic symbols to be aligned including
                   initial and final silence

    Returns:
       list of strings in the form phoneme_index specifying, for each time
       step the state from phoneHMMs corresponding to the viterbi path.
    """
    # phone transcription => state transcription
    # (each phone expands to one label per emitting state, e.g. 'ah_0', 'ah_1', ...)
    phones = sorted(phoneHMMs.keys())
    nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones}
    stateTrans = [p + '_' + str(i) for p in phoneTrans for i in range(nstates[p])]

    # combined HMM for the whole utterance
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # Viterbi decoder.
    # FIX: strip the non-emitting terminal state ([:-1]) from startprob and
    # transmat before taking logs, consistent with get_best_path_loglik;
    # the full vectors have one more entry than the emission matrix has
    # state columns, and the terminal row is all zeros (log(0) -> -inf).
    obsloglik = log_multivariate_normal_density_diag(
        lmfcc, utteranceHMM['means'], utteranceHMM['covars'])
    viterbiPath = viterbi(obsloglik,
                          np.log(utteranceHMM['startprob'][:-1]),
                          np.log(utteranceHMM['transmat'][:-1, :-1]))[1]

    # time alignment (frame-by-frame state transcription)
    viterbiStateTrans = [stateTrans[s] for s in viterbiPath]
    return viterbiStateTrans
def get_best_path_loglik(feature, hmm):
    """Return the Viterbi (best-path) log-likelihood of `feature` under `hmm`.

    The non-emitting terminal state is stripped (``[:-1]``) from the start
    probabilities and transition matrix before decoding so their dimensions
    match the emission log-likelihood matrix.
    """
    log_emissions = log_multivariate_normal_density_diag(
        feature, hmm['means'], hmm['covars'])
    log_start = np.log(hmm['startprob'][:-1])
    log_trans = np.log(hmm['transmat'][:-1, :-1])
    best_loglik, _ = viterbi(log_emissions, log_start, log_trans)
    return best_loglik
def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """forcedAlignment: aligns a phonetic transcription at the state level

    Args:
       lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
              computed the same way as for the training of phoneHMMs
       phoneHMMs: set of phonetic Gaussian HMM models
       phoneTrans: list of phonetic symbols to be aligned including
                   initial and final silence

    Returns:
       list of strings in the form phoneme_index specifying, for each time
       step, the state from phoneHMMs corresponding to the viterbi path.
    """
    # number of emitting states for each phone model
    nstates = {ph: phoneHMMs[ph]['means'].shape[0] for ph in phoneHMMs.keys()}

    # expand the phone transcription into per-state labels "phone_index"
    stateTrans = ["%s_%i" % (ph, i)
                  for ph in phoneTrans
                  for i in range(nstates[ph])]

    # build a single HMM for this utterance from the transcription alone
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # emission log-likelihoods under the concatenated model
    log_emlik = log_mnd_diag(lmfcc, utteranceHMM['means'],
                             utteranceHMM['covars'])

    # log pi and log A, with the non-emitting terminal state dropped
    log_pi = np.log(utteranceHMM['startprob'][:-1])
    log_trans = np.log(utteranceHMM['transmat'][:-1, :-1])
    _, best_path = viterbi(log_emlik, log_pi, log_trans)

    # map every frame's state index back to its name
    return [stateTrans[s] for s in best_path]
def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """forcedAlignment: aligns a phonetic transcription at the state level

    Args:
       lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
              computed the same way as for the training of phoneHMMs
       phoneHMMs: set of phonetic Gaussian HMM models
       phoneTrans: list of phonetic symbols to be aligned including
                   initial and final silence

    Returns:
       list of strings in the form phoneme_index specifying, for each time
       step, the state from phoneHMMs corresponding to the viterbi path.
    """
    # FIX: the original returned the raw (loglik, path) tuple from viterbi,
    # contradicting the documented contract. Build the state-name table and
    # map the decoded path onto it, as the docstring promises.
    nstates = {ph: phoneHMMs[ph]['means'].shape[0] for ph in phoneHMMs.keys()}
    stateTrans = [ph + '_' + str(i)
                  for ph in phoneTrans
                  for i in range(nstates[ph])]

    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)
    emission = log_multivariate_normal_density_diag(
        lmfcc, utteranceHMM['means'], utteranceHMM['covars'])

    # FIX: strip the non-emitting terminal state before taking logs, so the
    # model dimensions match the emission matrix (consistent with
    # get_best_path_loglik and the verified __main__ usage).
    _, path = viterbi(emission,
                      np.log(utteranceHMM['startprob'][:-1]),
                      np.log(utteranceHMM['transmat'][:-1, :-1]))

    # frame-by-frame state transcription
    return [stateTrans[s] for s in path]
[2]) # word transcription (contained in the filename) phoneTrans = words2phones( wordTrans, prondict) # word transcription => phone transcription stateTrans = [ p + '_' + str(i) for p in phoneTrans for i in range(nstates[p]) ] # phone transcription => state transcription # combined HMM for utterance utteranceHMM = concatHMMs(phoneHMMs, phoneTrans) # Viterbi decoder obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM['means'], utteranceHMM['covars']) viterbiLoglik, viterbiPath = viterbi(obsloglik, np.log(utteranceHMM['startprob']), np.log(utteranceHMM['transmat'])) # time alignment (frame-by-frame state transcription) viterbiStateTrans = [stateTrans[s] for s in viterbiPath] # save in standard format (to use it, put it in the same directory of .wav and open .wav with wavesurfer) frames2trans(viterbiStateTrans, outfilename='data/transcriptions/z43a.lab') # check results plt.figure() pcolormesh(lmfcc, 'MFCC - computed', ylabel='MFCC') plt.figure() pcolormesh(example['lmfcc'], 'MFCC - example', ylabel='MFCC') plt.figure()
def viterbi_algorithm():
    """Check the Viterbi implementation against the reference example, then
    score all utterances against every digit model (all-speaker and
    one-speaker model sets) and print predicted vs. true labels."""
    isolated = get_isolated(prondict)

    # --- verification against the provided example -------------------------
    wordHMMs = {}
    wordHMMs['o'] = concatHMMs(phoneHMMsAll, isolated['o'])
    log_st_prob = np.log(wordHMMs['o']['startprob'])
    log_transmat = np.log(wordHMMs['o']['transmat'])
    vloglik, bestPath = viterbi(example['obsloglik'], log_st_prob, log_transmat)
    alpha_matrix = forward(example['obsloglik'], log_st_prob, log_transmat)
    print('vloglik from viterbi():', vloglik)
    print('vloglik from example:', example['vloglik'])

    def _show_path(matrix, path, title):
        # Heat map of the log-alpha matrix with the state path overlaid in
        # red. NOTE(review): both plots use len(bestPath) for the y-range,
        # as in the original code.
        plt.figure(figsize=(12, 6))
        ax = plt.subplot(121)
        ax.set_title(title)
        plt.pcolormesh(matrix)
        plt.plot(path, np.arange(len(bestPath)), color='red')
        plt.colorbar()
        plt.show()

    _show_path(alpha_matrix, bestPath, 'viterbi path from Viterbi()')
    _show_path(example['logalpha'], example['vpath'],
               'viterbi path from example')

    # --- score the 44 utterances against each digit model ------------------
    keys_list = list(isolated.keys())
    scores_models_all = np.zeros((len(data), len(isolated)))
    scores_models_onespkr = np.zeros_like(scores_models_all)

    def _fill_column(phone_models, key, scores, col):
        # Viterbi log-likelihood of every utterance under one digit model.
        model = concatHMMs(phone_models, isolated[key])
        log_pi = np.log(model['startprob'])
        log_A = np.log(model['transmat'])
        for row in range(len(data)):
            emissions = log_multivariate_normal_density_diag(
                data[row]['lmfcc'], model['means'], model['covars'])
            scores[row, col], _ = viterbi(emissions, log_pi, log_A)

    for j, key in enumerate(keys_list):
        _fill_column(phoneHMMsAll, key, scores_models_all, j)
        _fill_column(phoneHMMsOne, key, scores_models_onespkr, j)

    # best-scoring model per utterance => predicted digit label
    predict_all = np.argmax(scores_models_all, axis=1)
    predict_one = np.argmax(scores_models_onespkr, axis=1)
    label_all = [keys_list[x] for x in predict_all]
    label_one = [keys_list[x] for x in predict_one]
    true_label = [data[x]['digit'] for x in range(len(data))]
    print(true_label)
    print(label_all)
    print(label_one)
if __name__ == "__main__": # load data data = np.load('data/lab2_data.npz')['data'] example = np.load('data/lab2_example.npz')['example'].item() phoneHMMs = np.load('data/lab2_models_onespkr.npz')['phoneHMMs'].item() # Build hmm wordHMMs = {} wordHMMs['o'] = concatHMMs(phoneHMMs, isolated['o']) trans_mat = wordHMMs['o']['transmat'][:-1, :-1] pi_vec = wordHMMs['o']['startprob'][:-1] # ===================================================== best_seq_loglik, best_path = viterbi(example['obsloglik'], np.log(pi_vec), np.log(trans_mat)) assert np.allclose(best_seq_loglik, example['vloglik']) assert np.array_equal(best_path, example['vpath']) # ======================================================================== onespkr_wordHMMs = {} for k in isolated.keys(): onespkr_wordHMMs[k] = concatHMMs(phoneHMMs, isolated[k]) phoneHMMs_all = np.load('data/lab2_models_all.npz')['phoneHMMs'].item() for d in isolated.keys(): wordHMMs[d] = concatHMMs(phoneHMMs_all, isolated[d]) st = time() resp = match_model_and_utterances(data, onespkr_wordHMMs) onespkr_match = pd.DataFrame(resp) resp = match_model_and_utterances(data, wordHMMs)