def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """ forcedAlignmen: aligns a phonetic transcription at the state level

    Args:
       lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
              computed the same way as for the training of phoneHMMs
       phoneHMMs: set of phonetic Gaussian HMM models
       phoneTrans: list of phonetic symbols to be aligned including initial and
                   final silence

    Returns:
       list of strings in the form phoneme_index specifying, for each time step
       the state from phoneHMMs corresponding to the viterbi path.
    """
    # phone transcription => state transcription
    phones = sorted(phoneHMMs.keys())
    nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones}
    stateTrans = [p + '_' + str(i) for p in phoneTrans for i in range(nstates[p])]

    # combined HMM for utterance
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # Viterbi decoder
    obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM['means'], utteranceHMM['covars'])
    viterbiPath = viterbi(obsloglik, np.log(utteranceHMM['startprob']), np.log(utteranceHMM['transmat']))[1]

    # time alignment (frame-by-frame state transcription)
    viterbiStateTrans = [stateTrans[s] for s in viterbiPath]

    return viterbiStateTrans
Beispiel #2
0
def get_best_path_loglik(feature, hmm):
    trans_mat = hmm['transmat'][:-1, :-1]
    pi_vec = hmm['startprob'][:-1]
    means = hmm['means']
    covars = hmm['covars']

    obsloglik = log_multivariate_normal_density_diag(feature, means, covars)
    ret, _ = viterbi(obsloglik, np.log(pi_vec), np.log(trans_mat))
    return ret
Beispiel #3
0
def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """ forcedAlignmen: aligns a phonetic transcription at the state level

    Args:
       lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
              computed the same way as for the training of phoneHMMs
       phoneHMMs: set of phonetic Gaussian HMM models
       phoneTrans: list of phonetic symbols to be aligned including initial and
                   final silence

    Returns:
        if return_syb:
            list of strings in the form phoneme_index specifying, for each time step
            the state from phoneHMMs corresponding to the viterbi path.
    """
    # Obtain the mapping from state to number of state
    nstates = dict()
    for ph in phoneHMMs.keys():
        num_state = phoneHMMs[ph]['means'].shape[0]
        nstates[ph] = num_state
    # Obtain a mapping from the phoneHMMs to statename
    stateTrans = list()
    for ph in phoneTrans:
        for i in range(nstates[ph]):
            stateTrans.append("%s_%i" % (ph, i))
    # ===========================================================
    # Create the hmm model for this utterance with only the information
    # of transcription
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # calculate the Viterbi path
    means = utteranceHMM['means']
    covars = utteranceHMM['covars']
    log_emlik = log_mnd_diag(lmfcc, means, covars)
    # get \pi and A; ignore the terminal state
    log_pi = np.log(utteranceHMM['startprob'][:-1])
    log_trans = np.log(utteranceHMM['transmat'][:-1, :-1])
    _, path = viterbi(log_emlik, log_pi, log_trans)
    # =========================================================
    ret = [stateTrans[i] for i in path]

    return ret
def forcedAlignment(lmfcc, phoneHMMs, phoneTrans):
    """ forcedAlignmen: aligns a phonetic transcription at the state level

   Args:
      lmfcc: NxD array of MFCC feature vectors (N vectors of dimension D)
            computed the same way as for the training of phoneHMMs
      phoneHMMs: set of phonetic Gaussian HMM models
      phoneTrans: list of phonetic symbols to be aligned including initial and
                  final silence

   Returns:
      list of strings in the form phoneme_index specifying, for each time step
      the state from phoneHMMs corresponding to the viterbi path.
   """
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)
    emmision = log_multivariate_normal_density_diag(lmfcc,
                                                    utteranceHMM['means'],
                                                    utteranceHMM['covars'])
    return viterbi(emmision, np.log(utteranceHMM['startprob']),
                   np.log(utteranceHMM['transmat']))
Beispiel #5
0
                     [2])  # word transcription (contained in the filename)
    phoneTrans = words2phones(
        wordTrans, prondict)  # word transcription => phone transcription
    stateTrans = [
        p + '_' + str(i) for p in phoneTrans for i in range(nstates[p])
    ]  # phone transcription => state transcription

    # combined HMM for utterance
    utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)

    # Viterbi decoder
    obsloglik = log_multivariate_normal_density_diag(lmfcc,
                                                     utteranceHMM['means'],
                                                     utteranceHMM['covars'])
    viterbiLoglik, viterbiPath = viterbi(obsloglik,
                                         np.log(utteranceHMM['startprob']),
                                         np.log(utteranceHMM['transmat']))

    # time alignment (frame-by-frame state transcription)
    viterbiStateTrans = [stateTrans[s] for s in viterbiPath]

    # save in standard format (to use it, put it in the same directory of .wav and open .wav with wavesurfer)
    frames2trans(viterbiStateTrans, outfilename='data/transcriptions/z43a.lab')

    # check results
    plt.figure()
    pcolormesh(lmfcc, 'MFCC - computed', ylabel='MFCC')
    plt.figure()
    pcolormesh(example['lmfcc'], 'MFCC - example', ylabel='MFCC')

    plt.figure()
def viterbi_algorithm():
    wordHMMs = {}
    isolated = get_isolated(prondict)

    # verify implementation
    wordHMMs['o'] = concatHMMs(phoneHMMsAll, isolated['o'])
    log_st_prob = np.log(wordHMMs['o']['startprob'])
    log_transmat = np.log(wordHMMs['o']['transmat'])
    vloglik, bestPath = viterbi(example['obsloglik'], log_st_prob,
                                log_transmat)
    alpha_matrix = forward(example['obsloglik'], log_st_prob, log_transmat)
    print('vloglik from viterbi():', vloglik)
    print('vloglik from example:', example['vloglik'])

    # plot
    fig = plt.figure(figsize=(12, 6))
    ax = plt.subplot(121)
    ax.set_title('viterbi path from Viterbi()')
    plt.pcolormesh(alpha_matrix)
    plt.plot(bestPath, np.arange(len(bestPath)), color='red')
    plt.colorbar()
    plt.show()

    fig = plt.figure(figsize=(12, 6))
    ax = plt.subplot(121)
    ax.set_title('viterbi path from example')
    plt.pcolormesh(example['logalpha'])
    plt.plot(example['vpath'], np.arange(len(bestPath)), color='red')
    plt.colorbar()
    plt.show()

    # 44 data labels
    keys_list = [x for x in isolated.keys()]
    scores_models_all = np.zeros((len(data), len(isolated)))
    scores_models_onespkr = np.zeros_like(scores_models_all)

    for j in range(len(keys_list)):
        key = keys_list[j]
        hmms = concatHMMs(phoneHMMsAll, isolated[key])
        log_st_prob = np.log(hmms['startprob'])
        log_transmat = np.log(hmms['transmat'])
        for i in range(len(data)):
            lpr_test = log_multivariate_normal_density_diag(
                data[i]['lmfcc'], hmms['means'], hmms['covars'])
            loglik, path = viterbi(lpr_test, log_st_prob, log_transmat)
            scores_models_all[i, j] = loglik

        hmms = concatHMMs(phoneHMMsOne, isolated[key])
        log_st_prob = np.log(hmms['startprob'])
        log_transmat = np.log(hmms['transmat'])
        for i in range(len(data)):
            lpr_test = log_multivariate_normal_density_diag(
                data[i]['lmfcc'], hmms['means'], hmms['covars'])
            loglik, path = viterbi(lpr_test, log_st_prob, log_transmat)
            scores_models_onespkr[i, j] = loglik

    predict_all = np.argmax(scores_models_all, axis=1)
    predict_one = np.argmax(scores_models_onespkr, axis=1)

    label_all = [keys_list[x] for x in predict_all]
    label_one = [keys_list[x] for x in predict_one]

    true_label = [data[x]['digit'] for x in range(len(data))]
    print(true_label)
    print(label_all)
    print(label_one)
Beispiel #7
0

if __name__ == "__main__":
    # load data
    data = np.load('data/lab2_data.npz')['data']
    example = np.load('data/lab2_example.npz')['example'].item()
    phoneHMMs = np.load('data/lab2_models_onespkr.npz')['phoneHMMs'].item()

    # Build hmm
    wordHMMs = {}
    wordHMMs['o'] = concatHMMs(phoneHMMs, isolated['o'])

    trans_mat = wordHMMs['o']['transmat'][:-1, :-1]
    pi_vec = wordHMMs['o']['startprob'][:-1]
    # =====================================================
    best_seq_loglik, best_path = viterbi(example['obsloglik'], np.log(pi_vec),
                                         np.log(trans_mat))
    assert np.allclose(best_seq_loglik, example['vloglik'])
    assert np.array_equal(best_path, example['vpath'])
    # ========================================================================
    onespkr_wordHMMs = {}
    for k in isolated.keys():
        onespkr_wordHMMs[k] = concatHMMs(phoneHMMs, isolated[k])

    phoneHMMs_all = np.load('data/lab2_models_all.npz')['phoneHMMs'].item()
    for d in isolated.keys():
        wordHMMs[d] = concatHMMs(phoneHMMs_all, isolated[d])

    st = time()
    resp = match_model_and_utterances(data, onespkr_wordHMMs)
    onespkr_match = pd.DataFrame(resp)
    resp = match_model_and_utterances(data, wordHMMs)