def textgridError(textgrid_path, syllableTierName, phonemeTierName):
    '''
    Find annotation errors in the TextGrid files: syllables with more than 5
    phonemes, pinyin labels missing from dic_pinyin_2_initial_final_map, and
    phoneme labels missing from dic_pho_map. The errors are written to
    textgridError.txt.
    '''

    recordings = getRecordings(textgrid_path)
    error = []
    for recording in recordings:
        print 'processing recording:', recording
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,syllableTierName,phonemeTierName)
        for pho in nestedPhonemeLists:
            if len(pho[1]) > 5 or pho[0][2] not in dic_pinyin_2_initial_final_map:
                errorInfo = (recording, str(pho[0][0]), pho[0][2],
                             str([p[2] for p in pho[1]]))
                error.append(errorInfo)
            for p in pho[1]:
                ##-- for debugging the TextGrid phoneme annotation
                if p[2] not in dic_pho_map:
                    errorInfo = (recording, str(p[0]), p[2])
                    error.append(errorInfo)
    with open('textgridError.txt', 'wb') as f:
        for errorInfo in error:
            f.write(' '.join(errorInfo))
            f.write('\n')
            f.write('\n')
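
# Illustration only (an assumption, not from the source): the nested structure
# that syllableTextgridExtraction is expected to return, inferred from how the
# functions in this file index it. Each entry pairs a syllable-tier interval
# (start, end, label) with the list of phoneme-tier intervals it contains; the
# concrete times and labels below are hypothetical.
example_nestedPhonemeLists = [
    ((0.00, 0.42, 'yang'),                      # pho[0]: syllable interval
     [(0.00, 0.10, 'j'), (0.10, 0.42, 'AN')]),  # pho[1]: phoneme intervals
]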
def detectedBoundariesOutput(recording, varin):

    # varin['feature_select'] = feature_string
    #
    # varin['vuvCorrection'] = True
    #
    # h2          = 0.02
    # alpha       = 0.2
    # p_lambda    = 0.1
    #
    # varin['h2']             = h2
    # varin['alpha']          = alpha
    # varin['p_lambda']       = p_lambda

    # print 'evaluate ', recording, ' l,h1,h2', varin['h2'], varin['alpha'], varin['p_lambda']

    nestedPhonemeLists, numSyllables, numPhonemes = syllableTextgridExtraction(
        textgrid_path, recording, 'pinyin', 'details')
    feature_Syllables, mfcc_syllables, spec_syllables, feature_vuv_syllables = featureSyllableSegmentation(
        feature_path, recording, nestedPhonemeLists, varin)
    groundtruthBoundariesSong, detectedBoundariesSong, groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong = \
        eval4oneSong(feature_Syllables, mfcc_syllables, spec_syllables, feature_vuv_syllables, nestedPhonemeLists,
                     varin['phonemeSegFunction'], varin)

    return groundtruthBoundariesSong, detectedBoundariesSong, groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong
def dump_feature_phn(wav_path, textgrid_path, recordings, syllableTierName,
                     phonemeTierName):
    """
    Dump feature for each phoneme
    :param wav_path:
    :param textgrid_path:
    :param recordings:
    :param syllableTierName:
    :param phonemeTierName:
    :return:
    """

    dic_pho_feature = {}

    for pho in set(dic_pho_map.values()):
        dic_pho_feature[pho] = np.array([])

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path=textgrid_path,
                                         recording=join(artist_path,recording),
                                         tier0=syllableTierName,
                                         tier1=phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')

        log_mel = get_log_mel_madmom(wav_full_filename,
                                     fs,
                                     hopsize_t,
                                     channel=1)

        for ii, pho in enumerate(nestedPhonemeLists):
            print 'calculating ', recording, ' and phoneme ', str(
                ii), ' of ', str(len(nestedPhonemeLists))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                log_mel_phn = log_mel[sf:ef, :]  # phoneme syllable

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = log_mel_phn
                else:
                    dic_pho_feature[key] = np.vstack(
                        (dic_pho_feature[key], log_mel_phn))

    return dic_pho_feature
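
# A hedged usage sketch (assumption, not from the source): dump the per-phoneme
# log-mel frames and pickle the resulting dictionary. recordings_train and
# feature_output_path are hypothetical names; wav_path, textgrid_path and the
# tier names follow the conventions used elsewhere in this file.
def example_dump_feature_phn(recordings_train, feature_output_path):
    import pickle
    from os.path import join
    dic_pho_feature = dump_feature_phn(wav_path=wav_path,
                                       textgrid_path=textgrid_path,
                                       recordings=recordings_train,
                                       syllableTierName='pinyin',
                                       phonemeTierName='details')
    with open(join(feature_output_path, 'dic_pho_feature.pkl'), 'wb') as f:
        pickle.dump(dic_pho_feature, f)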
def getValidTransGt(textgrid_path, syllableTierName, phonemeTierName):
    recordings = getRecordingNames('TEST', dataset)
    numValidTrans = 0
    for recording in recordings:
        print 'get valid trans gt processing recording:', recording
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,syllableTierName,phonemeTierName)
        for pho in nestedPhonemeLists:
            if pho[1][0][2] in [
                    'm', 'l', 'n', 'c', 'f', 'k', 's', 'x', "r\\'", 'w', 'j'
            ]:
                numValidTrans += 1
            for ii in range(len(pho[1]) - 1):
                trans = pho[1][ii][2] + '_' + pho[1][ii + 1][2]
                if trans in tails_comb_i + tails_comb_N + tails_comb_n + tails_comb_u:
                    numValidTrans += 1
    return numValidTrans
def findTestRecordingNumber(textgrid_path, syllableTierName, phonemeTierName):
    '''
    Select test recordings covering roughly 25% of the syllable boundaries
    (via testRecordings) and return the indices of the selected recordings.
    '''
    recordings = getRecordings(textgrid_path)
    boundaries = []
    for recording in recordings:
        print 'processing recording:', recording
        boundaries_oneSong = 0
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,syllableTierName,phonemeTierName)
        for nestedPho in nestedPhonemeLists:
            boundaries_oneSong += len(nestedPho[1]) - 1
        boundaries.append(boundaries_oneSong)
    proportion_testset = 0.25
    print 'processing boundary ...'
    print boundaries
    index_testset = testRecordings(boundaries, proportion_testset)
    # output test set index
    return index_testset
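
# Hedged sketch (an assumption): testRecordings is taken to return indices into
# the recordings list, so mapping the selected test set back to recording names
# could look like this.
def example_test_recording_names(textgrid_path, syllableTierName, phonemeTierName):
    recordings = getRecordings(textgrid_path)
    index_testset = findTestRecordingNumber(textgrid_path, syllableTierName,
                                            phonemeTierName)
    return [recordings[ii] for ii in index_testset]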

if __name__ == '__main__':

    recordings = getRecordings(wav_path)

    boundaries  = []
    numSyllable_all, numVoiced_all, numUnvoiced_all = 0,0,0
    lengthSyllable_all, lengthVoiced_all, lengthUnvoiced_all = [],[],[]
    for recording in recordings:

        boundaries_oneSong  = 0
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,'pinyin','details')

        numSyllable_all += numSyllables

        for pho in nestedPhonemeLists:
            lengthSyllable_all.append(pho[0][1]-pho[0][0])
            for p in pho[1]:
                if p[2] == '':
                    continue
                if p[2] in ['c','k','f','x']:
                    numUnvoiced_all += 1
                    lengthUnvoiced_all.append(p[1]-p[0])
                else:
                    numVoiced_all   += 1
                    lengthVoiced_all.append(p[1]-p[0])
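
    # A possible summary printout (an addition; the loop above only accumulates
    # the counts), following the reporting style of textgridStat.
    print 'num syllables %i, voiced phonemes %i, unvoiced phonemes %i' % (
        numSyllable_all, numVoiced_all, numUnvoiced_all)
    print 'avg len (s) syllables %.3f, voiced %.3f, unvoiced %.3f' % (
        np.mean(lengthSyllable_all), np.mean(lengthVoiced_all),
        np.mean(lengthUnvoiced_all))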
def textgridStat(textgrid_path, syllableTierName, phonemeTierName):
    '''
    Print statistics of the TextGrid annotations: counts and durations of lines,
    syllables and phoneme classes, plus the most frequent phoneme transitions.
    syllableTierName: 'pinyin' or 'dian'
    phonemeTierName: 'details'
    '''

    recordings = getRecordings(textgrid_path)
    # recordings = getRecordingNames('TEST',dataset)

    numLine_all, numSyllable_all, numVoiced_all, numUnvoiced_all = 0, 0, 0, 0
    lengthLine_all, lengthSyllable_all, lengthVoiced_all, lengthUnvoiced_all = [],[],[],[]
    numVowels, numSemivowels, numDiphtongs, numCompoundfinals, \
    numNonvoicedconsonants, numVoicedconsonants, numSilornament = 0,0,0,0,0,0,0

    # transitions from any preceding phoneme into the tails n, i, N, u
    trans2n = []
    trans2i = []
    trans2N = []
    trans2u = []

    dict_numTrans_phoneme = {}
    for tp in trans_phoneme:
        dict_numTrans_phoneme[tp] = 0

    for recording in recordings:
        print 'processing recording:', recording
        nestedSyllableLists, numLines, numSyllables \
            = syllableTextgridExtraction(textgrid_path,recording,'line',syllableTierName)
        numLine_all += numLines
        for line in nestedSyllableLists:
            lengthLine_all.append(line[0][1] - line[0][0])


        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,syllableTierName,phonemeTierName)
        numSyllable_all += numSyllables
        for pho in nestedPhonemeLists:
            lengthSyllable_all.append(pho[0][1] - pho[0][0])
            for p in pho[1]:
                if p[2] in ['c', 'k', 'f', 'x']:
                    numUnvoiced_all += 1
                    lengthUnvoiced_all.append(p[1] - p[0])
                else:
                    numVoiced_all += 1
                    lengthVoiced_all.append(p[1] - p[0])
            for p in pho[1]:
                if p[2] in vowels:
                    numVowels += 1
                elif p[2] in semivowels:
                    numSemivowels += 1
                elif p[2] in diphtongs:
                    numDiphtongs += 1
                elif p[2] in compoundfinals:
                    numCompoundfinals += 1
                elif p[2] in nonvoicedconsonants:
                    numNonvoicedconsonants += 1
                elif p[2] in voicedconsonants:
                    numVoicedconsonants += 1
                elif p[2] in silornament:
                    numSilornament += 1

        # transition
        for pho in nestedPhonemeLists:

            for ii in range(len(pho[1]) - 1):
                p0, p1 = transHelper(pho[1][ii][2], pho[1][ii + 1][2])

                for tp in trans_phoneme:
                    if p0 == tp.split('_')[0] and p1 == tp.split('_')[1]:
                        dict_numTrans_phoneme[tp] += 1

                if pho[1][ii + 1][2] == 'n':
                    trans2n.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'i':
                    trans2i.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'N':
                    trans2N.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'u':
                    trans2u.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])

    occurrence_threshold = sum(dict_numTrans_phoneme.values()) * 0.005
    sorted_numTrans_phoneme = sorted(dict_numTrans_phoneme.items(),
                                     key=operator.itemgetter(1))[::-1]

    sorted_numTrans_phoneme_threshed = []
    for sntp in sorted_numTrans_phoneme:
        if sntp[1] > occurrence_threshold:
            sorted_numTrans_phoneme_threshed.append(sntp)

    ##-- output statistics of the dataset
    print 'num recordings %i' % len(recordings)
    print 'num lines %i, num syllables %i, voiced phonemes %i, unvoiced phonemes %i' % (
        numLine_all, numSyllable_all, numVoiced_all, numUnvoiced_all)
    print 'avg len (s) lines %.3f, syllables %.3f, voiced phonemes %.3f, unvoiced phonemes %.3f' % (
        np.mean(lengthLine_all), np.mean(lengthSyllable_all),
        np.mean(lengthVoiced_all), np.mean(lengthUnvoiced_all))
    print 'std len (s) lines %.3f, syllables %.3f, voiced phonemes %.3f, unvoiced phonemes %.3f' % (
        np.std(lengthLine_all), np.std(lengthSyllable_all),
        np.std(lengthVoiced_all), np.std(lengthUnvoiced_all))
    print 'numVowels %d, numSemivowels %d, numDiphtongs %d, numCompoundfinals %d, numNonvoicedconsonants %d, numVoicedconsonants %d, numSilornament %d' % (
        numVowels, numSemivowels, numDiphtongs, numCompoundfinals,
        numNonvoicedconsonants, numVoicedconsonants, numSilornament)
    print sorted_numTrans_phoneme_threshed

    print set(trans2n)
    print set(trans2i)
    print set(trans2N)
    print set(trans2u)
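
def example_textgrid_stat():
    # Hedged usage sketch: the tier names ('pinyin' syllables over 'details'
    # phonemes) follow the other calls in this file; textgrid_path is the
    # module-level variable.
    textgridStat(textgrid_path, 'pinyin', 'details')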
def getDataAll(textgrid_path, recordings, varin):

    icdPatterns_all = []  # voiced, incorrectly detected patterns
    voicedPatterns_all = []  # voiced patterns, including the incorrect ones
    index_vp_all = []  # indices of detected boundaries that have voiced patterns
    f_s_all = []
    f_vuv_s_all = []
    spec_all = []
    pho_s_all = []
    gtb_all, db_all, gtbv_all, dbv_all = [], [], [], []

    # recordings level
    for recording in recordings:

        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,'pinyin','details')

        feature_syllables, feature_vuv_syllables, spec_syllables, phoneme_syllables \
            = featureSyllableSegmentation(feature_path, recording, nestedPhonemeLists,varin)

        groundtruthBoundariesSong, detectedBoundariesSong, \
        groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong \
            = detectedBoundariesOutput(recording,varin)

        f_s_all.append(feature_syllables)
        f_vuv_s_all.append(feature_vuv_syllables)
        spec_all.append(spec_syllables)
        pho_s_all.append(phoneme_syllables)

        gtb_all.append(groundtruthBoundariesSong)
        db_all.append(detectedBoundariesSong)
        gtbv_all.append(groundtruthBoundariesVoicedSong)
        dbv_all.append(detectedBoundariesVoicedSong)

        icdPatterns_song = []
        voicedPatterns_song = []
        index_vp_song = []

        # syllable level
        for ii in range(len(groundtruthBoundariesVoicedSong)):

            groundtruthBoundariesVoiced = groundtruthBoundariesVoicedSong[ii]
            detectedBoundariesVoiced = detectedBoundariesVoicedSong[ii]
            icd = incorrectDetection(groundtruthBoundariesVoiced,
                                     detectedBoundariesVoiced,
                                     varin['tolerance'])

            feature = feature_syllables[ii]
            icdPatterns, _ = icdPatternCollection(feature, icd, varin)
            if len(icdPatterns):
                icdPatterns = np.vstack(icdPatterns)
                icdPatterns_song.append(icdPatterns)

            voicedPatterns, index_vp = icdPatternCollection(
                feature, detectedBoundariesVoiced, varin)
            if len(voicedPatterns):
                voicedPatterns = np.vstack(voicedPatterns)
            # voicedPatterns may be empty
            voicedPatterns_song.append(voicedPatterns)
            index_vp_song.append(index_vp)

        icdPatterns_song = np.vstack(icdPatterns_song)
        icdPatterns_all.append(icdPatterns_song)

        voicedPatterns_all.append(voicedPatterns_song)
        index_vp_all.append(index_vp_song)

    icdPatterns_all = np.vstack(icdPatterns_all)

    return icdPatterns_all, voicedPatterns_all, index_vp_all, f_s_all, f_vuv_s_all, spec_all, pho_s_all, gtb_all, db_all, gtbv_all, dbv_all
#
# varin['framesize']   = int(round(framesize_t*fs))
# varin['hopsize']     = int(round(hopsize_t*fs))

varin['N_pattern']   = 11
varin['N_feature']   = 36

recordings = getRecordings(wav_path)

####---- collect all features and phonemes
f_s_all             = []
f_vuv_s_all         = []
spec_all            = []
pho_s_all           = []
for recording in recordings:
    nestedPhonemeLists, numSyllables, numPhonemes = syllableTextgridExtraction(textgrid_path,recording,'pinyin','details')
    feature_syllables, feature_vuv_syllables, spec_syllables, phoneme_syllables = featureSyllableSegmentation(feature_path, recording, nestedPhonemeLists,varin)
    f_s_all.append(feature_syllables)
    f_vuv_s_all.append(feature_vuv_syllables)
    spec_all.append(spec_syllables)
    pho_s_all.append(phoneme_syllables)

f_s_all     = np.hstack(f_s_all)
f_vuv_s_all = np.hstack(f_vuv_s_all)
spec_all    = np.hstack(spec_all)
pho_s_all   = np.hstack(pho_s_all)

####--- patterns
patterns_voiced_change,_        = voicedChangePatternCollection(f_s_all, pho_s_all, varin)
patterns_voiced_unchange        = voicedUnchangePatternCollection(f_s_all, pho_s_all, varin)
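
# Hedged sketch (an assumption, not in the source): the change / unchange pattern
# sets could be stacked into a labelled matrix for training a pattern classifier;
# the label convention (1 = change, 0 = unchange) is assumed here.
X_patterns = np.vstack((patterns_voiced_change, patterns_voiced_unchange))
y_patterns = np.concatenate((np.ones(len(patterns_voiced_change)),
                             np.zeros(len(patterns_voiced_unchange))))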
def predict(textgrid_path,feature_path,scaler_filename,svm_model_filename,recording,varin):

    hopsize         = varin['hopsize']
    fs              = varin['fs']
    framesize       = varin['framesize']
    N               = 2*framesize

    scaler          = joblib.load(scaler_filename)
    svm_model_object= joblib.load(svm_model_filename)

    sumNumGroundtruthIntervals,sumNumDetectedIntervals,sumNumCorrect = 0,0,0

    nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, 'pinyin', 'details')

    # classification feature
    feature                 = featureVUV(feature_path,recording,varin)

    for ii, nestedPho in enumerate(nestedPhonemeLists):

        print 'evaluate syllable ', ii+1, ' in', len(nestedPhonemeLists)

        syllable_start_frame    = int(round(nestedPho[0][0]*fs/hopsize))
        syllable_end_frame      = int(round(nestedPho[0][1]*fs/hopsize))
        syllable_feature        = feature[syllable_start_frame:syllable_end_frame,:]

        detectedBoundaries_interval = consonantInterval(syllable_feature,scaler,svm_model_object,varin)

        ####---- merge interval into boundaries
        # if detectedBoundaries_interval:
        #     detectedBoundaries = np.hstack(detectedBoundaries_interval)
        # else:
        #     detectedBoundaries = np.array([])
        #
        # detectedBoundaries = detectedBoundaries*hopsize/float(fs)

        # phonemes of syllable
        phoList                 = nestedPhonemeLists[ii][1]
        syllable_start_time     = phoList[0][0]
        groundtruthBoundaries_interval   = []

        for pho in phoList:
            if pho[2] in ['c','k','f','x']:
                groundtruthBoundaries_interval.append([pho[0]-syllable_start_time,pho[1]-syllable_start_time])


        # # evaluate the consonant boundaries
        # numDetectedBoundaries, numGroundtruthBoundaries, numCorrect = \
        #     metrics.boundaryDetection(groundtruthBoundaries=groundtruthBoundaries,
        #                           detectedBoundaries=detectedBoundaries,
        #                           tolerance=varin['tolerance'])

        numDetectedIntervals, numGroundtruthIntervals, numCorrect = \
        metrics.intervalDetection(groundtruthBoundaries_interval,detectedBoundaries_interval,varin['tolerance'])

        # print numGroundtruthBoundaries, numDetectedBoundaries,numCorrect

        sumNumGroundtruthIntervals += numGroundtruthIntervals
        sumNumDetectedIntervals    += numDetectedIntervals
        sumNumCorrect              += numCorrect


        if varin['plot']:
            # load spectrogram
            spec_filename   = os.path.join(feature_path,'spec'+'_'+recording+'_'
                                    +str(varin['framesize'])+'_'+str(varin['hopsize'])+'.npy')
            spec            = np.load(spec_filename)
            syllable_spec   = spec[syllable_start_frame:syllable_end_frame,:]
            binFreqs        = np.arange(syllable_spec.shape[1])*fs/float(N)
            timestamps_spec = np.arange(syllable_spec.shape[0]) * (hopsize/float(fs))

            f, axarr = plt.subplots(2, sharex=True)
            axarr[0].pcolormesh(timestamps_spec,binFreqs,20*np.log10(syllable_spec.T+np.finfo(np.float).eps))
            for interval in detectedBoundaries_interval:
                axarr[0].axvspan(interval[0], interval[1], alpha=0.5, color='red')
            for interval in groundtruthBoundaries_interval:
                axarr[1].axvspan(interval[0], interval[1], alpha=0.5, color='red')
            plt.show()

    return sumNumGroundtruthIntervals,sumNumDetectedIntervals,sumNumCorrect
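
# Hedged usage sketch: turn the interval counts returned by predict() into
# precision / recall / F-measure (standard formulas; the scaler and SVM model
# filenames are hypothetical).
def example_evaluate_consonant_intervals(recording, varin):
    numGt, numDetected, numCorrect = predict(textgrid_path, feature_path,
                                             'scaler.pkl', 'svm_model.pkl',
                                             recording, varin)
    precision = numCorrect / float(numDetected) if numDetected else 0.0
    recall = numCorrect / float(numGt) if numGt else 0.0
    fmeasure = (2 * precision * recall / (precision + recall)
                if precision + recall else 0.0)
    return precision, recall, fmeasure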
def doClassification():
    """
    1. collect features from test set
    2. predict by GMM or DNN models
    3. save the prediction
    :return: prediction of GMM and DNN model
    """

    phone_class = PhonemeClassification()
    phone_class.create_gmm(gmmModel_path)

    mfcc_all = np.array([])
    mfccBands1D_all = np.array([])
    mfccBands2D_all = np.array([])

    y_true = []

    for recording in getRecordingNames('TEST', dataset):
        nestedPhonemeLists, numSyllables, numPhonemes   \
            = syllableTextgridExtraction(textgrid_path,recording,syllableTierName,phonemeTierName)

        wav_full_filename = os.path.join(wav_path, recording + '.wav')
        audio = ess.MonoLoader(downmix='left',
                               filename=wav_full_filename,
                               sampleRate=fs)()

        # plotAudio(audio,15,16)

        print 'calculating mfcc and mfcc bands ... ', recording
        mfcc = getFeature(audio, d=True, nbf=False)
        mfccBands1D = getMFCCBands1D(audio, nbf=True)
        mfccBands2D = getMFCCBands2D(audio, nbf=True)
        mfccBands2D = np.log(10000 * mfccBands2D + 1)

        # scale mfccBands1D for dnn acoustic models
        mfccBands1D_std = preprocessing.StandardScaler().fit_transform(
            mfccBands1D)

        # scale mfccBands2D for cnn acoustic models
        scaler = pickle.load(open(scaler_path, 'rb'))
        mfccBands2D_std = scaler.transform(mfccBands2D)

        for ii, pho in enumerate(nestedPhonemeLists):

            print 'calculating ', recording, ' and phoneme ', str(
                ii), ' of ', str(len(nestedPhonemeLists))

            # MFCC feature: frame indices of the syllable
            sf = int(round(pho[0][0] * fs / hopsize))
            ef = int(round(pho[0][1] * fs / hopsize))

            # mfcc syllable
            mfcc_s = mfcc[sf:ef, :]
            mfccBands_s = mfccBands2D[sf:ef, :]
            mfccBands1D_s_std = mfccBands1D_std[sf:ef, :]
            mfccBands2D_s_std = mfccBands2D_std[sf:ef, :]

            if len(mfcc_all):
                mfcc_all = np.vstack((mfcc_all, mfcc_s))
                mfccBands1D_all = np.vstack(
                    (mfccBands1D_all, mfccBands1D_s_std))
                mfccBands2D_all = np.vstack(
                    (mfccBands2D_all, mfccBands2D_s_std))
            else:
                mfcc_all = mfcc_s
                mfccBands1D_all = mfccBands1D_s_std
                mfccBands2D_all = mfccBands2D_s_std

            # print mfcc_all.shape, mfccBands2D_all.shape

            ##-- parsing y_true
            y_true_s = []
            for ii_p, p in enumerate(pho[1]):
                # map from annotated xsampa to readable notation
                key = dic_pho_map[p[2]]
                index_key = dic_pho_label[key]
                y_true_s += [index_key] * int(round((p[1] - p[0]) / hopsize_t))

            print len(y_true_s), mfcc_s.shape[0]

            if len(y_true_s) > mfcc_s.shape[0]:
                y_true_s = y_true_s[:mfcc_s.shape[0]]
            elif len(y_true_s) < mfcc_s.shape[0]:
                y_true_s += [y_true_s[-1]] * (mfcc_s.shape[0] - len(y_true_s))

            y_true += y_true_s

    phone_class.mapb_gmm(mfcc_all)
    obs_gmm = phone_class.mapb_gmm_getter()
    y_pred_gmm = phone_class.prediction(obs_gmm)

    mfccBands2D_all = featureReshape(mfccBands2D_all)

    phone_class.mapb_keras(mfccBands2D_all, kerasModels_jordi_path, jordi=True)
    obs_cnn_jordi = phone_class.mapb_keras_getter()
    y_pred_jordi = phone_class.prediction(obs_cnn_jordi)

    phone_class.mapb_keras(mfccBands2D_all, kerasModels_choi_path)
    obs_cnn_choi = phone_class.mapb_keras_getter()
    y_pred_choi = phone_class.prediction(obs_cnn_choi)

    phone_class.mapb_keras(mfccBands1D_all, kerasModels_dnn_path)
    obs_dnn = phone_class.mapb_keras_getter()
    y_pred_dnn = phone_class.prediction(obs_dnn)

    np.save('./trainingData/y_pred_gmm.npy', y_pred_gmm)
    np.save('./trainingData/y_pred_jordi.npy', y_pred_jordi)
    np.save('./trainingData/y_pred_choi.npy', y_pred_choi)
    np.save('./trainingData/y_pred_dnn.npy', y_pred_dnn)

    np.save('./trainingData/y_true.npy', y_true)
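
# Hedged usage sketch: reload the dumped predictions and compare each model
# against the ground-truth frame labels (plain frame-level accuracy; the metric
# choice is an assumption, not from the source).
def example_frame_accuracy():
    y_true = np.load('./trainingData/y_true.npy')
    for model_name in ['gmm', 'jordi', 'choi', 'dnn']:
        y_pred = np.load('./trainingData/y_pred_%s.npy' % model_name)
        acc = np.mean(np.asarray(y_pred) == np.asarray(y_true))
        print 'frame accuracy %s: %.3f' % (model_name, acc)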