def textgridError(textgrid_path, syllableTierName, phonemeTierName):
    '''
    find annotation errors: phoneme not in the dic_pho_map keys list
    '''
    recordings = getRecordings(textgrid_path)
    error = []
    for recording in recordings:
        print 'processing recording:', recording
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, syllableTierName, phonemeTierName)
        for pho in nestedPhonemeLists:
            if len(pho[1]) > 5 or pho[0][2] not in dic_pinyin_2_initial_final_map.keys():
                errorInfo = (recording, str(pho[0][0]), pho[0][2],
                             str([p[2] for p in pho[1]]))
                error.append(errorInfo)
            for p in pho[1]:
                ##-- for debugging the TextGrid phoneme annotation
                if p[2] not in dic_pho_map.keys():
                    errorInfo = (recording, str(p[0]), p[2])
                    error.append(errorInfo)

    with open('textgridError.txt', 'wb') as f:
        for errorInfo in error:
            f.write(' '.join(errorInfo))
            f.write('\n')
        f.write('\n')
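# Typical invocation (a sketch; the module-level textgrid_path and the
# 'dian'/'details' tier names follow the conventions used elsewhere in
# this repo):
#
# textgridError(textgrid_path, 'dian', 'details')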
def detectedBoundariesOutput(recording, varin):

    # varin['feature_select'] = feature_string
    #
    # varin['vuvCorrection'] = True
    #
    # h2 = 0.02
    # alpha = 0.2
    # p_lambda = 0.1
    #
    # varin['h2'] = h2
    # varin['alpha'] = alpha
    # varin['p_lambda'] = p_lambda
    #
    # print 'evaluate ', recording, ' l,h1,h2', varin['h2'], varin['alpha'], varin['p_lambda']

    nestedPhonemeLists, numSyllables, numPhonemes = syllableTextgridExtraction(
        textgrid_path, recording, 'pinyin', 'details')

    feature_Syllables, mfcc_syllables, spec_syllables, feature_vuv_syllables = featureSyllableSegmentation(
        feature_path, recording, nestedPhonemeLists, varin)

    groundtruthBoundariesSong, detectedBoundariesSong, groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong = \
        eval4oneSong(feature_Syllables, mfcc_syllables, spec_syllables,
                     feature_vuv_syllables, nestedPhonemeLists,
                     varin['phonemeSegFunction'], varin)

    return groundtruthBoundariesSong, detectedBoundariesSong, groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong
def dump_feature_phn(wav_path, textgrid_path, recordings, syllableTierName, phonemeTierName):
    """
    Dump feature for each phoneme
    :param wav_path:
    :param textgrid_path:
    :param recordings:
    :param syllableTierName:
    :param phonemeTierName:
    :return: dictionary mapping phoneme class -> stacked log-mel frames
    """
    dic_pho_feature = {}
    for pho in set(dic_pho_map.values()):
        dic_pho_feature[pho] = np.array([])

    for artist_path, recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path=textgrid_path,
                                         recording=join(artist_path, recording),
                                         tier0=syllableTierName,
                                         tier1=phonemeTierName)

        # audio
        wav_full_filename = join(wav_path, artist_path, recording + '.wav')
        log_mel = get_log_mel_madmom(wav_full_filename, fs, hopsize_t, channel=1)

        for ii, pho in enumerate(nestedPhonemeLists):
            print 'calculating ', recording, ' and phoneme ', str(ii), ' of ', str(len(nestedPhonemeLists))
            for p in pho[1]:
                # map from annotated xsampa to readable notation
                try:
                    key = dic_pho_map[p[2]]
                except KeyError:
                    print(artist_path, recording)
                    print(ii, p[2])
                    raise

                sf = int(round(p[0] * fs / float(hopsize)))  # starting frame
                ef = int(round(p[1] * fs / float(hopsize)))  # ending frame

                log_mel_phn = log_mel[sf:ef, :]  # log-mel frames of the phoneme

                if not len(dic_pho_feature[key]):
                    dic_pho_feature[key] = log_mel_phn
                else:
                    dic_pho_feature[key] = np.vstack((dic_pho_feature[key], log_mel_phn))

    return dic_pho_feature
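# A minimal sketch of turning the per-phoneme feature dict into training
# matrices; dic_pho_label (phoneme class -> integer index) is the mapping
# used in doClassification below, the helper name itself is assumed:

def build_training_data(dic_pho_feature):
    X, y = [], []
    for key in dic_pho_feature:
        feats = dic_pho_feature[key]
        if not len(feats):
            continue  # skip phoneme classes with no observed frames
        X.append(feats)
        y += [dic_pho_label[key]] * feats.shape[0]
    return np.vstack(X), np.array(y)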
def getValidTransGt(textgrid_path, syllableTierName, phonemeTierName):
    recordings = getRecordingNames('TEST', dataset)
    numValidTrans = 0
    for recording in recordings:
        print 'get valid trans gt processing recording:', recording
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, syllableTierName, phonemeTierName)
        for pho in nestedPhonemeLists:
            if pho[1][0][2] in ['m', 'l', 'n', 'c', 'f', 'k', 's', 'x', "r\\'", 'w', 'j']:
                numValidTrans += 1
            for ii in range(len(pho[1]) - 1):
                if pho[1][ii][2] + '_' + pho[1][ii + 1][2] in \
                        tails_comb_i + tails_comb_N + tails_comb_n + tails_comb_u:
                    numValidTrans += 1
    return numValidTrans
def findTestRecordingNumber(textgrid_path, syllableTierName, phonemeTierName):
    '''
    find test recording numbers
    '''
    recordings = getRecordings(textgrid_path)
    boundaries = []
    for recording in recordings:
        print 'processing recording:', recording
        boundaries_oneSong = 0
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, syllableTierName, phonemeTierName)
        for nestedPho in nestedPhonemeLists:
            boundaries_oneSong += len(nestedPho[1]) - 1
        boundaries.append(boundaries_oneSong)

    proportion_testset = 0.25
    print 'processing boundary ...'
    print boundaries
    index_testset = testRecordings(boundaries, proportion_testset)  # output test set index
    return index_testset
def getTestRecordingNumber():
    # NOTE: the enclosing function definition was missing in the source;
    # the name here is assumed.
    number_recording = test_recordings
    return number_recording


if __name__ == '__main__':
    recordings = getRecordings(wav_path)
    boundaries = []
    numSyllable_all, numVoiced_all, numUnvoiced_all = 0, 0, 0
    lengthSyllable_all, lengthVoiced_all, lengthUnvoiced_all = [], [], []
    for recording in recordings:
        boundaries_oneSong = 0
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, 'pinyin', 'details')
        numSyllable_all += numSyllables
        for pho in nestedPhonemeLists:
            lengthSyllable_all.append(pho[0][1] - pho[0][0])
            for p in pho[1]:
                if p[2] == '':
                    continue
                if p[2] in ['c', 'k', 'f', 'x']:
                    numUnvoiced_all += 1
                    lengthUnvoiced_all.append(p[1] - p[0])
                else:
                    numVoiced_all += 1
                    lengthVoiced_all.append(p[1] - p[0])
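    # Summary output (a sketch, not in the original block; the format
    # mirrors the print conventions of textgridStat below):
    print 'num syllables %i, voiced phonemes %i, unvoiced phonemes %i' % (
        numSyllable_all, numVoiced_all, numUnvoiced_all)
    print 'avg len (s) syllables %.3f, voiced %.3f, unvoiced %.3f' % (
        np.mean(lengthSyllable_all), np.mean(lengthVoiced_all),
        np.mean(lengthUnvoiced_all))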
def textgridStat(textgrid_path, syllableTierName, phonemeTierName):
    '''
    syllableTierName: pinyin or dian
    phonemeTierName: details
    '''
    recordings = getRecordings(textgrid_path)
    # recordings = getRecordingNames('TEST', dataset)

    numLine_all, numSyllable_all, numVoiced_all, numUnvoiced_all = 0, 0, 0, 0
    lengthLine_all, lengthSyllable_all, lengthVoiced_all, lengthUnvoiced_all = [], [], [], []
    numVowels, numSemivowels, numDiphtongs, numCompoundfinals, \
        numNonvoicedconsonants, numVoicedconsonants, numSilornament = 0, 0, 0, 0, 0, 0, 0

    # from * transit to phoneme
    trans2n = []
    trans2i = []
    trans2N = []
    trans2u = []

    dict_numTrans_phoneme = {}
    for tp in trans_phoneme:
        dict_numTrans_phoneme[tp] = 0

    for recording in recordings:
        print 'processing recording:', recording

        nestedSyllableLists, numLines, numSyllables \
            = syllableTextgridExtraction(textgrid_path, recording, 'line', syllableTierName)
        numLine_all += numLines
        for line in nestedSyllableLists:
            lengthLine_all.append(line[0][1] - line[0][0])

        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, syllableTierName, phonemeTierName)
        numSyllable_all += numSyllables

        for pho in nestedPhonemeLists:
            lengthSyllable_all.append(pho[0][1] - pho[0][0])
            for p in pho[1]:
                if p[2] in ['c', 'k', 'f', 'x']:
                    numUnvoiced_all += 1
                    lengthUnvoiced_all.append(p[1] - p[0])
                else:
                    numVoiced_all += 1
                    lengthVoiced_all.append(p[1] - p[0])

            for p in pho[1]:
                if p[2] in vowels:
                    numVowels += 1
                elif p[2] in semivowels:
                    numSemivowels += 1
                elif p[2] in diphtongs:
                    numDiphtongs += 1
                elif p[2] in compoundfinals:
                    numCompoundfinals += 1
                elif p[2] in nonvoicedconsonants:
                    numNonvoicedconsonants += 1
                elif p[2] in voicedconsonants:
                    numVoicedconsonants += 1
                elif p[2] in silornament:
                    numSilornament += 1

        # transition
        for pho in nestedPhonemeLists:
            for ii in range(len(pho[1]) - 1):
                p0, p1 = transHelper(pho[1][ii][2], pho[1][ii + 1][2])
                for tp in trans_phoneme:
                    if p0 == tp.split('_')[0] and p1 == tp.split('_')[1]:
                        dict_numTrans_phoneme[tp] += 1

                if pho[1][ii + 1][2] == 'n':
                    trans2n.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'i':
                    trans2i.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'N':
                    trans2N.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])
                elif pho[1][ii + 1][2] == 'u':
                    trans2u.append(pho[1][ii][2] + '_' + pho[1][ii + 1][2])

    occurrence_threshold = sum(dict_numTrans_phoneme.values()) * 0.005
    sorted_numTrans_phoneme = sorted(dict_numTrans_phoneme.items(),
                                     key=operator.itemgetter(1))[::-1]
    sorted_numTrans_phoneme_threshed = []
    for sntp in sorted_numTrans_phoneme:
        if sntp[1] > occurrence_threshold:
            sorted_numTrans_phoneme_threshed.append(sntp)

    ##-- output statistics of the dataset
    print 'num recordings %i' % len(recordings)
    print 'num lines %i, num syllables %i, voiced phonemes %i, unvoiced phonemes %i' % (
        numLine_all, numSyllable_all, numVoiced_all, numUnvoiced_all)
    print 'avg len (s) lines %.3f, syllables %.3f, voiced phonemes %.3f, unvoiced phonemes %.3f' % (
        np.mean(lengthLine_all), np.mean(lengthSyllable_all),
        np.mean(lengthVoiced_all), np.mean(lengthUnvoiced_all))
    print 'std len (s) lines %.3f, syllables %.3f, voiced phonemes %.3f, unvoiced phonemes %.3f' % (
        np.std(lengthLine_all), np.std(lengthSyllable_all),
        np.std(lengthVoiced_all), np.std(lengthUnvoiced_all))
    print 'numVowels %d, numSemivowels %d, numDiphtongs %d, numCompoundfinals %d, numNonvoicedconsonants %d, numVoicedconsonants %d, numSilornament %d' % (
        numVowels, numSemivowels, numDiphtongs, numCompoundfinals,
        numNonvoicedconsonants, numVoicedconsonants, numSilornament)
    print sorted_numTrans_phoneme_threshed
    print set(trans2n)
    print set(trans2i)
    print set(trans2N)
    print set(trans2u)
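# A toy sketch of the nested annotation structure consumed by the
# statistics code above (values assumed for illustration):

def _demo_nested_structure():
    # syllableTextgridExtraction returns a list of (interval, phonemeList)
    # pairs, where each interval is (start_time, end_time, label); pho[0]
    # is the syllable interval and pho[1] holds its phoneme intervals.
    nestedPhonemeLists = [
        ((0.0, 0.8, 'jing'),
         [(0.0, 0.1, 'c'), (0.1, 0.5, 'i'), (0.5, 0.8, 'N')]),
    ]
    for pho in nestedPhonemeLists:
        for ii in range(len(pho[1]) - 1):
            # transition pairs as counted above, e.g. 'c_i', 'i_N'
            print pho[1][ii][2] + '_' + pho[1][ii + 1][2]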
def getDataAll(textgrid_path, recordings, varin):
    icdPatterns_all = []     # voiced incorrect patterns
    voicedPatterns_all = []  # voiced patterns, including incorrect patterns
    index_vp_all = []        # indices of detected boundaries which have voiced patterns
    f_s_all = []
    f_vuv_s_all = []
    spec_all = []
    pho_s_all = []
    gtb_all, db_all, gtbv_all, dbv_all = [], [], [], []

    # recording level
    for recording in recordings:
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, 'pinyin', 'details')
        feature_syllables, feature_vuv_syllables, spec_syllables, phoneme_syllables \
            = featureSyllableSegmentation(feature_path, recording, nestedPhonemeLists, varin)
        groundtruthBoundariesSong, detectedBoundariesSong, \
        groundtruthBoundariesVoicedSong, detectedBoundariesVoicedSong \
            = detectedBoundariesOutput(recording, varin)

        f_s_all.append(feature_syllables)
        f_vuv_s_all.append(feature_vuv_syllables)
        spec_all.append(spec_syllables)
        pho_s_all.append(phoneme_syllables)

        gtb_all.append(groundtruthBoundariesSong)
        db_all.append(detectedBoundariesSong)
        gtbv_all.append(groundtruthBoundariesVoicedSong)
        dbv_all.append(detectedBoundariesVoicedSong)

        icdPatterns_song = []
        voicedPatterns_song = []
        index_vp_song = []

        # syllable level
        for ii in range(len(groundtruthBoundariesVoicedSong)):
            groundtruthBoundariesVoiced = groundtruthBoundariesVoicedSong[ii]
            detectedBoundariesVoiced = detectedBoundariesVoicedSong[ii]
            icd = incorrectDetection(groundtruthBoundariesVoiced,
                                     detectedBoundariesVoiced,
                                     varin['tolerance'])
            feature = feature_syllables[ii]
            icdPatterns, _ = icdPatternCollection(feature, icd, varin)
            if len(icdPatterns):
                icdPatterns = np.vstack(icdPatterns)
                icdPatterns_song.append(icdPatterns)

            voicedPatterns, index_vp = icdPatternCollection(feature, detectedBoundariesVoiced, varin)
            if len(voicedPatterns):
                voicedPatterns = np.vstack(voicedPatterns)
            # voicedPatterns may be empty
            voicedPatterns_song.append(voicedPatterns)
            index_vp_song.append(index_vp)

        icdPatterns_song = np.vstack(icdPatterns_song)
        icdPatterns_all.append(icdPatterns_song)
        voicedPatterns_all.append(voicedPatterns_song)
        index_vp_all.append(index_vp_song)

    icdPatterns_all = np.vstack(icdPatterns_all)

    return icdPatterns_all, voicedPatterns_all, index_vp_all, f_s_all, f_vuv_s_all, \
           spec_all, pho_s_all, gtb_all, db_all, gtbv_all, dbv_all
# varin['framesize'] = int(round(framesize_t*fs))
# varin['hopsize'] = int(round(hopsize_t*fs))

varin['N_pattern'] = 11
varin['N_feature'] = 36

recordings = getRecordings(wav_path)

####---- collect all features and phonemes
f_s_all = []
f_vuv_s_all = []
spec_all = []
pho_s_all = []
for recording in recordings:
    nestedPhonemeLists, numSyllables, numPhonemes = syllableTextgridExtraction(
        textgrid_path, recording, 'pinyin', 'details')
    feature_syllables, feature_vuv_syllables, spec_syllables, phoneme_syllables = featureSyllableSegmentation(
        feature_path, recording, nestedPhonemeLists, varin)
    f_s_all.append(feature_syllables)
    f_vuv_s_all.append(feature_vuv_syllables)
    spec_all.append(spec_syllables)
    pho_s_all.append(phoneme_syllables)

f_s_all = np.hstack(f_s_all)
f_vuv_s_all = np.hstack(f_vuv_s_all)
spec_all = np.hstack(spec_all)
pho_s_all = np.hstack(pho_s_all)

####--- patterns
patterns_voiced_change, _ = voicedChangePatternCollection(f_s_all, pho_s_all, varin)
patterns_voiced_unchange = voicedUnchangePatternCollection(f_s_all, pho_s_all, varin)
def predict(textgrid_path, feature_path, scaler_filename, svm_model_filename, recording, varin):
    hopsize = varin['hopsize']
    fs = varin['fs']
    framesize = varin['framesize']
    N = 2 * framesize

    scaler = joblib.load(scaler_filename)
    svm_model_object = joblib.load(svm_model_filename)

    sumNumGroundtruthIntervals, sumNumDetectedIntervals, sumNumCorrect = 0, 0, 0

    nestedPhonemeLists, numSyllables, numPhonemes \
        = syllableTextgridExtraction(textgrid_path, recording, 'pinyin', 'details')

    # classification feature
    feature = featureVUV(feature_path, recording, varin)

    for ii, nestedPho in enumerate(nestedPhonemeLists):
        print 'evaluate syllable ', ii + 1, ' in', len(nestedPhonemeLists)

        syllable_start_frame = int(round(nestedPho[0][0] * fs / hopsize))
        syllable_end_frame = int(round(nestedPho[0][1] * fs / hopsize))
        syllable_feature = feature[syllable_start_frame:syllable_end_frame, :]

        detectedBoundaries_interval = consonantInterval(syllable_feature, scaler,
                                                        svm_model_object, varin)

        ####---- merge interval into boundaries
        # if detectedBoundaries_interval:
        #     detectedBoundaries = np.hstack(detectedBoundaries_interval)
        # else:
        #     detectedBoundaries = np.array([])
        # detectedBoundaries = detectedBoundaries*hopsize/float(fs)

        # phonemes of the syllable
        phoList = nestedPhonemeLists[ii][1]
        syllable_start_time = phoList[0][0]
        groundtruthBoundaries_interval = []
        for pho in phoList:
            if pho[2] in ['c', 'k', 'f', 'x']:
                groundtruthBoundaries_interval.append(
                    [pho[0] - syllable_start_time, pho[1] - syllable_start_time])

        # # evaluate the consonant boundaries
        # numDetectedBoundaries, numGroundtruthBoundaries, numCorrect = \
        #     metrics.boundaryDetection(groundtruthBoundaries=groundtruthBoundaries,
        #                               detectedBoundaries=detectedBoundaries,
        #                               tolerance=varin['tolerance'])

        numDetectedIntervals, numGroundtruthIntervals, numCorrect = \
            metrics.intervalDetection(groundtruthBoundaries_interval,
                                      detectedBoundaries_interval,
                                      varin['tolerance'])

        # print numGroundtruthBoundaries, numDetectedBoundaries, numCorrect

        sumNumGroundtruthIntervals += numGroundtruthIntervals
        sumNumDetectedIntervals += numDetectedIntervals
        sumNumCorrect += numCorrect

        if varin['plot']:
            # load spectrogram
            spec_filename = os.path.join(
                feature_path, 'spec' + '_' + recording + '_' +
                str(varin['framesize']) + '_' + str(varin['hopsize']) + '.npy')
            spec = np.load(spec_filename)
            syllable_spec = spec[syllable_start_frame:syllable_end_frame, :]
            binFreqs = np.arange(syllable_spec.shape[1]) * fs / float(N)
            timestamps_spec = np.arange(syllable_spec.shape[0]) * (hopsize / float(fs))

            f, axarr = plt.subplots(2, sharex=True)
            axarr[0].pcolormesh(timestamps_spec, binFreqs,
                                20 * np.log10(syllable_spec.T + np.finfo(np.float).eps))
            for interval in detectedBoundaries_interval:
                axarr[0].axvspan(interval[0], interval[1], alpha=0.5, color='red')
            for interval in groundtruthBoundaries_interval:
                axarr[1].axvspan(interval[0], interval[1], alpha=0.5, color='red')
            plt.show()

    return sumNumGroundtruthIntervals, sumNumDetectedIntervals, sumNumCorrect
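# A minimal driver sketch for predict (the scaler/model file names here
# are assumed placeholders; precision and recall follow the usual
# correct/detected and correct/ground-truth definitions):

def evaluate_all_recordings(recordings, varin):
    sumGt, sumDetected, sumCorrect = 0, 0, 0
    for recording in recordings:
        gt, detected, correct = predict(textgrid_path, feature_path,
                                        'scaler.pkl', 'svm_model.pkl',
                                        recording, varin)
        sumGt += gt
        sumDetected += detected
        sumCorrect += correct
    precision = sumCorrect / float(sumDetected) if sumDetected else 0.0
    recall = sumCorrect / float(sumGt) if sumGt else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1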
def doClassification():
    """
    1. collect features from the test set
    2. predict with the GMM or DNN models
    3. save the predictions
    :return: predictions of the GMM and DNN models
    """
    phone_class = PhonemeClassification()
    phone_class.create_gmm(gmmModel_path)

    mfcc_all = np.array([])
    mfccBands1D_all = np.array([])
    mfccBands2D_all = np.array([])
    y_true = []

    for recording in getRecordingNames('TEST', dataset):
        nestedPhonemeLists, numSyllables, numPhonemes \
            = syllableTextgridExtraction(textgrid_path, recording, syllableTierName, phonemeTierName)

        wav_full_filename = os.path.join(wav_path, recording + '.wav')
        audio = ess.MonoLoader(downmix='left', filename=wav_full_filename, sampleRate=fs)()
        # plotAudio(audio, 15, 16)

        print 'calculating mfcc and mfcc bands ... ', recording
        mfcc = getFeature(audio, d=True, nbf=False)
        mfccBands1D = getMFCCBands1D(audio, nbf=True)
        mfccBands2D = getMFCCBands2D(audio, nbf=True)
        mfccBands2D = np.log(10000 * mfccBands2D + 1)

        # scale mfccBands1D for dnn acoustic models
        mfccBands1D_std = preprocessing.StandardScaler().fit_transform(mfccBands1D)

        # scale mfccBands2D for cnn acoustic models
        scaler = pickle.load(open(scaler_path, 'rb'))
        mfccBands2D_std = scaler.transform(mfccBands2D)

        for ii, pho in enumerate(nestedPhonemeLists):
            print 'calculating ', recording, ' and phoneme ', str(ii), ' of ', str(len(nestedPhonemeLists))

            # MFCC feature
            sf = int(round(pho[0][0] * fs / hopsize))  # starting frame
            ef = int(round(pho[0][1] * fs / hopsize))  # ending frame

            # features of the syllable
            mfcc_s = mfcc[sf:ef, :]
            mfccBands_s = mfccBands2D[sf:ef, :]
            mfccBands1D_s_std = mfccBands1D_std[sf:ef, :]
            mfccBands2D_s_std = mfccBands2D_std[sf:ef, :]

            if len(mfcc_all):
                mfcc_all = np.vstack((mfcc_all, mfcc_s))
                mfccBands1D_all = np.vstack((mfccBands1D_all, mfccBands1D_s_std))
                mfccBands2D_all = np.vstack((mfccBands2D_all, mfccBands2D_s_std))
            else:
                mfcc_all = mfcc_s
                mfccBands1D_all = mfccBands1D_s_std
                mfccBands2D_all = mfccBands2D_s_std
            # print mfcc_all.shape, mfccBands2D_all.shape

            ##-- parsing y_true
            y_true_s = []
            for ii_p, p in enumerate(pho[1]):
                # map from annotated xsampa to readable notation
                key = dic_pho_map[p[2]]
                index_key = dic_pho_label[key]
                y_true_s += [index_key] * int(round((p[1] - p[0]) / hopsize_t))

            # pad or trim the labels so they align with the feature frames
            print len(y_true_s), mfcc_s.shape[0]
            if len(y_true_s) > mfcc_s.shape[0]:
                y_true_s = y_true_s[:mfcc_s.shape[0]]
            elif len(y_true_s) < mfcc_s.shape[0]:
                y_true_s += [y_true_s[-1]] * (mfcc_s.shape[0] - len(y_true_s))

            y_true += y_true_s

    phone_class.mapb_gmm(mfcc_all)
    obs_gmm = phone_class.mapb_gmm_getter()
    y_pred_gmm = phone_class.prediction(obs_gmm)

    mfccBands2D_all = featureReshape(mfccBands2D_all)

    phone_class.mapb_keras(mfccBands2D_all, kerasModels_jordi_path, jordi=True)
    obs_cnn_jordi = phone_class.mapb_keras_getter()
    y_pred_jordi = phone_class.prediction(obs_cnn_jordi)

    phone_class.mapb_keras(mfccBands2D_all, kerasModels_choi_path)
    obs_cnn_choi = phone_class.mapb_keras_getter()
    y_pred_choi = phone_class.prediction(obs_cnn_choi)

    phone_class.mapb_keras(mfccBands1D_all, kerasModels_dnn_path)
    obs_dnn = phone_class.mapb_keras_getter()
    y_pred_dnn = phone_class.prediction(obs_dnn)

    np.save('./trainingData/y_pred_gmm.npy', y_pred_gmm)
    np.save('./trainingData/y_pred_jordi.npy', y_pred_jordi)
    np.save('./trainingData/y_pred_choi.npy', y_pred_choi)
    np.save('./trainingData/y_pred_dnn.npy', y_pred_dnn)
    np.save('./trainingData/y_true.npy', y_true)
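# A sketch of scoring the dumped predictions against the ground truth
# (frame-level accuracy; file names as saved by doClassification above,
# the helper name itself is assumed):

def score_predictions():
    y_true = np.load('./trainingData/y_true.npy')
    for model in ['gmm', 'jordi', 'choi', 'dnn']:
        y_pred = np.load('./trainingData/y_pred_%s.npy' % model)
        acc = np.mean(np.asarray(y_pred) == np.asarray(y_true))
        print 'frame-level accuracy %s: %.3f' % (model, acc)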