def evalAccuracyTest(): ######### for test logic see WordLevelEvaluator instead PATH_TEST_DATASET = '../example/' annotationURI = os.path.join(PATH_TEST_DATASET, 'grTruth.TextGrid') # load from file # detectedURI = os.path.join(PATH_TEST_DATASET, audioName + '.phrasesDurationAligned') detectedTokenList = readListOfListTextFile(os.path.join(PATH_TEST_DATASET, 'detected.aligned')) ############### annotationURI = '/Users/joro/Documents/Phd/UPF/arias_dev_01_t_70//laosheng-erhuang_04.TextGrid' detectedTokenList = readListOfListTextFile('/Users/joro/Documents/Phd/UPF/arias_dev_01_t_70/laosheng-erhuang_04_49.8541936425_108.574785469.syllables') startIdx=1; endIdx=13 ################# annotationURI = '/Users/joro/Documents/Phd/UPF/arias_dev_01_t_70/laosheng-erhuang_04.TextGrid' detectedTokenList = readListOfListTextFile('/Users/joro/Documents/Phd/UPF/arias_dev_01_t_70/laosheng-erhuang_04_134.647686205_168.77679257.syllables') startIdx=15; endIdx=26 whichTier=3 durationCorrect, totalLength = _evalAccuracy(annotationURI, detectedTokenList,whichTier , startIdx, endIdx) print durationCorrect print totalLength print durationCorrect/totalLength
def test_oracle_jingju(URIrecordingNoExt, whichSentence, fromPhonemeIdx, toPhonemeIdx): ''' read phoneme-level ground truth and test with dan-xipi_02 ''' ANNOTATION_EXT = '.TextGrid' listSentences = divideIntoSentencesFromAnno(URIrecordingNoExt + ANNOTATION_EXT) #uses TextGrid annotation to derive structure. TODO: instead of annotation, uses score withSynthesis = False currSentence = listSentences[whichSentence] # consider only part of audio fromTs = currSentence[0] toTs = currSentence[1] lyrics = loadLyricsFromTextGridSentence(currSentence) tokenLevelAlignedSuffix = '.syllables_oracle' detectedAlignedfileName = URIrecordingNoExt + '_' + str(fromTs) + '_' + str(toTs) + '_' + tokenLevelAlignedSuffix if os.path.isfile(detectedAlignedfileName): print "{} already exists. No decoding".format(detectedAlignedfileName) from Utilz import readListOfListTextFile detectedTokenList = readListOfListTextFile(detectedAlignedfileName) else: detectedTokenList = decodeWithOracle(lyrics, URIrecordingNoExt, fromTs, toTs, fromPhonemeIdx, toPhonemeIdx) if not os.path.isfile(detectedAlignedfileName): from PraatVisualiser import tokenList2TabFile detectedAlignedfileName = tokenList2TabFile(detectedTokenList, URIrecordingNoExt, tokenLevelAlignedSuffix) # eval on phrase level evalLevel = 2 fromSyllable = currSentence[2] toSyllable = currSentence[3] correctDuration, totalDuration = _evalAccuracy(URIrecordingNoExt + ANNOTATION_EXT, detectedTokenList, evalLevel, fromSyllable, toSyllable ) print "accuracy= {}".format(correctDuration / totalDuration) return detectedTokenList
def getReferenceDurations(URI_recording_noExt, lyricsWithModels, evalLevel):
    '''
    Timestamps of words according to reference durations read from score.
    Used to obtain the so-called 'score-deviation' metric; not used in decoding.

    @param URI_recording_noExt: recording URI without extension
    @param lyricsWithModels: lyrics with attached models and state network
    @param evalLevel: annotation tier to read / evaluate against
    @return: (correctDuration, totalDuration) as produced by _evalAccuracy
    '''
    annotationURI = URI_recording_noExt + ANNOTATION_EXT

    ##### get duration of initial silence
    try:
        annotationTokenListA = TextGrid2WordList(annotationURI, evalLevel)
        # just copy duration of silence in groundTruth
        annoTsAndToken = annotationTokenListA[0]
        if annoTsAndToken[2] != "" and not annoTsAndToken[2].isspace():  # skip empty phrases
            # FIX: logger.warn is a deprecated alias; message typo corrected
            logger.warning("annotation {} starts with non-sil token ".format(annotationURI))
            finalSilFram = float(annoTsAndToken[0]) * NUM_FRAMES_PERSECOND
        else:
            finalSilFram = float(annoTsAndToken[1]) * NUM_FRAMES_PERSECOND
    # FIX: was a bare `except:`; keep the best-effort fallback but stop swallowing
    # SystemExit / KeyboardInterrupt and real programming errors hidden by it.
    except Exception:
        # if no ground-truth annotation file (or the needed layer) is present - take duration from model
        finalSilFram = 0
        countFirstStateFirstWord = lyricsWithModels.listWords[0].syllables[0].phonemes[0].numFirstState
        for i in range(countFirstStateFirstWord):
            finalSilFram += lyricsWithModels.statesNetwork[i].getDurationInFrames()

    refTokenList = expandlyrics2WordList(lyricsWithModels, lyricsWithModels.statesNetwork, finalSilFram, _constructTimeStampsForToken)

    grTruthDurationfileExtension = '.scoreDeviation'
    writeListOfListToTextFile(refTokenList, None, URI_recording_noExt + grTruthDurationfileExtension)

    # TODO: could be done easier with this code, and check last method in Word
    # refTokenList = testT(lyricsWithModels)

    correctDuration, totalDuration = _evalAccuracy(annotationURI, refTokenList, evalLevel)
    return correctDuration, totalDuration
def doitOneChunkAlign(URIrecordingNoExt, musicXMLParser, whichSentence, currSentence, withScores, withVocalPrediction):
    '''
    align one chunk only.
    @param URIrecordingNoExt: recording URI without extension
    @param musicXMLParser: parsed score for whole recording
    @param whichSentence: sentence number to process
    @param currSentence: sentence entry; indices 0/1 are (fromTs, toTs), 2/3 are (fromSyllable, toSyllable)
    @param withScores: if true, take lyrics (with durations) from the score instead of the TextGrid
    @param withVocalPrediction: if true, detect non-vocal fragments first and hand them to the aligner
    @return: (correctDuration, totalDuration) from _evalAccuracy
    NOTE(review): relies on module-level names evalLevel and ANNOTATION_EXT not defined in this function.
    '''
    fromTs = currSentence[0]
    toTs = currSentence[1]

    listNonVocalFragments = []
    if withVocalPrediction:
        listNonVocalFragments = getListNonVocalFragments(URIrecordingNoExt, fromTs, toTs)

    URIRecordingChunkNoExt = URIrecordingNoExt + "_" + str(fromTs) + '_' + str(toTs)
    if (withScores):
        tokenLevelAlignedSuffix = '.syllables_dur'
    else:
        tokenLevelAlignedSuffix = '.syllables'
    detectedAlignedfileName = URIRecordingChunkNoExt + tokenLevelAlignedSuffix

    fromSyllable = currSentence[2]
    toSyllable = currSentence[3]

    # already decoded: reuse the cached alignment file and only re-run the evaluation
    if os.path.isfile(detectedAlignedfileName):
        print "{} already exists. No decoding".format(detectedAlignedfileName)
        detectedTokenList = readListOfListTextFile(detectedAlignedfileName)
        correctDuration, totalDuration = _evalAccuracy(URIrecordingNoExt + ANNOTATION_EXT, detectedTokenList, evalLevel, fromSyllable, toSyllable )
        # correctDuration= 0; totalDuration=1
        return correctDuration, totalDuration

    ###### 1) load Lyrics
    lyrics = loadLyricsFromTextGridSentence(currSentence)
    # if logger.level == logging.DEBUG:
    #     lyrics.printSyllables()

    if withScores:
        # load from score instead
        lyrics = musicXMLParser.getLyricsForSection(whichSentence)  # indexing in python
        withSynthesis = True
        # NOTE(review): withSynthesis is only bound in this branch; with withScores=False the
        # loadSmallAudioFragment call below would raise NameError — confirm intended placement.

    # 2) load features
    lyricsWithModels, obsFeatures, dummyChunkURI = loadSmallAudioFragment(lyrics, URIrecordingNoExt, withSynthesis, fromTs, toTs)
    # lyricsWithModels.printWordsAndStates()

    ##### align
    usePersistentFiles = 'False'  # NOTE(review): string, not bool — downstream presumably parses the text; verify
    alpha = 0.97  # NOTE(review): meaning of alpha not visible here (decoding weight?) — TODO confirm
    from hmm.Parameters import Parameters
    ONLY_MIDDLE_STATE = False
    params = Parameters(alpha, ONLY_MIDDLE_STATE)  # NOTE(review): constructed but never passed to alignOneChunk
    alignmentErrors, detectedTokenList, detectedPath = alignOneChunk(obsFeatures, lyricsWithModels, listNonVocalFragments, alpha, evalLevel, usePersistentFiles, tokenLevelAlignedSuffix, URIRecordingChunkNoExt)

    correctDuration, totalDuration = _evalAccuracy(URIrecordingNoExt + ANNOTATION_EXT, detectedTokenList, evalLevel, fromSyllable, toSyllable )

    acc = correctDuration / totalDuration
    print "result is: " + str(acc)
    return correctDuration, totalDuration
def evalAccuracy(self, eval_level):
    '''
    Evaluate alignment accuracy over all decoded sections of this recording.

    @param eval_level: granularity tier at which the TextGrid is divided
    @return: (totalCorrectDurations, totalDurations) accumulated over all
             section links that carry a detectedTokenList
    '''
    # _evalAccuracy lives in an external evaluation project; extend sys.path on demand.
    pathEvaluation = os.path.join(parentDir, 'AlignmentEvaluation/align_eval')
    if pathEvaluation not in sys.path:
        sys.path.append(pathEvaluation)
    from AccuracyEvaluator import _evalAccuracy

    totalCorrectDurations = 0
    totalDurations = 0

    if self.WITH_SECTION_ANNOTATIONS:
        sectionLinks = self.recording.sectionAnnos
    else:
        sectionLinks = self.recording.sectionLinks  ## might be needed for jingju

    # FIX: single-argument os.path.join was a no-op; plain concatenation is equivalent.
    URI_TextGrid = self.recording.recordingNoExtURI + ANNOTATION_EXT

    ##### add index of begin token and end token
    high_level_tier_name = tierAliases.line
    list_start_end_indices, annotationLinesList = divideIntoSentencesFromAnnoWithSil(
        URI_TextGrid, high_level_tier_name, eval_level)
    if len(list_start_end_indices) != len(sectionLinks):
        sys.exit('TextGrid has {} lines, whereas section Links are {}'.format(
            len(list_start_end_indices), len(sectionLinks)))

    # assign syllable/word/phrase start- and end-indices in TextGrid
    for idx, currSectionLink in enumerate(sectionLinks):
        currSectionLink.set_begin_end_indices(
            list_start_end_indices[idx][0], list_start_end_indices[idx][1])

    for currSectionLink in sectionLinks:
        # use the existence of detected token list as indicator of lyrics-sections
        if not hasattr(currSectionLink, 'detectedTokenList'):
            continue

        ###################### eval phoneme level
        # self.eval_percentage_phonemes(URI_TextGrid, currSectionLink)

        ############################# eval accuracy on annotation level
        # FIX: removed dead `correctDuration = 0; totalDuration = 1` that was
        # immediately overwritten by the call below.
        correctDuration, totalDuration = _evalAccuracy(
            URI_TextGrid, currSectionLink.detectedTokenList,
            ParametersAlgo.EVAL_LEVEL,
            currSectionLink.token_begin_idx, currSectionLink.token_end_idx)
        logger.debug('current section {} accuracy: {}'.format(
            currSectionLink, correctDuration / totalDuration))

        totalCorrectDurations += correctDuration
        totalDurations += totalDuration

    # FIX: guard against ZeroDivisionError when no section had a detectedTokenList.
    if totalDurations > 0:
        accuracy = totalCorrectDurations / totalDurations
        logger.warning("recording {} accuracy: {:.2f}".format(
            self.recording.recordingNoExtURI, accuracy))
    else:
        logger.warning("recording {}: no decoded sections to evaluate".format(
            self.recording.recordingNoExtURI))

    return totalCorrectDurations, totalDurations