def _mainLoop(self, wordController, batchSizeSec):
    """Greedily select words by gain/cost ratio until the cost budget is spent.

    The candidates live in a priority queue keyed by their ratio. Each
    iteration pops the top candidate, recomputes its gain/ratio/cost against
    the current coverage and accepts it if the fresh ratio is still at least
    the queue maximum, or if the word was already re-evaluated since the last
    acceptance. Otherwise it is pushed back with the fresh ratio.

    Accepting a word assigns it to a segment of its adapter, marks it, stores
    ``cost``/``utility``/``ratio``/``_score`` on it, adds its ``_tfIdfDict``
    to the coverage and refreshes the queue priority of its unmarked,
    non-ignored direct neighbours.

    Args:
        wordController: passed to ``_initGains`` and used for the progress
            statistics (number of words / marked words).
        batchSizeSec: budget; the loop stops once the accumulated cost
            returned by ``_computeRatioGain`` reaches it. Must be > 0.

    Returns:
        Tuple ``(candidateList, coveredPhonemes, valueCoveredVectors)`` where
        ``candidateList`` holds ``(adapter, cnWordIdx)`` pairs in selection
        order.
    """
    assert batchSizeSec > 0.0

    # NOTE(review): if self._valueCoveredVectors is a mutable object, the
    # `+=` in the loop updates it in place (unlike coveredPhonemes, which is
    # copied) -- confirm this is intended.
    valueCoveredVectors = self._valueCoveredVectors

    # Work on a copy so the instance state is untouched until the caller
    # decides to store the result.
    if self._coveredPhonemes is not None:
        coveredPhonemes = self._coveredPhonemes.copy()
    else:
        coveredPhonemes = Counter()

    # Prepare datastructures
    log("Computing submodular gains")
    candidateList = []
    gainDict = self._initGains(wordController, coveredPhonemes, valueCoveredVectors)
    gainQueue = FastBucketQueue()
    # Words re-evaluated since the last acceptance.
    checkedCnWordSet = set()
    for (adapter, cnWordIdx), ratio in gainDict.iteritems():
        cnWord = adapter[cnWordIdx]
        if not cnWord._marked and not cnWord._transcribed and not cnWord.ignore:
            gainQueue.push((adapter, cnWordIdx), ratio)

    i = 0
    accumTime = 0.0                 # accumulated model costs (budget unit)
    accumTimeFrameDurations = 0.0   # accumulated raw word durations in seconds

    # Use float arithmetic: under Python 2 an int/int division truncates to 0.
    numWords = wordController.getNumWords()
    numMarked = wordController.getNumMarkedWords()
    log('Number of marked words: {0}, -> {1} percent marked'.format(
        numMarked, 100.0 * numMarked / numWords if numWords else 0.0))

    reportAfterSamples = 10000
    while len(gainQueue) > 0 and accumTime < batchSizeSec:
        # Get top element from the queue and re-evaluate it against the
        # current coverage; the queued ratio may be stale.
        (adapter, cnWordIdx), candidateRatio = gainQueue.pop()
        candidateCnWord = adapter[cnWordIdx]
        candidateGain, candidateRatio, costs = self._computeRatioGain(
            adapter, cnWordIdx, coveredPhonemes, valueCoveredVectors)

        if candidateCnWord in checkedCnWordSet or candidateRatio >= gainQueue.max():
            # --> Selected
            candidateList.append((adapter, cnWordIdx))
            wordPos = cnWordIdx
            adapter.assignWordToSegment(wordPos)
            coveredPhonemes += candidateCnWord._tfIdfDict
            valueCoveredVectors += candidateGain
            # Coverage changed, so every earlier re-evaluation is outdated.
            checkedCnWordSet.clear()
            candidateCnWord._marked = True

            accumTimeFrameDurations += candidateCnWord.durationSec
            # Cost-sensitive way to compute accumTime (inaccurate because
            # costs can be lowered through forming segments)
            accumTime += costs

            candidateCnWord.cost = costs
            candidateCnWord.utility = candidateGain
            candidateCnWord.ratio = candidateRatio
            candidateCnWord._score = i

            # Result is currently unused (only the disabled segment-based
            # conditions consumed it). NOTE(review): drop the call if the
            # getter is confirmed to be side-effect free.
            leftSeg, rightSeg = adapter.getNeighborSegments(wordPos)

            # Update gain of the direct neighbours, left first, then right.
            # NOTE(review): unlike the initial queue fill, `_transcribed` is
            # not checked here -- confirm gainQueue.update copes with keys
            # that were never pushed.
            numAdapterWords = adapter.getNumWords()
            for neighPos in (wordPos - 1, wordPos + 1):
                if neighPos < 0 or neighPos >= numAdapterWords:
                    continue
                neighWord = adapter[neighPos]
                if neighWord._marked or neighWord.ignore:
                    continue
                try:
                    neighGain, neighRatio, neighCosts = self._computeRatioGain(
                        adapter, neighPos, coveredPhonemes, valueCoveredVectors)
                    gainQueue.update((adapter, neighPos), neighRatio)
                except Exception:
                    # Dump the adapter state for diagnosis, then propagate.
                    warn(("Adapter {0} wordpos {1}".format(adapter.cnId, wordPos)))
                    warn(adapter.visualizeWordsChain())
                    warn(adapter.visualizeSegments())
                    raise
                neighWord.cost = neighCosts
                neighWord.utility = neighGain
                neighWord.ratio = neighRatio

            i += 1
            if i % reportAfterSamples == 0:
                log('Number of marked words: {0}, -> {1} percent marked'.format(
                    i, 100.0 * i / numWords if numWords else 0.0))
        else:
            # --> Refused: requeue with the fresh ratio and remember that it
            # has been re-evaluated.
            gainQueue.push((adapter, cnWordIdx), candidateRatio)
            checkedCnWordSet.add(candidateCnWord)

        if len(gainQueue) <= 10:
            debug("Priority queue has only {0} elements left".format(len(gainQueue)))

    # accumTime is compared against batchSizeSec above, so it is converted
    # to hours like the raw durations.
    log("Sampled {0} hours of words "
        "(annotation cost model roughly predicts {1} hours of annotation)".format(
            accumTimeFrameDurations / 3600.0, accumTime / 3600.0))
    return candidateList, coveredPhonemes, valueCoveredVectors
def getScores(self, wordController=None, batchSizeSec=None):
    """Run the selection loop, store scores and compute annotation costs.

    Args:
        wordController: optional; when given it replaces
            ``self.wordController`` before the selection runs.
        batchSizeSec: budget handed to ``_mainLoop``; ``None`` means
            unlimited (``float("inf")``).

    Returns:
        The list of selected ``(adapter, cnWordIdx)`` pairs in selection
        order, as returned by ``_mainLoop``.

    Side effects:
        Sets ``_score`` on every selected word, stores the updated
        ``_coveredPhonemes`` / ``_valueCoveredVectors`` and fills
        ``annotationCosts``, ``annotationCostsWithoutSupervised``,
        ``noisyAnnotationCosts`` and ``noisyAnnotationCostsWithoutSupervised``.
        Clears ``_estimatedGPCosts`` and ``_noisyGPCosts`` afterwards.
    """
    log('Approach submodular sentence/utterance coverage (tf-idf vector based): calculating scores')

    if batchSizeSec is None:
        batchSizeSec = float("inf")
    # Explicit None check: an object that happens to be falsy must still
    # replace the stored controller.
    if wordController is not None:
        self.wordController = wordController

    log('Number of words: {0}'.format(self.wordController.getNumWords()))

    # outsourced to make it accessible for unit testing
    candidateList, coveredPhonemes, valueCoveredVectors = self._mainLoop(
        self.wordController, batchSizeSec)

    # Save scores: the score of a word is its rank in the selection order.
    for rank, (adapter, cnWordIdx) in enumerate(candidateList):
        adapter[cnWordIdx]._score = float(rank)

    # Set instance variables after everything else has been done
    # to ensure consistency that if an error occurs and the iteration is
    # restarted no left-overs of the last iterations are contained in
    # coveredPhonemes or valueCoveredVectors.
    self._coveredPhonemes = coveredPhonemes
    self._valueCoveredVectors = valueCoveredVectors

    adapters = self.wordController.confusionNetworkAdapters

    # Get estimated annotation time information
    self.annotationCosts = 0.0
    self.annotationCostsWithoutSupervised = 0.0
    for _key, adapter in adapters.iteritems():
        for segment in adapter.getSegmentIterator():
            self.annotationCostsWithoutSupervised += self._scoreFcnFast(
                len(segment), segment.durationSec)
    for _key, adapter in adapters.iteritems():
        for subSeg in adapter.getMarkedAndSupervisedSegments():
            subSegDuration = sum(x.durationSec for x in subSeg)
            self.annotationCosts += self._scoreFcnFast(len(subSeg), subSegDuration)

    # Clear Cost-Model cache
    self._estimatedGPCosts.clear()

    # Get noisy annotation time information. The evaluation order
    # (marked/supervised segments first, then all segments) is kept as is.
    self.noisyAnnotationCosts = 0.0
    self.noisyAnnotationCostsWithoutSupervised = 0.0
    for _key, adapter in adapters.iteritems():
        for subSeg in adapter.getMarkedAndSupervisedSegments():
            subSegDuration = sum(x.durationSec for x in subSeg)
            self.noisyAnnotationCosts += self._scoreFcnFastNoisy(
                len(subSeg), subSegDuration)
    for _key, adapter in adapters.iteritems():
        for segment in adapter.getSegmentIterator():
            self.noisyAnnotationCostsWithoutSupervised += self._scoreFcnFastNoisy(
                len(segment), segment.durationSec)

    # Clear Cost-Model cache
    self._noisyGPCosts.clear()

    return candidateList
def _mainLoop(self, wordController, batchSizeSec):
    """Greedily select words by gain until the duration budget is spent.

    The candidates live in a priority queue keyed by their gain. Each
    iteration pops the top candidate, recomputes its gain against the
    current coverage and accepts it if the fresh gain is still at least the
    queue maximum, or if the word was already re-evaluated since the last
    acceptance. Otherwise it is pushed back with the fresh gain.

    Accepting a word assigns it to a segment of its adapter, marks it, stores
    ``utility`` on it and adds its ``_tfIdfDict`` to the coverage. This
    variant does not refresh the priorities of neighbouring words.

    Args:
        wordController: passed to ``_initGains`` and used for the progress
            statistics (number of words / marked words).
        batchSizeSec: budget in seconds; the loop stops once the summed
            ``durationSec`` of the selected words reaches it. Must be > 0.

    Returns:
        Tuple ``(candidateList, coveredPhonemes, valueCoveredVectors)`` where
        ``candidateList`` holds ``(adapter, cnWordIdx)`` pairs in selection
        order.
    """
    assert batchSizeSec > 0.0

    # NOTE(review): if self._valueCoveredVectors is a mutable object, the
    # `+=` in the loop updates it in place (unlike coveredPhonemes, which is
    # copied) -- confirm this is intended.
    valueCoveredVectors = self._valueCoveredVectors

    # Work on a copy so the instance state is untouched until the caller
    # decides to store the result.
    if self._coveredPhonemes is not None:
        coveredPhonemes = self._coveredPhonemes.copy()
    else:
        coveredPhonemes = Counter()

    # Prepare datastructures
    log("Computing submodular gains")
    candidateList = []
    gainDict = self._initGains(wordController, coveredPhonemes, valueCoveredVectors)
    gainQueue = FastBucketQueue()
    # Words re-evaluated since the last acceptance.
    checkedCnWordSet = set()
    for (adapter, cnWordIdx), gain in gainDict.iteritems():
        cnWord = adapter[cnWordIdx]
        if not cnWord._marked and not cnWord._transcribed and not cnWord.ignore:
            gainQueue.push((adapter, cnWordIdx), gain)

    i = 0
    accumTime = 0.0

    # Use float arithmetic: under Python 2 an int/int division truncates to
    # 0, and an empty controller must not raise ZeroDivisionError.
    numWords = wordController.getNumWords()
    numMarked = wordController.getNumMarkedWords()
    log('Number of marked words: {0}, -> {1} percent marked'.format(
        numMarked, 100.0 * numMarked / numWords if numWords else 0.0))

    reportAfterSamples = 10000
    while len(gainQueue) > 0 and accumTime < batchSizeSec:
        # Get top element from the queue and re-evaluate it against the
        # current coverage; the queued gain may be stale.
        (adapter, cnWordIdx), candidateGain = gainQueue.pop()
        candidateCnWord = adapter[cnWordIdx]
        candidateGain = self._computeGain(
            adapter, cnWordIdx, coveredPhonemes, valueCoveredVectors)

        if candidateCnWord in checkedCnWordSet or candidateGain >= gainQueue.max():
            # --> Selected
            candidateList.append((adapter, cnWordIdx))
            adapter.assignWordToSegment(cnWordIdx)
            coveredPhonemes += candidateCnWord._tfIdfDict
            valueCoveredVectors += candidateGain
            # Coverage changed, so every earlier re-evaluation is outdated.
            checkedCnWordSet.clear()
            candidateCnWord._marked = True
            accumTime += candidateCnWord.durationSec
            candidateCnWord.utility = candidateGain

            i += 1
            if i % reportAfterSamples == 0:
                log('Number of marked words: {0}, -> {1} percent marked'.format(
                    i, 100.0 * i / numWords if numWords else 0.0))
        else:
            # --> Refused: requeue with the fresh gain and remember that it
            # has been re-evaluated.
            gainQueue.push((adapter, cnWordIdx), candidateGain)
            checkedCnWordSet.add(candidateCnWord)

        if len(gainQueue) <= 10:
            debug("Priority queue has only {0} elements left".format(len(gainQueue)))

    log("Sampled {0} hours of words".format(accumTime / 3600.0))
    return candidateList, coveredPhonemes, valueCoveredVectors