def _mainLoop(self, wordController, batchSizeSec):
        """Greedily pick words by gain/cost ratio until the cost budget is spent.

        Every word from ``self._initGains`` that is neither marked, transcribed
        nor ignored is put into a priority queue keyed by its ratio.  In each
        round the top entry is popped and its gain, ratio and cost are
        recomputed against the current coverage state.  The word is selected
        if it is the last candidate, if it was already re-evaluated since the
        previous selection, or if its fresh ratio is still at least the queue
        maximum; otherwise it is pushed back with the fresh ratio.

        Selecting a word has side effects on project objects: the word is
        assigned to a segment on its adapter, flagged ``_marked``, and gets
        ``cost``/``utility``/``ratio``/``_score`` set; its direct left and
        right neighbours get their queue priority and
        ``cost``/``utility``/``ratio`` refreshed.

        :param wordController: controller passed to ``self._initGains`` and
            used for the progress statistics.
        :param batchSizeSec: budget; the loop stops once the accumulated
            estimated cost reaches it.  Must be > 0.
        :returns: tuple ``(candidateList, coveredPhonemes,
            valueCoveredVectors)`` where ``candidateList`` holds
            ``(adapter, wordIndex)`` pairs in selection order.
        """
        assert(batchSizeSec > 0.0)

        # NOTE(review): unlike coveredPhonemes this is not copied; if it is a
        # mutable type (e.g. an array) the ``+=`` below updates
        # self._valueCoveredVectors in place -- confirm this is intended.
        valueCoveredVectors = self._valueCoveredVectors

        # Work on a copy so self._coveredPhonemes is left untouched by this run.
        if self._coveredPhonemes is not None:
            coveredPhonemes = self._coveredPhonemes.copy()
        else:
            coveredPhonemes = Counter()

        log("Computing submodular gains")

        candidateList = []
        gainDict = self._initGains(wordController, coveredPhonemes, valueCoveredVectors)
        gainQueue = FastBucketQueue()
        checkedCnWordSet = set()
        for (adapter, cnWordIdx), ratio in gainDict.iteritems():
            cnWord = adapter[cnWordIdx]
            if not cnWord._marked and not cnWord._transcribed and not cnWord.ignore:
                gainQueue.push((adapter, cnWordIdx), ratio)

        # Statistics come from the controller the gains were initialised from.
        # Explicit float division (plain ``/`` truncates ints on Python 2) and
        # a zero guard replace the former debugger trap around this log call.
        # The value is a 0..1 fraction despite the wording, matching the
        # periodic progress message further down.
        numWords = wordController.getNumWords()
        numMarked = wordController.getNumMarkedWords()
        markedFraction = float(numMarked) / float(numWords) if numWords else 0.0
        log('Number of marked words: {0}, -> {1} percent marked'.format(numMarked, markedFraction))

        def refreshNeighbor(owner, neighPos, selectedPos, covered, coveredValue):
            # Recompute a neighbour of the freshly selected word and re-queue it.
            neighWord = owner[neighPos]
            if neighWord._marked or neighWord.ignore:
                return
            try:
                neighGain, neighRatio, neighCosts = self._computeRatioGain(owner, neighPos, covered, coveredValue)
                gainQueue.update((owner, neighPos), neighRatio)
                neighWord.cost = neighCosts
                neighWord.utility = neighGain
                neighWord.ratio = neighRatio
            except Exception:
                # Dump the adapter state for diagnosis, then let the error propagate.
                warn(("Adapter {0} wordpos {1}".format(owner.cnId, selectedPos)))
                warn(owner.visualizeWordsChain())
                warn(owner.visualizeSegments())
                raise

        i = 0
        accumTime = 0.0                  # estimated annotation cost, compared to batchSizeSec
        accumTimeFrameDurations = 0.0    # plain sum of selected word durations (seconds)
        reportAfterSamples = 10000
        while len(gainQueue) > 0 and accumTime < batchSizeSec:

            # Take the currently best-ranked word and refresh its figures.
            (adapter, cnWordIdx), candidateRatio = gainQueue.pop()
            candidateCnWord = adapter[cnWordIdx]
            candidateGain, candidateRatio, costs = self._computeRatioGain(adapter, cnWordIdx, coveredPhonemes, valueCoveredVectors)

            # The last remaining candidate is trivially the best one; checking
            # this first also avoids asking an empty queue for its maximum.
            isLastCandidate = len(gainQueue) == 0
            if isLastCandidate or candidateCnWord in checkedCnWordSet or candidateRatio >= gainQueue.max():
                candidateList.append((adapter, cnWordIdx))
                candidateAdapter = adapter
                wordPos = cnWordIdx
                candidateAdapter.assignWordToSegment(wordPos)
                coveredPhonemes += candidateCnWord._tfIdfDict
                valueCoveredVectors += candidateGain
                # Coverage changed, so earlier re-evaluations are stale.
                checkedCnWordSet.clear()
                candidateCnWord._marked = True

                accumTimeFrameDurations += candidateCnWord.durationSec

                # Cost-sensitive budget (inaccurate because costs can be
                # lowered later through forming segments).
                accumTime += costs
                candidateCnWord.cost = costs
                candidateCnWord.utility = candidateGain
                candidateCnWord.ratio = candidateRatio
                candidateCnWord._score = i

                # Result is unused; the call is kept because side effects
                # cannot be ruled out from here -- TODO confirm and remove.
                leftSeg, rightSeg = candidateAdapter.getNeighborSegments(wordPos)

                # Refresh the direct neighbours of the selected word.
                if wordPos > 0:
                    refreshNeighbor(candidateAdapter, wordPos - 1, wordPos, coveredPhonemes, valueCoveredVectors)
                if wordPos < candidateAdapter.getNumWords() - 1:
                    refreshNeighbor(candidateAdapter, wordPos + 1, wordPos, coveredPhonemes, valueCoveredVectors)

                i += 1

                if i % reportAfterSamples == 0:
                    log('Number of marked words: {0}, -> {1} percent marked'.format(i, float(i) / float(wordController.getNumWords())))

            else:
                # Refused: re-queue with the fresh ratio and remember that it
                # has been re-evaluated since the last selection.
                gainQueue.push((adapter, cnWordIdx), candidateRatio)
                checkedCnWordSet.add(candidateCnWord)

            if len(gainQueue) <= 10:
                debug("Priority queue has only {0} elements left".format(len(gainQueue)))

        # Both totals are accumulated in seconds; report each one in hours.
        summary = ("Sampled {0} hours of words "
                   "(annotation cost model roughly predicts {1} hours of annotation)")
        log(summary.format(accumTimeFrameDurations / 3600.0, accumTime / 3600.0))

        return candidateList, coveredPhonemes, valueCoveredVectors
    def getScores(self, wordController=None, batchSizeSec=None):
        """Run the selection loop, rank the selected words and total the costs.

        The selection itself is delegated to ``self._mainLoop``.  Afterwards
        each selected word gets its position in the selection order stored as
        a float in ``_score``, the coverage state returned by the loop is
        committed to ``self._coveredPhonemes`` / ``self._valueCoveredVectors``,
        and four cost totals are recomputed from the adapters' segments:
        ``annotationCosts``, ``annotationCostsWithoutSupervised`` (both via
        ``self._scoreFcnFast``) and ``noisyAnnotationCosts``,
        ``noisyAnnotationCostsWithoutSupervised`` (both via
        ``self._scoreFcnFastNoisy``).  The matching cost caches are cleared
        after each pair.

        :param wordController: if given, replaces ``self.wordController``
            before anything else is done.
        :param batchSizeSec: budget handed to ``_mainLoop``; ``None`` means
            unlimited (``float("inf")``).
        :returns: list of ``(adapter, wordIndex)`` pairs in selection order.
        """
        log('Approach submodular sentence/utterance coverage (tf-idf vector based): calculating scores')

        if batchSizeSec is None:
            batchSizeSec = float("inf")

        # Identity test: a controller object that happens to evaluate falsy
        # must still be picked up.
        if wordController is not None:
            self.wordController = wordController

        log('Number of words: {0}'.format(self.wordController.getNumWords()))

        # Outsourced to make it accessible for unit testing.
        candidateList, coveredPhonemes, valueCoveredVectors = self._mainLoop(self.wordController, batchSizeSec)

        # Save scores: the rank in the selection order.
        for rank, (adapter, cnWordIdx) in enumerate(candidateList):
            adapter[cnWordIdx]._score = float(rank)

        # Set instance variables only after the loop has finished, so that if
        # an error occurs and the iteration is restarted no left-overs of the
        # failed run are contained in coveredPhonemes or valueCoveredVectors.
        self._coveredPhonemes = coveredPhonemes
        self._valueCoveredVectors = valueCoveredVectors

        def allSegmentCosts(scoreFcn):
            # Total of scoreFcn over every segment of every adapter.
            total = 0.0
            for _key, adapter in self.wordController.confusionNetworkAdapters.iteritems():
                for segment in adapter.getSegmentIterator():
                    total += scoreFcn(len(segment), segment.durationSec)
            return total

        def markedSegmentCosts(scoreFcn):
            # Total of scoreFcn over the marked-and-supervised sub-segments;
            # their duration is the sum of the contained words' durations.
            total = 0.0
            for _key, adapter in self.wordController.confusionNetworkAdapters.iteritems():
                for subSeg in adapter.getMarkedAndSupervisedSegments():
                    subSegDuration = sum(word.durationSec for word in subSeg)
                    total += scoreFcn(len(subSeg), subSegDuration)
            return total

        # Estimated annotation time information.  The attributes are reset
        # first so that they never keep a value from an earlier call while
        # the new totals are being computed.
        self.annotationCosts = 0.0
        self.annotationCostsWithoutSupervised = 0.0
        self.annotationCostsWithoutSupervised = allSegmentCosts(self._scoreFcnFast)
        self.annotationCosts = markedSegmentCosts(self._scoreFcnFast)

        # Clear Cost-Model cache
        self._estimatedGPCosts.clear()

        # Noisy annotation time information (same evaluation order as before:
        # marked sub-segments first, then all segments).
        self.noisyAnnotationCosts = 0.0
        self.noisyAnnotationCostsWithoutSupervised = 0.0
        self.noisyAnnotationCosts = markedSegmentCosts(self._scoreFcnFastNoisy)
        self.noisyAnnotationCostsWithoutSupervised = allSegmentCosts(self._scoreFcnFastNoisy)

        # Clear Cost-Model cache
        self._noisyGPCosts.clear()

        return candidateList
    def _mainLoop(self, wordController, batchSizeSec):
        """Greedily pick words by gain until the duration budget is spent.

        Every word from ``self._initGains`` that is neither marked, transcribed
        nor ignored is put into a priority queue keyed by its gain.  In each
        round the top entry is popped and its gain is recomputed against the
        current coverage state via ``self._computeGain``.  The word is
        selected if it is the last candidate, if it was already re-evaluated
        since the previous selection, or if its fresh gain is still at least
        the queue maximum; otherwise it is pushed back with the fresh gain.

        Selecting a word assigns it to a segment on its adapter, flags it
        ``_marked`` and stores the gain in ``utility``.  The budget is
        consumed by the word's ``durationSec``.

        :param wordController: controller passed to ``self._initGains`` and
            used for the progress statistics.
        :param batchSizeSec: budget in seconds of selected word duration.
            Must be > 0.
        :returns: tuple ``(candidateList, coveredPhonemes,
            valueCoveredVectors)`` where ``candidateList`` holds
            ``(adapter, wordIndex)`` pairs in selection order.
        """
        assert(batchSizeSec > 0.0)

        # NOTE(review): unlike coveredPhonemes this is not copied; if it is a
        # mutable type (e.g. an array) the ``+=`` below updates
        # self._valueCoveredVectors in place -- confirm this is intended.
        valueCoveredVectors = self._valueCoveredVectors

        # Work on a copy so self._coveredPhonemes is left untouched by this run.
        if self._coveredPhonemes is not None:
            coveredPhonemes = self._coveredPhonemes.copy()
        else:
            coveredPhonemes = Counter()

        log("Computing submodular gains")
        candidateList = []
        gainDict = self._initGains(wordController, coveredPhonemes, valueCoveredVectors)
        gainQueue = FastBucketQueue()
        checkedCnWordSet = set()
        for (adapter, cnWordIdx), gain in gainDict.iteritems():
            cnWord = adapter[cnWordIdx]
            if not cnWord._marked and not cnWord._transcribed and not cnWord.ignore:
                gainQueue.push((adapter, cnWordIdx), gain)

        # Statistics come from the controller the gains were initialised from.
        # Explicit float division (plain ``/`` truncates ints on Python 2)
        # and a zero guard keep this log line from misreporting or raising.
        # The value is a 0..1 fraction despite the wording, matching the
        # periodic progress message further down.
        numWords = wordController.getNumWords()
        numMarked = wordController.getNumMarkedWords()
        markedFraction = float(numMarked) / float(numWords) if numWords else 0.0
        log('Number of marked words: {0}, -> {1} percent marked'.format(numMarked, markedFraction))

        i = 0
        accumTime = 0.0    # summed duration (seconds) of the selected words
        reportAfterSamples = 10000
        while len(gainQueue) > 0 and accumTime < batchSizeSec:

            # Take the currently best-ranked word; the queued priority may be
            # stale, so the gain is always recomputed.
            (adapter, cnWordIdx), _queuedGain = gainQueue.pop()
            candidateCnWord = adapter[cnWordIdx]
            candidateGain = self._computeGain(adapter, cnWordIdx, coveredPhonemes, valueCoveredVectors)

            # The last remaining candidate is trivially the best one; checking
            # this first also avoids asking an empty queue for its maximum.
            isLastCandidate = len(gainQueue) == 0
            if isLastCandidate or candidateCnWord in checkedCnWordSet or candidateGain >= gainQueue.max():
                candidateList.append((adapter, cnWordIdx))
                adapter.assignWordToSegment(cnWordIdx)
                coveredPhonemes += candidateCnWord._tfIdfDict
                valueCoveredVectors += candidateGain
                # Coverage changed, so earlier re-evaluations are stale.
                checkedCnWordSet.clear()
                candidateCnWord._marked = True
                accumTime += candidateCnWord.durationSec
                candidateCnWord.utility = candidateGain
                i += 1

                if i % reportAfterSamples == 0:
                    log('Number of marked words: {0}, -> {1} percent marked'.format(i, float(i) / float(wordController.getNumWords())))

            else:
                # Refused: re-queue with the fresh gain and remember that it
                # has been re-evaluated since the last selection.
                gainQueue.push((adapter, cnWordIdx), candidateGain)
                checkedCnWordSet.add(candidateCnWord)

            if len(gainQueue) <= 10:
                debug("Priority queue has only {0} elements left".format(len(gainQueue)))

        log("Sampled {0} hours of words".format(accumTime/3600.0))

        return candidateList, coveredPhonemes, valueCoveredVectors