Example #1
0
 def effectiveCounts(self, counts, minCount, discount):
     total = counts.sum()
     effectiveCounts = Counts([
         (predicted, discount(value))
         for predicted, value in counts.threshold(minCount)
     ])
     return effectiveCounts, total
Example #2
0
    def makeZeroOrder(self, allCounts):
        minCount, discount = self.parametrizeOrder(0)

        counts = sumLotsOfCounts(map(lambda item: item[1], allCounts))
        effectiveCounts, total = self.effectiveCounts(counts, minCount,
                                                      discount)
        effectiveTotal = effectiveCounts.sum()

        seenWords = set([w for w, n in effectiveCounts])
        assert self.sentenceStart not in seenWords
        unseenWords = set(self.predictedWords) - seenWords
        assert self.sentenceStart not in unseenWords
        self.log('number of unseen words', len(unseenWords))

        pZero = 1 / len(self.predictedWords)
        backOffMass = total - effectiveTotal
        nZero = backOffMass * pZero
        interpolatedCounts = []
        for predicted, effectiveCount in effectiveCounts:
            interpolatedCounts.append((predicted, effectiveCount + nZero))
        for predicted in unseenWords:
            interpolatedCounts.append((predicted, nZero))
        interpolatedCounts = Counts(interpolatedCounts)

        self.log('%d predicted events' % (interpolatedCounts.size))
        return [(MGram(()), (interpolatedCounts, total))]
Example #3
0
    def build(self, allCounts, result):
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None

        result.vocabulary = self.vocabulary

        allEffectiveCounts = self.makeZeroOrder(allCounts)

        result_add = result.topSection(0)
        for history, (values, total) in allEffectiveCounts:
            probabilities = values / total
            result_add(history, probabilities)

        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)

            allLowerOrderEffectiveCounts = allEffectiveCounts
            groupedCounts = self.groupedCounts(allCounts, order)

            result_add = result.boSection(order - 1)
            allEffectiveCounts = self.StoredEffectiveCounts()
            nHistories = nPredicted = 0
            for (lowerOrderHistory, (lowerOrderEffectiveCounts, lowerOrderTotal), counts) \
             in leftJoin(allLowerOrderEffectiveCounts, groupedCounts):
                if counts is None:
                    lowerOrderDistribution = lowerOrderEffectiveCounts / \
                        lowerOrderTotal
                    result_add(lowerOrderHistory, lowerOrderDistribution)
                    continue

                effectiveCounts = []
                for oldest, values in counts:
                    effVals, total = self.effectiveCounts(
                        values, minCount, discount)
                    if effVals:
                        effectiveCounts.append((oldest, effVals, total))

                effectiveMarginalCounts = sumCounts(
                    [values for oldest, values, total in effectiveCounts])
                effectiveMarginalTotal = effectiveMarginalCounts.sum()

                lowerOrderDistribution = []
                den = lowerOrderTotal - effectiveMarginalTotal
                for predicted, lowerOrderEffectiveCount in lowerOrderEffectiveCounts:
                    num = lowerOrderEffectiveCount - effectiveMarginalCounts[
                        predicted]
                    if num <= 0.0 or den <= 0.0:
                        self.log(
                            'warning: marginal inversion encountered',
                            repr((lowerOrderHistory, predicted,
                                  lowerOrderEffectiveCount,
                                  effectiveMarginalCounts[predicted], den)))
                    else:
                        lowerOrderDistribution.append((predicted, num / den))
                lowerOrderDistribution = Counts(lowerOrderDistribution)

                result_add(lowerOrderHistory, lowerOrderDistribution)

                for oldest, effectiveCountsGroup, total in effectiveCounts:
                    history = lowerOrderHistory + MGram((oldest, ))
                    effectiveTotal = effectiveCountsGroup.sum()
                    backOffMass = total - effectiveTotal
                    assert backOffMass >= 0

                    interpolatedCounts = leftJoinInterpolateAndAddOneSparse(
                        effectiveCountsGroup, backOffMass,
                        lowerOrderDistribution, self.vocabulary.noneIndex,
                        backOffMass)

                    allEffectiveCounts.add(history, interpolatedCounts, total)
                    nHistories += 1
                    nPredicted += interpolatedCounts.size

            allEffectiveCounts.finalize()
            self.log('%d predicted events in %d histories' %
                     (nPredicted, nHistories))

            result_add = result.topSection(order)
            for history, (values, total) in allEffectiveCounts:
                probabilities = values / total
                result_add(history, probabilities)

        result.finalize()
        return result
Example #4
0
 def leftJoinInterpolateAndAddOneSparse(left, scale, right, extraKey,
                                        extraValue):
     result = [(extraKey, extraValue)]
     for k, v in left:
         result.append((k, v + scale * right[k]))
     return Counts(result)