Ejemplo n.º 1
0
    def build(self, allCounts, result):
        """Populate ``result`` order by order from ``allCounts``.

        Top section 0 receives ``values / total`` for every entry returned
        by ``self.makeZeroOrder(allCounts)``.  For each order from 1 to
        ``self.highestOrder`` the previous order's stored counts are joined
        with ``self.groupedCounts(allCounts, order)``; every joined history
        gets one entry in back-off section ``order - 1``, and every
        extended history with non-empty effective counts gets one entry in
        top section ``order``.

        Args:
            allCounts: Raw counts; only handed on to ``makeZeroOrder`` and
                ``groupedCounts``.
            result: Receiver object.  Its ``vocabulary`` attribute is set,
                its ``topSection(order)`` / ``boSection(order)`` callables
                are fed ``(history, distribution)`` pairs, and
                ``finalize()`` is called once at the end.

        Returns:
            The same ``result`` object.

        Raises:
            AssertionError: If ``self.vocabulary`` is falsy, if
                ``self.highestOrder`` or ``self.discounts`` is ``None``, or
                if a computed back-off mass is negative.
        """
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None

        def writeNormalized(section_add, storedCounts):
            # One entry per stored history: its values divided by its total.
            for hist, (vals, norm) in storedCounts:
                section_add(hist, vals / norm)

        result.vocabulary = self.vocabulary

        current = self.makeZeroOrder(allCounts)
        writeNormalized(result.topSection(0), current)

        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)

            # The counts stored for the previous order drive the join below.
            previous = current
            grouped = self.groupedCounts(allCounts, order)

            emitBackOff = result.boSection(order - 1)
            current = self.StoredEffectiveCounts()
            historyCount = 0
            predictedCount = 0
            for row in leftJoin(previous, grouped):
                lowerHistory, (lowerCounts, lowerTotal), counts = row

                if counts is None:
                    # No grouped counts for this history at the current
                    # order: emit the plain lower-order ratio.
                    emitBackOff(lowerHistory, lowerCounts / lowerTotal)
                    continue

                # Keep only the groups whose effective counts are truthy.
                kept = []
                for oldest, rawValues in counts:
                    effVals, total = self.effectiveCounts(
                        rawValues, minCount, discount)
                    if effVals:
                        kept.append((oldest, effVals, total))

                marginalCounts = sumCounts([entry[1] for entry in kept])
                marginalTotal = marginalCounts.sum()

                # Subtract what the kept groups already account for from
                # the lower-order counts, then renormalise by ``den``.
                den = lowerTotal - marginalTotal
                adjusted = []
                for predicted, lowerCount in lowerCounts:
                    num = lowerCount - marginalCounts[predicted]
                    if num <= 0.0 or den <= 0.0:
                        # Non-positive numerator or denominator: report the
                        # entry and leave it out of the distribution.
                        self.log(
                            'warning: marginal inversion encountered',
                            repr((lowerHistory, predicted, lowerCount,
                                  marginalCounts[predicted], den)))
                        continue
                    adjusted.append((predicted, num / den))
                lowerDistribution = Counts(adjusted)

                emitBackOff(lowerHistory, lowerDistribution)

                for oldest, group, total in kept:
                    history = lowerHistory + MGram((oldest, ))
                    # Mass not covered by the group's effective counts.
                    backOffMass = total - group.sum()
                    assert backOffMass >= 0

                    interpolated = leftJoinInterpolateAndAddOneSparse(
                        group, backOffMass, lowerDistribution,
                        self.vocabulary.noneIndex, backOffMass)

                    current.add(history, interpolated, total)
                    historyCount += 1
                    predictedCount += interpolated.size

            current.finalize()
            self.log('%d predicted events in %d histories' %
                     (predictedCount, historyCount))

            writeNormalized(result.topSection(order), current)

        result.finalize()
        return result
Ejemplo n.º 2
0
    def build(self, allCounts, result):
        """Build all model sections from ``allCounts`` and store them in ``result``.

        The zero-order counts from ``self.makeZeroOrder`` are normalised
        into top section 0.  Then, for each order from 1 to
        ``self.highestOrder``, the previous order's effective counts are
        joined with the grouped counts of the current order.  Each joined
        history contributes one distribution to back-off section
        ``order - 1``; each extended history with non-empty effective
        counts is stored and later normalised into top section ``order``.

        Args:
            allCounts: Raw counts, passed unchanged to ``makeZeroOrder``
                and ``groupedCounts``.
            result: Target object; must provide a settable ``vocabulary``
                attribute plus ``topSection(order)``, ``boSection(order)``
                and ``finalize()``.

        Returns:
            ``result``, after ``result.finalize()`` has been called.

        Raises:
            AssertionError: If ``self.vocabulary`` is falsy, if
                ``self.highestOrder`` or ``self.discounts`` is ``None``, or
                if a group's back-off mass comes out negative.
        """
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None

        result.vocabulary = self.vocabulary

        allEffectiveCounts = self.makeZeroOrder(allCounts)

        # Order 0: every stored entry becomes values / total.
        result_add = result.topSection(0)
        for history, (values, total) in allEffectiveCounts:
            probabilities = values / total
            result_add(history, probabilities)

        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)

            # Keep the previous order's counts for the join while a fresh
            # store is filled for the current order.
            allLowerOrderEffectiveCounts = allEffectiveCounts
            groupedCounts = self.groupedCounts(allCounts, order)

            result_add = result.boSection(order - 1)
            allEffectiveCounts = self.StoredEffectiveCounts()
            nHistories = nPredicted = 0
            for (lowerOrderHistory,
                 (lowerOrderEffectiveCounts, lowerOrderTotal),
                 counts) in leftJoin(allLowerOrderEffectiveCounts,
                                     groupedCounts):
                if counts is None:
                    # Nothing joined for this history (presumably it has
                    # no events at the current order - confirm against
                    # leftJoin): use the plain lower-order ratio.
                    lowerOrderDistribution = (lowerOrderEffectiveCounts /
                                              lowerOrderTotal)
                    result_add(lowerOrderHistory, lowerOrderDistribution)
                    continue

                # Drop groups whose effective counts are falsy.
                effectiveCounts = []
                for oldest, values in counts:
                    effVals, total = self.effectiveCounts(
                        values, minCount, discount)
                    if effVals:
                        effectiveCounts.append((oldest, effVals, total))

                effectiveMarginalCounts = sumCounts(
                    [values for oldest, values, total in effectiveCounts])
                effectiveMarginalTotal = effectiveMarginalCounts.sum()

                # Remove the summed effective counts from the lower-order
                # counts and renormalise by the remaining total.
                lowerOrderDistribution = []
                den = lowerOrderTotal - effectiveMarginalTotal
                for predicted, lowerOrderEffectiveCount in lowerOrderEffectiveCounts:
                    num = (lowerOrderEffectiveCount -
                           effectiveMarginalCounts[predicted])
                    if num <= 0.0 or den <= 0.0:
                        # Non-positive numerator or denominator: log it
                        # and leave the entry out of the distribution.
                        self.log(
                            'warning: marginal inversion encountered',
                            repr((lowerOrderHistory, predicted,
                                  lowerOrderEffectiveCount,
                                  effectiveMarginalCounts[predicted],
                                  den)))
                    else:
                        lowerOrderDistribution.append((predicted, num / den))
                lowerOrderDistribution = Counts(lowerOrderDistribution)

                result_add(lowerOrderHistory, lowerOrderDistribution)

                for oldest, effectiveCountsGroup, total in effectiveCounts:
                    history = lowerOrderHistory + MGram((oldest,))
                    effectiveTotal = effectiveCountsGroup.sum()
                    # Mass not covered by the group's effective counts.
                    backOffMass = total - effectiveTotal
                    assert backOffMass >= 0

                    interpolatedCounts = leftJoinInterpolateAndAddOneSparse(
                        effectiveCountsGroup,
                        backOffMass,
                        lowerOrderDistribution,
                        self.vocabulary.noneIndex,
                        backOffMass)

                    allEffectiveCounts.add(history, interpolatedCounts, total)
                    nHistories += 1
                    nPredicted += interpolatedCounts.size

            allEffectiveCounts.finalize()
            self.log('%d predicted events in %d histories'
                     % (nPredicted, nHistories))

            # Normalise the counts just stored into this order's top section.
            result_add = result.topSection(order)
            for history, (values, total) in allEffectiveCounts:
                probabilities = values / total
                result_add(history, probabilities)

        result.finalize()
        return result