def build(self, allCounts, result):
    """Fill *result* with distributions for orders 0 through ``self.highestOrder``.

    The zero-order counts come from ``self.makeZeroOrder(allCounts)`` and are
    normalised into ``result.topSection(0)``.  For every higher order the
    grouped counts of that order are left-joined against the stored counts
    of the order below.  For each lower-order history:

    * if no higher-order counts exist, the normalised lower-order counts
      are written unchanged to ``result.boSection(order - 1)``;
    * otherwise the summed effective counts of all extensions are
      subtracted from the lower-order counts, the remainder is normalised
      and written to ``result.boSection(order - 1)``, and every extension
      is interpolated with that distribution and stored for this order.

    The stored counts of each order are finally normalised into
    ``result.topSection(order)``.

    NOTE(review): this looks like discounting with marginal-preserving
    back-off distributions -- confirm against the model documentation.

    Returns *result* after ``result.finalize()`` has been called.
    """
    assert self.vocabulary
    assert self.highestOrder is not None
    assert self.discounts is not None

    result.vocabulary = self.vocabulary

    def emitTopSection(sectionOrder, storedCounts):
        # Normalise each (values, total) pair and hand it to the top
        # section of the requested order.
        addTop = result.topSection(sectionOrder)
        for hist, (vals, norm) in storedCounts:
            addTop(hist, vals / norm)

    currentCounts = self.makeZeroOrder(allCounts)
    emitTopSection(0, currentCounts)

    for order in range(1, self.highestOrder + 1):
        minCount, discount = self.parametrizeOrder(order)
        previousCounts = currentCounts
        grouped = self.groupedCounts(allCounts, order)
        addBackOff = result.boSection(order - 1)
        currentCounts = self.StoredEffectiveCounts()
        historyCount = 0
        eventCount = 0

        joined = leftJoin(previousCounts, grouped)
        for shortHistory, (shortCounts, shortTotal), extensions in joined:
            if extensions is None:
                # Nothing extends this history at the current order: the
                # lower-order counts are emitted as they are.
                addBackOff(shortHistory, shortCounts / shortTotal)
                continue

            # Keep only the extensions that still have effective counts.
            kept = []
            for oldest, rawValues in extensions:
                effVals, groupTotal = self.effectiveCounts(
                    rawValues, minCount, discount)
                if effVals:
                    kept.append((oldest, effVals, groupTotal))

            marginalCounts = sumCounts([entry[1] for entry in kept])
            marginalTotal = marginalCounts.sum()

            # Remove the mass explained by the higher-order extensions
            # from the lower-order counts and renormalise what is left.
            den = shortTotal - marginalTotal
            surviving = []
            for predicted, shortCount in shortCounts:
                num = shortCount - marginalCounts[predicted]
                if num <= 0.0 or den <= 0.0:
                    self.log(
                        'warning: marginal inversion encountered',
                        repr((shortHistory, predicted, shortCount,
                              marginalCounts[predicted], den)))
                    continue
                surviving.append((predicted, num / den))
            shortDistribution = Counts(surviving)
            addBackOff(shortHistory, shortDistribution)

            for oldest, effVals, groupTotal in kept:
                history = shortHistory + MGram((oldest,))
                # Mass not covered by the effective counts is given to
                # the back-off distribution.
                backOffMass = groupTotal - effVals.sum()
                assert backOffMass >= 0
                interpolated = leftJoinInterpolateAndAddOneSparse(
                    effVals, backOffMass, shortDistribution,
                    self.vocabulary.noneIndex, backOffMass)
                currentCounts.add(history, interpolated, groupTotal)
                historyCount += 1
                eventCount += interpolated.size

        currentCounts.finalize()
        self.log('%d predicted events in %d histories'
                 % (eventCount, historyCount))
        emitTopSection(order, currentCounts)

    result.finalize()
    return result
def build(self, allCounts, result):
    """Build every order of the model into *result* and return it.

    Steps, in the order they are performed:

    1. ``result.vocabulary`` is set to ``self.vocabulary``.
    2. The counts from ``self.makeZeroOrder(allCounts)`` are divided by
       their totals and added to ``result.topSection(0)``.
    3. For ``order`` in ``1 .. self.highestOrder``: the counts grouped by
       ``self.groupedCounts(allCounts, order)`` are left-joined with the
       stored counts of the previous order.  The (possibly reduced)
       lower-order distribution goes to ``result.boSection(order - 1)``;
       the interpolated higher-order counts are collected in a fresh
       ``self.StoredEffectiveCounts()`` and, once finalized, normalised
       into ``result.topSection(order)``.
    4. ``result.finalize()`` is called.

    A warning is logged through ``self.log`` whenever the reduced
    lower-order count or its normaliser is not positive; such entries are
    left out of the lower-order distribution.
    """
    assert self.vocabulary
    assert self.highestOrder is not None
    assert self.discounts is not None

    result.vocabulary = self.vocabulary

    stored = self.makeZeroOrder(allCounts)
    emit = result.topSection(0)
    for context, (mass, norm) in stored:
        emit(context, mass / norm)

    lastOrder = self.highestOrder
    for order in range(1, lastOrder + 1):
        minCount, discount = self.parametrizeOrder(order)
        lowerStored = stored
        byHistory = self.groupedCounts(allCounts, order)
        emit = result.boSection(order - 1)
        stored = self.StoredEffectiveCounts()
        nHistories = 0
        nPredicted = 0

        for (lowerHistory, (lowerCounts, lowerTotal), group) \
                in leftJoin(lowerStored, byHistory):
            if group is None:
                # No counts of this order for the history: pass the
                # normalised lower-order counts through untouched.
                passThrough = lowerCounts / lowerTotal
                emit(lowerHistory, passThrough)
                continue

            # Effective counts per extension; empty ones are dropped.
            effective = []
            perExtensionValues = []
            for oldest, observed in group:
                reduced, total = self.effectiveCounts(
                    observed, minCount, discount)
                if reduced:
                    effective.append((oldest, reduced, total))
                    perExtensionValues.append(reduced)

            marginal = sumCounts(perExtensionValues)
            den = lowerTotal - marginal.sum()

            # Subtract the marginal of the extensions from the lower-order
            # counts; entries that do not stay positive are only logged.
            pairs = []
            for predicted, lowerCount in lowerCounts:
                num = lowerCount - marginal[predicted]
                if num <= 0.0 or den <= 0.0:
                    details = repr((lowerHistory, predicted, lowerCount,
                                    marginal[predicted], den))
                    self.log('warning: marginal inversion encountered',
                             details)
                else:
                    pairs.append((predicted, num / den))
            lowerDistribution = Counts(pairs)
            emit(lowerHistory, lowerDistribution)

            for oldest, reduced, total in effective:
                fullHistory = lowerHistory + MGram((oldest,))
                reducedTotal = reduced.sum()
                # The difference between the total and the effective
                # counts is the mass handed to the lower-order model.
                backOffMass = total - reducedTotal
                assert backOffMass >= 0
                mixed = leftJoinInterpolateAndAddOneSparse(
                    reduced, backOffMass, lowerDistribution,
                    self.vocabulary.noneIndex, backOffMass)
                stored.add(fullHistory, mixed, total)
                nHistories += 1
                nPredicted += mixed.size

        stored.finalize()
        summary = '%d predicted events in %d histories' % (
            nPredicted, nHistories)
        self.log(summary)

        emit = result.topSection(order)
        for context, (mass, norm) in stored:
            emit(context, mass / norm)

    result.finalize()
    return result