seq = assertIsSorted(seq) it = iter(seq) key, value = it.next() current = [value] for k, value in it: if k == key: current.append(value) else: yield key, current key = k current = [value] yield key, current aggregate = restartable(aggregate) # =========================================================================== def leftJoin(seqA, seqB): seqA = assertIsSortedAndConsolidated(seqA) seqB = assertIsSortedAndConsolidated(seqB) aIter = iter(seqA) bIter = iter(seqB) bKey = None try: for aKey, aValue in aIter: while aKey > bKey: bKey, bValue = bIter.next() if aKey == bKey:
class LanguageModelBuilder(object):
    """Build a back-off m-gram language model from sorted m-gram counts.

    Workflow (see make()): set a vocabulary and highest order, estimate
    per-order discounts from counts-of-counts, then build() walks the
    orders from 0 (zero-gram / uniform) up to highestOrder, discounting
    counts, redistributing back-off mass, and writing each order's
    probability and back-off sections into the result LM object.
    """

    # Per-order count cutoffs: counts below countCutoff(order) are dropped
    # before discounting.  Orders beyond the list reuse the last entry.
    minCounts = [1, 1, 2, 3]
    # Per-order discounting classes; orders beyond the list reuse the last
    # entry.  Overridden via setDiscountTypes().
    discountType = [ZipfGoodTuringDiscounting]
    # Set via setVocabulary() / setHighestOrder() / estimateDiscounts().
    vocabulary = None
    highestOrder = None
    discounts = None

    def setVocabulary(self, vocabulary):
        """Store the vocabulary and precompute the set of predicted words.

        Predicted words are all vocabulary indices except the sentence-start
        symbol '<s>' (never predicted, only conditioned on) and the
        vocabulary's noneIndex.  The list is kept sorted.
        """
        self.vocabulary = vocabulary
        self.sentenceStart = vocabulary.index('<s>')
        predictedWords = set(self.vocabulary.indices())
        predictedWords.remove(self.sentenceStart)
        predictedWords.remove(self.vocabulary.noneIndex)
        self.predictedWords = list(predictedWords)
        self.predictedWords.sort()

    def setHighestOrder(self, highestOrder):
        # highest m-gram order to build (0 = unigram level in this scheme)
        self.highestOrder = highestOrder

    def setDiscountTypes(self, types):
        # list of discounting classes, indexed by order (last entry reused)
        self.discountType = types

    def estimateDiscounts(self, countsOfCounts):
        """Instantiate and fit one discount object per order 0..highestOrder.

        Each discount estimates its parameters from that order's
        counts-of-counts, floored by the previous (lower) order's discount.
        """
        self.discounts = []
        lowerOrderDiscount = None
        for order in range(self.highestOrder + 1):
            discount = self.discountType[min(order, len(self.discountType) - 1)]()
            discount.estimateParameters(countsOfCounts[order],
                                        floor=lowerOrderDiscount)
            self.discounts.append(discount)
            lowerOrderDiscount = discount

    def setCountCutoffs(self, cutoffs):
        self.minCounts = cutoffs

    def countCutoff(self, order):
        """Return the minimum count for this order (last cutoff reused)."""
        return self.minCounts[min(order, len(self.minCounts) - 1)]

    def discount(self, order):
        """Return the fitted discount object for this order."""
        return self.discounts[order]

    def rawCountsForOrder(self, allCounts, order):
        """Yield (history, oldest, values) for m-grams of at least `order`.

        The full history tuple is split into the (order-1)-gram condition
        part and its oldest word history[order - 1]; shorter histories are
        skipped.  Input must be sorted (checked by assertIsSorted).
        """
        for history, values in assertIsSorted(allCounts):
            if len(history) < order:
                continue
            history, oldest = history[:order - 1], history[order - 1]
            yield history, oldest, values

    def groupedCounts(self, allCounts, order):
        """Group raw counts by (order-1)-gram history.

        Yields (history, group) where group is a list of
        (oldest, summed counts) pairs, summed via a CountsAccumulator.
        Relies on rawCountsForOrder() yielding histories in sorted order.
        """
        it = self.rawCountsForOrder(allCounts, order)
        history, oldest, values = next(it)
        group = []
        accu = CountsAccumulator()
        accu.set(values)
        for h, o, v in it:
            if h == history:
                if o == oldest:
                    # same (history, oldest): keep accumulating
                    accu += v
                else:
                    # new oldest word within the same history
                    group.append((oldest, accu.sum()))
                    oldest = o
                    accu.set(v)
            else:
                # history changed: flush the finished group
                group.append((oldest, accu.sum()))
                yield history, group
                history = h
                oldest = o
                accu.set(v)
                group = []
        # flush the final group
        group.append((oldest, accu.sum()))
        yield history, group
    groupedCounts = restartable(groupedCounts)

    def effectiveCounts(self, counts, minCount, discount):
        """Apply cutoff and discounting to one counts vector.

        Returns (discounted counts restricted to entries >= minCount,
        undiscounted total of the original counts).
        """
        total = counts.sum()
        effectiveCounts = Counts([
            (predicted, discount(value))
            for predicted, value in counts.threshold(minCount)
        ])
        return effectiveCounts, total

    def parametrizeOrder(self, order):
        """Log and return (minCount, discount) for the given order."""
        self.log('\nbuilding order', order)
        minCount = self.countCutoff(order)
        self.log('count cutoff: ingoring counts < %d' % minCount)
        discount = self.discount(order)
        self.log('discounting:')
        if self.logFile:
            discount.report(self.logFile)
        return minCount, discount

    def makeZeroOrder(self, allCounts):
        """Build the order-0 model: discounted unigram counts interpolated
        with a uniform zero-gram distribution over all predicted words.

        The mass removed by cutoff/discounting (total - effectiveTotal) is
        spread uniformly: every predicted word, seen or unseen, receives
        nZero = backOffMass / |predictedWords|.
        Returns a one-element list [(empty MGram, (counts, total))] shaped
        like the per-order effective-counts streams used in build().
        """
        minCount, discount = self.parametrizeOrder(0)
        counts = sumLotsOfCounts(map(lambda item: item[1], allCounts))
        effectiveCounts, total = self.effectiveCounts(counts, minCount, discount)
        effectiveTotal = effectiveCounts.sum()
        seenWords = set([w for w, n in effectiveCounts])
        # sentence start must never be predicted
        assert self.sentenceStart not in seenWords
        unseenWords = set(self.predictedWords) - seenWords
        assert self.sentenceStart not in unseenWords
        self.log('number of unseen words', len(unseenWords))
        pZero = 1 / len(self.predictedWords)
        backOffMass = total - effectiveTotal
        nZero = backOffMass * pZero
        interpolatedCounts = []
        for predicted, effectiveCount in effectiveCounts:
            interpolatedCounts.append((predicted, effectiveCount + nZero))
        for predicted in unseenWords:
            interpolatedCounts.append((predicted, nZero))
        interpolatedCounts = Counts(interpolatedCounts)
        self.log('%d predicted events' % (interpolatedCounts.size))
        return [(MGram(()), (interpolatedCounts, total))]

    class StoredEffectiveCounts(object):
        """Temp-file-backed store for (history, values, total) records.

        Records are appended with add(), the writer is closed with
        finalize(), and afterwards the store can be iterated (repeatedly)
        yielding (history, (values, total)).  Histories and totals are
        serialized with marshal, values with SparseVector.dump/load.
        """

        def __init__(self):
            # NOTE(review): mkstemp returns (fd, path); only the path is
            # kept, so the open descriptor at [0] is never closed — leaks
            # one fd per instance.  Verify before relying on this class
            # for many instances.
            self.fname = tempfile.mkstemp('counts')[1]
            self.file = open(self.fname, 'wb')

        def add(self, history, values, total):
            """Append one (history, values, total) record."""
            marshal.dump(history, self.file)
            SparseVector.dump(values, self.file)
            marshal.dump(total, self.file)

        def finalize(self):
            """Close the write handle; required before iterating."""
            self.file.close()
            self.file = None

        def __iter__(self):
            # only valid after finalize()
            assert self.file is None
            file = open(self.fname, 'rb')
            while True:
                try:
                    history = marshal.load(file)
                    values = SparseVector.load(file)
                    total = marshal.load(file)
                    yield history, (values, total)
                except EOFError:
                    # marshal raises EOFError at end of file
                    break
            file.close()

        def __del__(self):
            # best-effort cleanup of the backing temp file
            os.unlink(self.fname)

    def build(self, allCounts, result):
        """Build all orders of the LM into `result` and return it.

        For each order > 0, joins the lower-order effective counts with
        the grouped counts of this order (leftJoin: `counts` is None for
        lower-order histories with no higher-order counts), discounts
        them, and computes the lower-order back-off distribution under a
        marginal constraint: num/den with the higher-order effective
        marginals subtracted.  Non-positive numerators/denominators are
        logged as 'marginal inversion' and skipped.  The interpolated
        higher-order counts are spooled to a StoredEffectiveCounts and
        become the lower order of the next iteration.
        """
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None
        result.vocabulary = self.vocabulary
        allEffectiveCounts = self.makeZeroOrder(allCounts)
        result_add = result.topSection(0)
        for history, (values, total) in allEffectiveCounts:
            probabilities = values / total
            result_add(history, probabilities)
        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)
            # previous iteration's output becomes this order's lower order
            allLowerOrderEffectiveCounts = allEffectiveCounts
            groupedCounts = self.groupedCounts(allCounts, order)
            result_add = result.boSection(order - 1)
            allEffectiveCounts = self.StoredEffectiveCounts()
            nHistories = nPredicted = 0
            for (lowerOrderHistory, (lowerOrderEffectiveCounts, lowerOrderTotal), counts) \
                    in leftJoin(allLowerOrderEffectiveCounts, groupedCounts):
                if counts is None:
                    # no higher-order counts for this history: plain
                    # relative frequencies as back-off distribution
                    lowerOrderDistribution = lowerOrderEffectiveCounts / \
                        lowerOrderTotal
                    result_add(lowerOrderHistory, lowerOrderDistribution)
                    continue
                # discount each (history, oldest) group; drop empty results
                effectiveCounts = []
                for oldest, values in counts:
                    effVals, total = self.effectiveCounts(
                        values, minCount, discount)
                    if effVals:
                        effectiveCounts.append((oldest, effVals, total))
                effectiveMarginalCounts = sumCounts(
                    [values for oldest, values, total in effectiveCounts])
                effectiveMarginalTotal = effectiveMarginalCounts.sum()
                # marginal-constraint back-off distribution: subtract the
                # mass already explained by the higher-order counts
                lowerOrderDistribution = []
                den = lowerOrderTotal - effectiveMarginalTotal
                for predicted, lowerOrderEffectiveCount in lowerOrderEffectiveCounts:
                    num = lowerOrderEffectiveCount - effectiveMarginalCounts[
                        predicted]
                    if num <= 0.0 or den <= 0.0:
                        self.log(
                            'warning: marginal inversion encountered',
                            repr((lowerOrderHistory, predicted,
                                  lowerOrderEffectiveCount,
                                  effectiveMarginalCounts[predicted], den)))
                    else:
                        lowerOrderDistribution.append((predicted, num / den))
                lowerOrderDistribution = Counts(lowerOrderDistribution)
                result_add(lowerOrderHistory, lowerOrderDistribution)
                # interpolate each higher-order group with the back-off
                # distribution and spool it for the next iteration
                for oldest, effectiveCountsGroup, total in effectiveCounts:
                    history = lowerOrderHistory + MGram((oldest, ))
                    effectiveTotal = effectiveCountsGroup.sum()
                    backOffMass = total - effectiveTotal
                    assert backOffMass >= 0
                    interpolatedCounts = leftJoinInterpolateAndAddOneSparse(
                        effectiveCountsGroup, backOffMass,
                        lowerOrderDistribution,
                        self.vocabulary.noneIndex, backOffMass)
                    allEffectiveCounts.add(history, interpolatedCounts, total)
                    nHistories += 1
                    nPredicted += interpolatedCounts.size
            allEffectiveCounts.finalize()
            self.log('%d predicted events in %d histories' %
                     (nPredicted, nHistories))
            result_add = result.topSection(order)
            for history, (values, total) in allEffectiveCounts:
                probabilities = values / total
                result_add(history, probabilities)
        result.finalize()
        return result

    # optional log sink; None disables all logging
    logFile = None

    def setLogFile(self, f):
        self.logFile = f

    def log(self, *args):
        """Write a space-joined message to logFile, if one is set."""
        if self.logFile is not None:
            print(' '.join(map(str, args)), file=self.logFile)

    def make(self, vocabulary, counts, order):
        """Convenience driver: configure, estimate discounts, and build.

        Returns the finished Lm(order) built from `counts`.
        """
        self.setVocabulary(vocabulary)
        self.setHighestOrder(order)
        # NOTE: the comprehension variable shadows the `order` parameter;
        # range(order + 1) still sees the parameter (evaluated in the
        # enclosing scope), so this iterates orders 0..order inclusive.
        coc = [
            mGramCounts.countsOfCounts(
                mGramCounts.mGramReduceToOrder(counts, order))
            for order in range(order + 1)
        ]
        self.estimateDiscounts(coc)
        result = Lm(order)
        counts = store(contract(counts))
        self.build(counts, result)
        return result
merge items of a sorted iterator """ seq = assertIsSorted(seq) it = iter(seq) key, value = it.next() current = [value] for k, value in it: if k == key: current.append(value) else: yield key, current key = k current = [value] yield key, current aggregate = restartable(aggregate) # =========================================================================== def leftJoin(seqA, seqB): seqA = assertIsSortedAndConsolidated(seqA) seqB = assertIsSortedAndConsolidated(seqB) aIter = iter(seqA) bIter = iter(seqB) bKey = None try: for aKey, aValue in aIter: while aKey > bKey: bKey, bValue = bIter.next() if aKey == bKey: yield aKey, aValue, bValue
def contract(seq): it = iter(seq) (history, predicted), value = it.next() values = [(predicted, value)] for (h, p), v in it: if h != history: if h < history: raise NonMonotonousHistoriesError(history, h) yield history, Counts(values) history = h values = [] values.append((p, v)) yield history, Counts(values) contract = restartable(contract) class CountsAccumulator(object): def __init__(self): self.terms = [[], [], []] def set(self, initial=None): self.terms = [[initial], [], []] def shrink(self): for i in range(3): if len(self.terms[i]) < 64: break s = sumCounts(self.terms[i]) try:
pass def contract(seq): it = iter(seq) (history, predicted), value = it.next() values = [(predicted, value)] for (h, p), v in it: if h != history: if h < history: raise NonMonotonousHistoriesError(history, h) yield history, Counts(values) history = h values = [] values.append((p, v)) yield history, Counts(values) contract = restartable(contract) class CountsAccumulator(object): def __init__(self): self.terms = [ [], [], [] ] def set(self, initial = None): self.terms = [ [initial], [], [] ] def shrink(self): for i in range(3): if len(self.terms[i]) < 64: break s = sumCounts(self.terms[i]) try: self.terms[i+1].append(s)