Ejemplo n.º 1
0
    seq = assertIsSorted(seq)
    it = iter(seq)

    key, value = it.next()
    current = [value]
    for k, value in it:
        if k == key:
            current.append(value)
        else:
            yield key, current
            key = k
            current = [value]
    yield key, current


# Rebind the module-level name to the result of restartable(aggregate).
# NOTE(review): restartable is defined outside this view; presumably it makes
# the generator function's result re-iterable -- confirm against its definition.
aggregate = restartable(aggregate)


# ===========================================================================
def leftJoin(seqA, seqB):
    seqA = assertIsSortedAndConsolidated(seqA)
    seqB = assertIsSortedAndConsolidated(seqB)
    aIter = iter(seqA)
    bIter = iter(seqB)

    bKey = None
    try:
        for aKey, aValue in aIter:
            while aKey > bKey:
                bKey, bValue = bIter.next()
            if aKey == bKey:
Ejemplo n.º 2
0
class LanguageModelBuilder(object):
    """Turn m-gram counts into per-history distributions stored in a result.

    Typical use is ``make(vocabulary, counts, order)``, which configures the
    builder, estimates one discount object per order and then calls
    ``build``.  The individual setters can be used beforehand to override
    the count cutoffs, the discount classes and the log destination.
    """

    # Minimum count per order; the last entry is reused for all higher
    # orders (see countCutoff).
    minCounts = [1, 1, 2, 3]
    # Discount classes per order; the last entry is reused for all higher
    # orders (see estimateDiscounts).
    discountType = [ZipfGoodTuringDiscounting]

    vocabulary = None
    highestOrder = None
    discounts = None

    def setVocabulary(self, vocabulary):
        """Store the vocabulary and derive the list of predictable words.

        ``predictedWords`` becomes the sorted list of all vocabulary indices
        except the index of ``'<s>'`` and ``vocabulary.noneIndex``.
        """
        self.vocabulary = vocabulary
        self.sentenceStart = vocabulary.index('<s>')
        predictedWords = set(self.vocabulary.indices())
        predictedWords.remove(self.sentenceStart)
        predictedWords.remove(self.vocabulary.noneIndex)
        self.predictedWords = list(predictedWords)
        self.predictedWords.sort()

    def setHighestOrder(self, highestOrder):
        """Set the highest order that ``build`` will produce."""
        self.highestOrder = highestOrder

    def setDiscountTypes(self, types):
        """Replace the list of discount classes used by estimateDiscounts."""
        self.discountType = types

    def estimateDiscounts(self, countsOfCounts):
        """Create and parametrize one discount object per order.

        For each order from 0 to ``highestOrder`` a discount class is
        instantiated and its ``estimateParameters`` is called with
        ``countsOfCounts[order]``; the discount of the previous order is
        passed as ``floor`` (``None`` for order 0).
        """
        self.discounts = []
        lowerOrderDiscount = None
        for order in range(self.highestOrder + 1):
            discount = self.discountType[min(order,
                                             len(self.discountType) - 1)]()
            discount.estimateParameters(countsOfCounts[order],
                                        floor=lowerOrderDiscount)
            self.discounts.append(discount)
            lowerOrderDiscount = discount

    def setCountCutoffs(self, cutoffs):
        """Replace the per-order minimum counts."""
        self.minCounts = cutoffs

    def countCutoff(self, order):
        """Return the minimum count for ``order`` (last entry if beyond)."""
        return self.minCounts[min(order, len(self.minCounts) - 1)]

    def discount(self, order):
        """Return the discount object estimated for ``order``."""
        return self.discounts[order]

    def rawCountsForOrder(self, allCounts, order):
        """Yield ``(history[:order-1], history[order-1], values)`` triples.

        Entries of ``allCounts`` whose history has fewer than ``order``
        elements are skipped.
        """
        for history, values in assertIsSorted(allCounts):
            if len(history) < order:
                continue
            history, oldest = history[:order - 1], history[order - 1]
            yield history, oldest, values

    def groupedCounts(self, allCounts, order):
        """Yield ``(history, [(oldest, summedValues), ...])`` groups.

        Consecutive items from ``rawCountsForOrder`` that share the same
        shortened history form one group; within a group, consecutive items
        with the same ``oldest`` element are summed with a
        ``CountsAccumulator``.  Nothing is yielded when no entry reaches the
        requested order.
        """
        it = self.rawCountsForOrder(allCounts, order)
        try:
            history, oldest, values = next(it)
        except StopIteration:
            # A StopIteration escaping a generator body is turned into a
            # RuntimeError by PEP 479; end the generator explicitly instead.
            return
        group = []
        accu = CountsAccumulator()
        accu.set(values)
        for h, o, v in it:
            if h == history:
                if o == oldest:
                    accu += v
                else:
                    group.append((oldest, accu.sum()))
                    oldest = o
                    accu.set(v)
            else:
                # History changed: flush the pending entry and the group.
                group.append((oldest, accu.sum()))
                yield history, group
                history = h
                oldest = o
                accu.set(v)
                group = []
        # Flush whatever is still pending after the input is exhausted.
        group.append((oldest, accu.sum()))
        yield history, group

    groupedCounts = restartable(groupedCounts)

    def effectiveCounts(self, counts, minCount, discount):
        """Return ``(discountedCounts, total)`` for one set of counts.

        ``total`` is ``counts.sum()`` taken before thresholding; the
        discounted counts are built by applying ``discount`` to every value
        delivered by ``counts.threshold(minCount)``.
        """
        total = counts.sum()
        effectiveCounts = Counts([
            (predicted, discount(value))
            for predicted, value in counts.threshold(minCount)
        ])
        return effectiveCounts, total

    def parametrizeOrder(self, order):
        """Log and return ``(minCount, discount)`` for ``order``."""
        self.log('\nbuilding order', order)

        minCount = self.countCutoff(order)
        self.log('count cutoff: ingoring counts < %d' % minCount)

        discount = self.discount(order)
        self.log('discounting:')
        if self.logFile:
            discount.report(self.logFile)

        return minCount, discount

    def makeZeroOrder(self, allCounts):
        """Build the order-0 entry from the sum of all counts.

        The mass removed by thresholding and discounting is spread evenly
        over all ``predictedWords``.  Returns a one-element list
        ``[(MGram(()), (interpolatedCounts, total))]``.
        """
        minCount, discount = self.parametrizeOrder(0)

        counts = sumLotsOfCounts(map(lambda item: item[1], allCounts))
        effectiveCounts, total = self.effectiveCounts(counts, minCount,
                                                      discount)
        effectiveTotal = effectiveCounts.sum()

        seenWords = set([w for w, n in effectiveCounts])
        assert self.sentenceStart not in seenWords
        unseenWords = set(self.predictedWords) - seenWords
        assert self.sentenceStart not in unseenWords
        self.log('number of unseen words', len(unseenWords))

        # Float literal forces true division even where
        # ``from __future__ import division`` is not in effect.
        pZero = 1.0 / len(self.predictedWords)
        backOffMass = total - effectiveTotal
        nZero = backOffMass * pZero
        interpolatedCounts = []
        for predicted, effectiveCount in effectiveCounts:
            interpolatedCounts.append((predicted, effectiveCount + nZero))
        for predicted in unseenWords:
            interpolatedCounts.append((predicted, nZero))
        interpolatedCounts = Counts(interpolatedCounts)

        self.log('%d predicted events' % (interpolatedCounts.size))
        return [(MGram(()), (interpolatedCounts, total))]

    class StoredEffectiveCounts(object):
        """Temporary-file store of ``(history, values, total)`` records.

        Records are appended with ``add``; after ``finalize`` the object can
        be iterated, yielding ``(history, (values, total))``.  The temporary
        file is removed when the object is garbage-collected.
        """

        def __init__(self):
            # mkstemp returns an already open OS-level descriptor; wrap it
            # instead of opening the path a second time and leaking it.
            fd, self.fname = tempfile.mkstemp('counts')
            self.file = os.fdopen(fd, 'wb')

        def add(self, history, values, total):
            """Append one record to the file."""
            marshal.dump(history, self.file)
            SparseVector.dump(values, self.file)
            marshal.dump(total, self.file)

        def finalize(self):
            """Close the write handle; required before iterating."""
            self.file.close()
            self.file = None

        def __iter__(self):
            assert self.file is None
            # The with-block closes the read handle even when the consumer
            # abandons the generator before reaching the end of the file.
            with open(self.fname, 'rb') as stream:
                while True:
                    try:
                        history = marshal.load(stream)
                        values = SparseVector.load(stream)
                        total = marshal.load(stream)
                    except EOFError:
                        break
                    yield history, (values, total)

        def __del__(self):
            # Close a write handle left open because finalize() was never
            # called, so the file is not unlinked while still open.
            pending = getattr(self, 'file', None)
            if pending is not None:
                pending.close()
                self.file = None
            os.unlink(self.fname)

    def build(self, allCounts, result):
        """Fill ``result`` with the distributions of all orders.

        Requires that the vocabulary, the highest order and the discounts
        have been set.  Order 0 comes from ``makeZeroOrder``; every higher
        order joins the previous order's stored counts with the grouped
        counts of the current order.  ``result`` is finalized and returned.
        """
        assert self.vocabulary
        assert self.highestOrder is not None
        assert self.discounts is not None

        result.vocabulary = self.vocabulary

        allEffectiveCounts = self.makeZeroOrder(allCounts)

        result_add = result.topSection(0)
        for history, (values, total) in allEffectiveCounts:
            probabilities = values / total
            result_add(history, probabilities)

        for order in range(1, self.highestOrder + 1):
            minCount, discount = self.parametrizeOrder(order)

            allLowerOrderEffectiveCounts = allEffectiveCounts
            groupedCounts = self.groupedCounts(allCounts, order)

            result_add = result.boSection(order - 1)
            allEffectiveCounts = self.StoredEffectiveCounts()
            nHistories = nPredicted = 0
            joined = leftJoin(allLowerOrderEffectiveCounts, groupedCounts)
            for (lowerOrderHistory,
                 (lowerOrderEffectiveCounts, lowerOrderTotal),
                 counts) in joined:
                if counts is None:
                    # No grouped counts for this history: store the plain
                    # normalized lower-order counts.
                    lowerOrderDistribution = lowerOrderEffectiveCounts / \
                        lowerOrderTotal
                    result_add(lowerOrderHistory, lowerOrderDistribution)
                    continue

                # Keep only the extensions that retain some counts after
                # thresholding and discounting.
                effectiveCounts = []
                for oldest, values in counts:
                    effVals, total = self.effectiveCounts(
                        values, minCount, discount)
                    if effVals:
                        effectiveCounts.append((oldest, effVals, total))

                effectiveMarginalCounts = sumCounts(
                    [values for oldest, values, total in effectiveCounts])
                effectiveMarginalTotal = effectiveMarginalCounts.sum()

                # Subtract what the higher-order extensions already account
                # for; non-positive remainders are logged and dropped.
                lowerOrderDistribution = []
                den = lowerOrderTotal - effectiveMarginalTotal
                for predicted, lowerOrderEffectiveCount in lowerOrderEffectiveCounts:
                    num = lowerOrderEffectiveCount - effectiveMarginalCounts[
                        predicted]
                    if num <= 0.0 or den <= 0.0:
                        self.log(
                            'warning: marginal inversion encountered',
                            repr((lowerOrderHistory, predicted,
                                  lowerOrderEffectiveCount,
                                  effectiveMarginalCounts[predicted], den)))
                    else:
                        lowerOrderDistribution.append((predicted, num / den))
                lowerOrderDistribution = Counts(lowerOrderDistribution)

                result_add(lowerOrderHistory, lowerOrderDistribution)

                for oldest, effectiveCountsGroup, total in effectiveCounts:
                    history = lowerOrderHistory + MGram((oldest, ))
                    effectiveTotal = effectiveCountsGroup.sum()
                    backOffMass = total - effectiveTotal
                    assert backOffMass >= 0

                    interpolatedCounts = leftJoinInterpolateAndAddOneSparse(
                        effectiveCountsGroup, backOffMass,
                        lowerOrderDistribution, self.vocabulary.noneIndex,
                        backOffMass)

                    allEffectiveCounts.add(history, interpolatedCounts, total)
                    nHistories += 1
                    nPredicted += interpolatedCounts.size

            allEffectiveCounts.finalize()
            self.log('%d predicted events in %d histories' %
                     (nPredicted, nHistories))

            result_add = result.topSection(order)
            for history, (values, total) in allEffectiveCounts:
                probabilities = values / total
                result_add(history, probabilities)

        result.finalize()
        return result

    # Destination for log(); None disables logging.
    logFile = None

    def setLogFile(self, f):
        """Set the file-like object that ``log`` writes to."""
        self.logFile = f

    def log(self, *args):
        """Write the space-joined ``args`` as one line to ``logFile``."""
        if self.logFile is not None:
            print(' '.join(map(str, args)), file=self.logFile)

    def make(self, vocabulary, counts, order):
        """Configure the builder from ``counts`` and return a built ``Lm``.

        Counts-of-counts are computed for every order from 0 to ``order``
        and used to estimate the discounts before ``build`` is run on the
        contracted, stored counts.
        """
        self.setVocabulary(vocabulary)
        self.setHighestOrder(order)
        # A distinct loop name keeps the ``order`` parameter from being
        # shadowed (or, with Python 2 list comprehensions, rebound).
        coc = [
            mGramCounts.countsOfCounts(
                mGramCounts.mGramReduceToOrder(counts, reducedOrder))
            for reducedOrder in range(order + 1)
        ]
        self.estimateDiscounts(coc)
        result = Lm(order)
        counts = store(contract(counts))
        self.build(counts, result)
        return result
Ejemplo n.º 3
0
    merge items of a sorted iterator
    """
    seq = assertIsSorted(seq)
    it = iter(seq)

    key, value = it.next()
    current = [value]
    for k, value in it:
	if k == key:
	    current.append(value)
	else:
	    yield key, current
	    key = k
	    current = [value]
    yield key, current
# Rebind the module-level name to the result of restartable(aggregate).
# NOTE(review): restartable is defined outside this view; presumably it makes
# the generator function's result re-iterable -- confirm against its definition.
aggregate = restartable(aggregate)

# ===========================================================================
def leftJoin(seqA, seqB):
    seqA = assertIsSortedAndConsolidated(seqA)
    seqB = assertIsSortedAndConsolidated(seqB)
    aIter = iter(seqA)
    bIter = iter(seqB)

    bKey = None
    try:
	for aKey, aValue in aIter:
	    while aKey > bKey:
		bKey, bValue = bIter.next()
	    if aKey == bKey:
		yield aKey, aValue, bValue
Ejemplo n.º 4
0
def contract(seq):
    """Group ``((history, predicted), value)`` items by history.

    Consecutive items sharing a history are collected and yielded as
    ``(history, Counts([(predicted, value), ...]))``.  An empty ``seq``
    yields nothing.

    Raises:
        NonMonotonousHistoriesError: when a history compares smaller than
            the one before it.
    """
    it = iter(seq)
    try:
        # The next() builtin works on both Python 2.6+ and Python 3
        # iterators, unlike the Python-2-only ``it.next()`` method.
        (history, predicted), value = next(it)
    except StopIteration:
        # Empty input: end the generator explicitly, since a StopIteration
        # escaping a generator body becomes RuntimeError under PEP 479.
        return
    values = [(predicted, value)]
    for (h, p), v in it:
        if h != history:
            if h < history:
                raise NonMonotonousHistoriesError(history, h)
            yield history, Counts(values)
            history = h
            values = []
        values.append((p, v))
    # Flush the group that was still being collected.
    yield history, Counts(values)


# Rebind the module-level name to the result of restartable(contract).
# NOTE(review): restartable is defined outside this view; presumably it makes
# the generator function's result re-iterable -- confirm against its definition.
contract = restartable(contract)


class CountsAccumulator(object):
    def __init__(self):
        self.terms = [[], [], []]

    def set(self, initial=None):
        self.terms = [[initial], [], []]

    def shrink(self):
        for i in range(3):
            if len(self.terms[i]) < 64:
                break
            s = sumCounts(self.terms[i])
            try:
Ejemplo n.º 5
0
    pass

def contract(seq):
    """Group ``((history, predicted), value)`` items by history.

    Consecutive items sharing a history are collected and yielded as
    ``(history, Counts([(predicted, value), ...]))``.  An empty ``seq``
    yields nothing.

    Raises:
        NonMonotonousHistoriesError: when a history compares smaller than
            the one before it.
    """
    it = iter(seq)
    try:
        # The next() builtin works on both Python 2.6+ and Python 3
        # iterators, unlike the Python-2-only ``it.next()`` method.
        (history, predicted), value = next(it)
    except StopIteration:
        # Empty input: end the generator explicitly, since a StopIteration
        # escaping a generator body becomes RuntimeError under PEP 479.
        return
    values = [(predicted, value)]
    for (h, p), v in it:
        if h != history:
            if h < history:
                raise NonMonotonousHistoriesError(history, h)
            yield history, Counts(values)
            history = h
            values = []
        values.append((p, v))
    # Flush the group that was still being collected.
    yield history, Counts(values)
# Rebind the module-level name to the result of restartable(contract).
# NOTE(review): restartable is defined outside this view; presumably it makes
# the generator function's result re-iterable -- confirm against its definition.
contract = restartable(contract)

class CountsAccumulator(object):
    def __init__(self):
        self.terms = [ [], [], [] ]

    def set(self, initial = None):
        self.terms = [ [initial], [], [] ]

    def shrink(self):
        for i in range(3):
            if len(self.terms[i]) < 64:
                break
            s = sumCounts(self.terms[i])
            try:
                self.terms[i+1].append(s)