Esempio n. 1
0
def main(options, args):
    """Drive a ``LanguageModelBuilder`` from parsed command-line options.

    Configures the builder (log file, vocabulary, highest order, optional
    count cutoffs), obtains the m-gram counts and the counts-of-counts,
    estimates discounts and finally calls ``builder.build`` with the counts
    and a language-model sink.

    Args:
        options: Object providing the attributes read here: ``vocabulary``,
            ``order``, ``count_cutoffs``, ``read``, ``counts_of_counts``
            and ``lm``.
        args: Positional arguments; not used by this function.
    """
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)

    # The builder is given ``order - 1`` (presumably a zero-based order
    # index -- the warning below reports ``maximumOrder + 1``).
    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        # The option is a whitespace-separated string of integers.
        cutoffs = [int(field) for field in options.count_cutoffs.split()]
        builder.setCountCutoffs(cutoffs)

    # Use the '.bin' file next to the counts file when it already exists;
    # otherwise load from ``options.read`` and hand the '.bin' path to
    # ``loadCounts``.
    binaryCountFile = options.read + '.bin'
    counts = (
        StoredCounts(binaryCountFile)
        if os.path.isfile(binaryCountFile)
        else loadCounts(options.read, vocabulary, binaryCountFile)
    )

    if not options.counts_of_counts:
        # No file given: derive one counts-of-counts entry per order.
        coc = []
        for order in range(options.order):
            reduced = mGramCounts.mGramReduceToOrder(counts, order)
            coc.append(mGramCounts.countsOfCounts(reduced))
    else:
        # NOTE(review): eval() executes arbitrary Python read from the
        # file, with this function's local names in scope. Only use with
        # trusted files; consider ast.literal_eval if the file holds a
        # plain literal.
        coc = eval(gOpenIn(options.counts_of_counts).read())

    # Clamp the builder's order to what the counts-of-counts support.
    highestCounted = maximumCountsOrder(coc)
    if builder.highestOrder > highestCounted:
        print('warning: no counts for orders above %d' % (highestCounted + 1))
        builder.setHighestOrder(highestCounted)

    builder.estimateDiscounts(coc)

    # Without an ``lm`` option the model is built into a dummy sink.
    lmSink = makeLmWriter(options) if options.lm else LmDummy()

    builder.build(counts, lmSink)

    if __debug__ and False:  ### TESTING
        # Deliberately disabled normalisation check, kept for debugging.
        print('verifying normalization ...', file=sys.stdout)
        verifier = Lm(lmSink)
        verifier.checkNormalisation()
Esempio n. 2
0
def main(options, args):
    """Drive a ``LanguageModelBuilder`` from parsed command-line options.

    Configures the builder (log file, vocabulary, highest order, optional
    count cutoffs), obtains the m-gram counts and the counts-of-counts,
    estimates discounts and finally calls ``builder.build`` with the counts
    and a language-model sink.

    Written in Python 3 syntax with space-only indentation: print is used
    as a function and the count cutoffs are materialised as a list.

    Args:
        options: Object providing the attributes read here: ``vocabulary``,
            ``order``, ``count_cutoffs``, ``read``, ``counts_of_counts``
            and ``lm``.
        args: Positional arguments; not used by this function.
    """
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)

    # The builder is given ``order - 1`` (presumably a zero-based order
    # index -- the warning below reports ``maximumOrder + 1``).
    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        # list(...) keeps the list that Python 2's map() used to return;
        # in Python 3 a bare map() is a one-shot iterator.
        cutoffs = list(map(int, options.count_cutoffs.split()))
        builder.setCountCutoffs(cutoffs)

    # Use the '.bin' file next to the counts file when it already exists;
    # otherwise load from ``options.read`` and hand the '.bin' path to
    # ``loadCounts``.
    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)

    if options.counts_of_counts:
        # NOTE(review): eval() executes arbitrary Python read from the
        # file, with this function's local names in scope. Only use with
        # trusted files; consider ast.literal_eval if the file holds a
        # plain literal.
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        # No file given: derive one counts-of-counts entry per order.
        coc = [
            mGramCounts.countsOfCounts(
                mGramCounts.mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]

    # Clamp the builder's order to what the counts-of-counts support.
    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print('warning: no counts for orders above %d' % (maximumOrder + 1))
        builder.setHighestOrder(maximumOrder)

    builder.estimateDiscounts(coc)

    # Without an ``lm`` option the model is built into a dummy sink.
    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()

    builder.build(counts, lm)

    if __debug__ and False:  ### TESTING
        # Deliberately disabled normalisation check, kept for debugging.
        print('verifying normalization ...', file=sys.stdout)
        lm2 = Lm(lm)
        lm2.checkNormalisation()