def main(options, args):
    """Estimate a language model from m-gram counts and pass it to a writer.

    Attributes read from ``options`` in this function:

    * ``vocabulary``       -- passed to ``loadVocabulary``.
    * ``order``            -- model order; the builder's highest order is set
      to ``order - 1``.
    * ``count_cutoffs``    -- optional whitespace-separated integers.
    * ``read``             -- counts source; ``read + '.bin'`` is used instead
      when that file already exists.
    * ``counts_of_counts`` -- optional file whose content is evaluated to
      obtain the counts-of-counts; otherwise they are computed from the
      counts, one entry per order.
    * ``lm``               -- when truthy, ``makeLmWriter(options)`` provides
      the output object; otherwise an ``LmDummy`` is used.

    ``args`` is accepted but not used.
    """
    # NOTE: names bound before the eval() call below are visible to the
    # evaluated expression, so they are kept stable on purpose.
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)
    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        cutoffs = [int(field) for field in options.count_cutoffs.split()]
        builder.setCountCutoffs(cutoffs)

    # Prefer the binary counts file when it is already on disk.
    binaryCountFile = options.read + '.bin'
    counts = (
        StoredCounts(binaryCountFile)
        if os.path.isfile(binaryCountFile)
        else loadCounts(options.read, vocabulary, binaryCountFile)
    )

    if not options.counts_of_counts:
        # Derive counts-of-counts for each order 0 .. options.order - 1.
        coc = []
        for order in range(options.order):
            reduced = mGramCounts.mGramReduceToOrder(counts, order)
            coc.append(mGramCounts.countsOfCounts(reduced))
    else:
        coc = eval(gOpenIn(options.counts_of_counts).read())

    # Lower the builder's order if the counts do not reach that far.
    topOrder = maximumCountsOrder(coc)
    if builder.highestOrder > topOrder:
        print('warning: no counts for orders above %d' % (topOrder + 1))
        builder.setHighestOrder(topOrder)

    builder.estimateDiscounts(coc)

    sink = makeLmWriter(options) if options.lm else LmDummy()
    builder.build(counts, sink)

    if __debug__ and False:  ### TESTING (deliberately disabled)
        print('verifying normalization ...', file=sys.stdout)
        checker = Lm(sink)
        checker.checkNormalisation()
def main(options, args):
    """Build a language model from m-gram counts.

    The steps, in order:

    1. Create a ``LanguageModelBuilder`` logging to ``sys.stdout`` and
       configure it with the vocabulary loaded from ``options.vocabulary``,
       a highest order of ``options.order - 1`` and, if given, the count
       cutoffs parsed from ``options.count_cutoffs``.
    2. Obtain the counts: from ``options.read + '.bin'`` when that file
       exists, otherwise through ``loadCounts``.
    3. Obtain the counts-of-counts: from the file named by
       ``options.counts_of_counts`` when set, otherwise computed from the
       counts for every order in ``range(options.order)``.
    4. Reduce the builder's highest order (with a warning) if the
       counts-of-counts do not cover it, estimate discounts and build.

    Args:
        options: parsed command-line options; this function reads
            ``vocabulary``, ``order``, ``count_cutoffs``, ``read``,
            ``counts_of_counts`` and ``lm`` (``makeLmWriter`` receives the
            whole object).
        args: positional arguments; unused here.
    """
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)
    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        # map() is lazy in Python 3; materialize so the builder receives a
        # real list, as it did under Python 2.
        cutoffs = list(map(int, options.count_cutoffs.split()))
        builder.setCountCutoffs(cutoffs)

    # Use the binary counts file if one is already present.
    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)

    if options.counts_of_counts:
        # SECURITY NOTE(review): eval() executes arbitrary Python found in
        # this file. Only point --counts-of-counts at trusted files; consider
        # ast.literal_eval if the content is always a plain literal.
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        coc = [
            mGramCounts.countsOfCounts(
                mGramCounts.mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]

    # The model cannot use orders for which no counts are available.
    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print('warning: no counts for orders above %d' % (maximumOrder + 1))
        builder.setHighestOrder(maximumOrder)

    builder.estimateDiscounts(coc)

    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()

    builder.build(counts, lm)

    if __debug__ and False:  ### TESTING (deliberately disabled)
        print('verifying normalization ...', file=sys.stdout)
        lm2 = Lm(lm)
        lm2.checkNormalisation()