def test_ngrams(self):
    """Parse the fixture text with a 3-gram maker and verify unigram statistics."""
    ngram_maker = NgramMaker(3)
    ngram_maker.parse(self.text)

    # Spot-check the stored counts of two individual unigrams.
    expected_counts = (
        (('in',), 3),
        (('the',), 2),
    )
    for unigram, expected in expected_counts:
        self.assertEqual(expected, ngram_maker.at(unigram).count)

    # The number of distinct unigrams must be exact: 38 plus the three
    # special tokens <s>, </s> and <punc>.
    unigrams = ngram_maker.storage().get_n_grams(1)
    self.assertEqual(38 + 3, len(unigrams))
def main(args):
    """Build an n-gram model from a corpus and write it to a text file.

    Recognised options in ``args`` (each must be followed by its value):

        -c   corpus location handed to ``Reader`` (default ``'.'``)
        -lm  path of the output file (default ``'lm.txt'``)
        -o   maximum n-gram order, a positive integer (default ``3``)
        -gt  comma-separated integers handed to ``GoodTuring``; there must
             be one per order, otherwise ``1`` is used for every order

    Every item read from ``Reader`` is normalised, fed to ``NgramMaker``,
    and the resulting storage is passed through ``GoodTuring``.  The output
    uses an ARPA-like layout: a "data" header with the number of distinct
    n-grams per order, one section per order listing probability and
    n-gram, and an "end" footer.

    Exits with status 1 (via ``sys.exit``) when the maximum order is not
    positive or when an option is missing its value.  A non-integer value
    for ``-o`` or ``-gt`` raises ``ValueError``.
    """
    import sys  # local import: the module's top-level imports are not visible from here

    corpus = '.'
    outFile = 'lm.txt'
    max_order = 3
    gtNmin = [1, 1, 1]

    # Every position is inspected (as before), so option order is free.
    for i, arg in enumerate(args):
        if arg not in ('-c', '-lm', '-o', '-gt'):
            continue
        if i + 1 >= len(args):
            # Previously this surfaced as an uncaught IndexError.
            print('Missing value for option %s' % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        else:  # '-gt'
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # Zero is rejected too, so the requirement is "positive".
        print('Max order must be positive')
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    print('Parameters:')
    print('corpus: %s' % (corpus,))
    print('outFile: %s' % (outFile,))
    print('max_order: %s' % (max_order,))
    print('gtNmin: %s' % (gtNmin,))

    reader = Reader(corpus)
    ng_maker = NgramMaker(max_order)
    for document in reader:
        ng_maker.parse(Normalizer.normalize(document))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order = ng_storage.max_order()
    orders = range(1, top_order + 1)

    # The context manager closes the file even if a write fails.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" % (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Orders below the maximum carry an extra trailing "0" column.
            if ng_ord < top_order:
                line_format = "%.8f\t%s\t0\n"
            else:
                line_format = "%.8f\t%s\n"
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write(line_format % (ngram.prob, ' '.join(ng)))

        out_file.write('\n\\end\\\n')
def main(args):
    """Build an n-gram model from a corpus and write it to a text file.

    Recognised options in ``args`` (each must be followed by its value):

        -c   corpus location handed to ``Reader`` (default ``'.'``)
        -lm  path of the output file (default ``'lm.txt'``)
        -o   maximum n-gram order, a positive integer (default ``3``)
        -gt  comma-separated integers handed to ``GoodTuring``; there must
             be one per order, otherwise ``1`` is used for every order

    Every item read from ``Reader`` is normalised, fed to ``NgramMaker``,
    and the resulting storage is passed through ``GoodTuring``.  The output
    uses an ARPA-like layout: a "data" header with the number of distinct
    n-grams per order, one section per order listing probability and
    n-gram, and an "end" footer.

    Exits with status 1 (via ``sys.exit``) when the maximum order is not
    positive or when an option is missing its value.  A non-integer value
    for ``-o`` or ``-gt`` raises ``ValueError``.
    """
    import sys  # local import: the module's top-level imports are not visible from here

    corpus = '.'
    outFile = 'lm.txt'
    max_order = 3
    gtNmin = [1, 1, 1]

    # Every position is inspected (as before), so option order is free.
    for i, arg in enumerate(args):
        if arg not in ('-c', '-lm', '-o', '-gt'):
            continue
        if i + 1 >= len(args):
            # Previously this surfaced as an uncaught IndexError.
            print('Missing value for option %s' % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        else:  # '-gt'
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # Zero is rejected too, so the requirement is "positive".
        print('Max order must be positive')
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    print('Parameters:')
    print('corpus: %s' % (corpus,))
    print('outFile: %s' % (outFile,))
    print('max_order: %s' % (max_order,))
    print('gtNmin: %s' % (gtNmin,))

    reader = Reader(corpus)
    ng_maker = NgramMaker(max_order)
    for document in reader:
        ng_maker.parse(Normalizer.normalize(document))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order = ng_storage.max_order()
    orders = range(1, top_order + 1)

    # The context manager closes the file even if a write fails.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" % (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Orders below the maximum carry an extra trailing "0" column.
            if ng_ord < top_order:
                line_format = "%.8f\t%s\t0\n"
            else:
                line_format = "%.8f\t%s\n"
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write(line_format % (ngram.prob, ' '.join(ng)))

        out_file.write('\n\\end\\\n')