Beispiel #1
0
def main(args):
    """Build an n-gram model from a corpus and write it out in ARPA-style text.

    Recognised options in ``args`` (each one takes the next element as its
    value):

    * ``-c``  -- corpus location handed to ``Reader`` (default ``'.'``)
    * ``-lm`` -- path of the output file (default ``'lm.txt'``)
    * ``-o``  -- maximum n-gram order, must be positive (default ``3``)
    * ``-gt`` -- comma-separated integers handed to ``GoodTuring`` as
      ``gtNmin``; must contain exactly one value per order, otherwise it is
      replaced by all ones (default ``1,1,1``)

    Exits with status 1 when an option is missing its value or when the
    maximum order is not positive.  A non-integer ``-o``/``-gt`` value still
    raises ``ValueError``.
    """
    import sys

    corpus = '.'
    outFile = 'lm.txt'
    max_order = 3
    gtNmin = [1, 1, 1]

    for i, arg in enumerate(args):
        if arg not in ('-c', '-lm', '-o', '-gt'):
            continue
        if i + 1 >= len(args):
            # A trailing option has no value; report it instead of letting
            # the lookup below fail with an IndexError.
            print("Missing value for option %s" % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        else:  # '-gt'
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # Zero is rejected as well, so the requirement is "positive".
        print("Max order must be positive")
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Parameters:')
    print('corpus: %s' % (corpus,))
    print('outFile: %s' % (outFile,))
    print('max_order: %s' % (max_order,))
    print('gtNmin: %s' % (gtNmin,))

    reader = Reader(corpus)
    ng_maker = NgramMaker(max_order)

    for document in reader:
        ng_maker.parse(Normalizer.normalize(document))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order = ng_storage.max_order()
    orders = range(1, top_order + 1)

    # The context manager closes the file even if a write fails part-way.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" %
                           (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Every order below the highest gets an extra trailing "0" column.
            if ng_ord < top_order:
                line_format = "%.8f\t%s\t0\n"
            else:
                line_format = "%.8f\t%s\n"
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write(line_format % (ngram.prob, ' '.join(ng)))

        out_file.write('\n\\end\\\n')
Beispiel #2
0
def main(args):
    """Build an n-gram model from a corpus and write it out in ARPA-style text.

    Options read from ``args`` (the element after each flag is its value):

    * ``-c``  -- corpus location passed to ``Reader`` (default ``'.'``)
    * ``-lm`` -- output file path (default ``'lm.txt'``)
    * ``-o``  -- maximum n-gram order; must be positive (default ``3``)
    * ``-gt`` -- comma-separated integers passed to ``GoodTuring`` as
      ``gtNmin``; if the number of values differs from the maximum order
      the list is reset to all ones (default ``1,1,1``)

    Terminates with exit status 1 if a flag has no value after it or if the
    maximum order is not positive.  Non-integer ``-o``/``-gt`` values still
    raise ``ValueError``.
    """
    import sys

    corpus    = '.'
    outFile   = 'lm.txt'
    max_order = 3
    gtNmin    = [1, 1, 1]

    value_flags = ('-c', '-lm', '-o', '-gt')
    for i, arg in enumerate(args):
        if arg not in value_flags:
            continue
        if i + 1 >= len(args):
            # The flag is the last argument, so there is nothing to read as
            # its value; fail with a message rather than an IndexError.
            print("Missing value for option %s" % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        else:  # '-gt'
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # The check excludes zero too, hence "positive".
        print("Max order must be positive")
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    # One-argument print calls produce identical output on Python 2 and 3.
    print('Parameters:')
    print('corpus: %s' % (corpus,))
    print('outFile: %s' % (outFile,))
    print('max_order: %s' % (max_order,))
    print('gtNmin: %s' % (gtNmin,))

    reader   = Reader(corpus)
    ng_maker = NgramMaker(max_order)

    for document in reader:
        ng_maker.parse(Normalizer.normalize(document))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order  = ng_storage.max_order()
    orders     = range(1, top_order + 1)

    # "with" guarantees the file is closed even when a write raises.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" % (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Orders below the highest one carry an extra trailing "0" column.
            if ng_ord < top_order:
                line_format = "%.8f\t%s\t0\n"
            else:
                line_format = "%.8f\t%s\n"
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write(line_format % (ngram.prob, ' '.join(ng)))

        out_file.write('\n\\end\\\n')
Beispiel #3
0
    def test_ngrams(self):
        """Parse ``self.text`` with an order-3 ``NgramMaker`` and check unigrams.

        Verifies the counts recorded for the unigrams ``in`` and ``the`` and
        the total number of unigram entries held by the maker's storage.
        """
        ngram_maker = NgramMaker(3)
        ngram_maker.parse(self.text)

        expected_counts = (
            (('in',), 3),
            (('the',), 2),
        )
        for unigram, expected in expected_counts:
            self.assertEqual(expected, ngram_maker.at(unigram).count)

        # The exact number of unigrams: 38 words plus the three extra
        # tokens <s>, </s> and <punc>.
        unigrams = ngram_maker.storage().get_n_grams(1)
        self.assertEqual(38 + 3, len(unigrams))