コード例 #1
0
ファイル: LM.py プロジェクト: chromebookbob/python2012
def main(args):
    """Build an n-gram language model from a corpus and write it to a file.

    Options recognised in ``args`` (each one is followed by its value):

    * ``-c``  -- corpus location handed to ``Reader`` (default ``'.'``)
    * ``-lm`` -- name of the output file (default ``'lm.txt'``)
    * ``-o``  -- maximum n-gram order, must be positive (default ``3``)
    * ``-gt`` -- comma-separated integers handed to ``GoodTuring`` as
      ``gtNmin``; replaced by a list of ones when its length differs from
      the maximum order

    Every item produced by ``Reader`` is normalised and fed to an
    ``NgramMaker``; the resulting storage goes through ``GoodTuring`` and
    is written out in an ARPA-style text layout: a data header, one count
    line per order, one n-gram section per order and an end marker.

    Exits with status 1 when an option is missing its value or when the
    maximum order is not positive. A non-integer ``-o``/``-gt`` value still
    raises ``ValueError``.
    """
    # Imported locally so the function does not rely on the module header.
    import sys

    corpus = '.'
    outFile = 'lm.txt'
    max_order = 3
    gtNmin = [1, 1, 1]

    for i, arg in enumerate(args):
        if arg not in ('-c', '-lm', '-o', '-gt'):
            continue
        if i + 1 >= len(args):
            # A trailing option used to die with a bare IndexError.
            print("Missing value for option %s" % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        elif arg == '-gt':
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # Zero is rejected as well, so the requirement is "positive".
        print("Max order must be positive")
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Parameters:')
    print('corpus: %s' % corpus)
    print('outFile: %s' % outFile)
    print('max_order: %s' % max_order)
    print('gtNmin: %s' % (gtNmin,))

    reader = Reader(corpus)
    ng_maker = NgramMaker(max_order)

    for entry in reader:
        ng_maker.parse(Normalizer.normalize(entry))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order = ng_storage.max_order()
    orders = range(1, top_order + 1)

    # The context manager closes the file even when a write fails.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" %
                           (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Every order below the highest gets an extra "0" column
            # (presumably a placeholder back-off weight -- confirm).
            line_end = '\t0\n' if ng_ord < top_order else '\n'
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write("%.8f\t%s%s" %
                               (ngram.prob, ' '.join(ng), line_end))

        out_file.write('\n\\end\\\n')
コード例 #2
0
ファイル: LM.py プロジェクト: chromebookbob/python2012
def main(args):
    """Build an n-gram language model from a corpus and write it to a file.

    Options recognised in ``args`` (each one is followed by its value):

    * ``-c``  -- corpus location handed to ``Reader`` (default ``'.'``)
    * ``-lm`` -- name of the output file (default ``'lm.txt'``)
    * ``-o``  -- maximum n-gram order, must be positive (default ``3``)
    * ``-gt`` -- comma-separated integers handed to ``GoodTuring`` as
      ``gtNmin``; replaced by a list of ones when its length differs from
      the maximum order

    Every item produced by ``Reader`` is normalised and fed to an
    ``NgramMaker``; the resulting storage goes through ``GoodTuring`` and
    is written out in an ARPA-style text layout: a data header, one count
    line per order, one n-gram section per order and an end marker.

    Exits with status 1 when an option is missing its value or when the
    maximum order is not positive. A non-integer ``-o``/``-gt`` value still
    raises ``ValueError``.
    """
    # Imported locally so the function does not rely on the module header.
    import sys

    corpus = '.'
    outFile = 'lm.txt'
    max_order = 3
    gtNmin = [1, 1, 1]

    for i, arg in enumerate(args):
        if arg not in ('-c', '-lm', '-o', '-gt'):
            continue
        if i + 1 >= len(args):
            # A trailing option used to die with a bare IndexError.
            print("Missing value for option %s" % arg)
            sys.exit(1)
        value = args[i + 1]
        if arg == '-c':
            corpus = value
        elif arg == '-lm':
            outFile = value
        elif arg == '-o':
            max_order = int(value)
        elif arg == '-gt':
            gtNmin = [int(n) for n in value.split(',')]

    if max_order <= 0:
        # Zero is rejected as well, so the requirement is "positive".
        print("Max order must be positive")
        sys.exit(1)

    if len(gtNmin) != max_order:
        print('Using default gtNmin parameter, cause it does not fit the provided max_order')
        gtNmin = [1] * max_order

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Parameters:')
    print('corpus: %s' % corpus)
    print('outFile: %s' % outFile)
    print('max_order: %s' % max_order)
    print('gtNmin: %s' % (gtNmin,))

    reader = Reader(corpus)
    ng_maker = NgramMaker(max_order)

    for entry in reader:
        ng_maker.parse(Normalizer.normalize(entry))

    gt = GoodTuring(ng_maker.storage(), gtNmin)
    ng_storage = gt.storage()
    top_order = ng_storage.max_order()
    orders = range(1, top_order + 1)

    # The context manager closes the file even when a write fails.
    with open(outFile, 'w') as out_file:
        out_file.write('\\data\\\n\n')
        for ng_ord in orders:
            out_file.write("ngram %d=%d\n" %
                           (ng_ord, ng_storage.distinct_n_grams(ng_ord)))

        for ng_ord in orders:
            out_file.write('\n\\%d-grams:\n' % ng_ord)
            # Every order below the highest gets an extra "0" column
            # (presumably a placeholder back-off weight -- confirm).
            line_end = '\t0\n' if ng_ord < top_order else '\n'
            for ng in sorted(ng_storage.get_n_grams(ng_ord)):
                ngram = ng_storage.get_n_gram(ng)
                out_file.write("%.8f\t%s%s" %
                               (ngram.prob, ' '.join(ng), line_end))

        out_file.write('\n\\end\\\n')
コード例 #3
0
 def test_norm1(self):
     """Check the text ``Normalizer.normalize`` returns for the hobbit fixture."""
     import os  # local import keeps this method self-contained

     # os.path.join gives the same backslash-separated path on Windows and
     # also resolves on platforms whose separator is "/".
     fixture = os.path.join("test", "data", "hobbit.txt")
     expected = '<s> in a hole in the ground there lived a hobbit </s> <s> not a nasty <punc> dirty <punc> wet hole <punc> filled with the ends of worms and an oozy smell <punc> nor yet a dry <punc> bare <punc> sandy hole with nothing in it to sit down on or to eat <punc> it was a hobbit <punc> hole <punc> and that means comfort </s>'
     text = Normalizer.normalize(fixture)
     self.assertEqual(expected, text)
コード例 #4
0
ファイル: test_text.py プロジェクト: A-Kulikov/python2012
 def test_norm1(self):
     """Check the text ``Normalizer.normalize`` returns for the hobbit fixture."""
     import os  # local import keeps this method self-contained

     # os.path.join gives the same backslash-separated path on Windows and
     # also resolves on platforms whose separator is "/".
     fixture = os.path.join("test", "data", "hobbit.txt")
     expected = '<s> in a hole in the ground there lived a hobbit </s> <s> not a nasty <punc> dirty <punc> wet hole <punc> filled with the ends of worms and an oozy smell <punc> nor yet a dry <punc> bare <punc> sandy hole with nothing in it to sit down on or to eat <punc> it was a hobbit <punc> hole <punc> and that means comfort </s>'
     text = Normalizer.normalize(fixture)
     self.assertEqual(expected, text)
コード例 #5
0
 def setUp(self):
     """Store the normalised hobbit fixture text on ``self.text``."""
     import os  # local import keeps this method self-contained

     # os.path.join gives the same backslash-separated path on Windows and
     # also resolves on platforms whose separator is "/".
     fixture = os.path.join("test", "data", "hobbit.txt")
     self.text = Normalizer.normalize(fixture)