Exemple #1
0
def demo2():
   from nltk_lite import tag
   from nltk_lite.corpora import treebank
   import tnt

   d = list(treebank.tagged())

   
   t = tnt.Tnt(N=1000, C=False)
   s = tnt.Tnt(N=1000, C=True)
   t.train(d[(11)*100:])
   s.train(d[(11)*100:])
   
   for i in range(10):
      tacc = tag.accuracy(t, d[i*100:((i+1)*100)])
      tp_un = float(t.unknown) / float(t.known +t.unknown)
      tp_kn = float(t.known) / float(t.known + t.unknown)
      t.unknown = 0
      t.known = 0

      print 'Capitalisation off:'
      print 'Accuracy:', tacc
      print 'Percentage known:', tp_kn
      print 'Percentage unknown:', tp_un
      print 'Accuracy over known words:', (tacc / tp_kn)
      
      sacc = tag.accuracy(s, d[i*100:((i+1)*100)])
      sp_un = float(s.unknown) / float(s.known +s.unknown)
      sp_kn = float(s.known) / float(s.known + s.unknown)
      s.unknown = 0
      s.known = 0

      print 'Capitalisation on:'
      print 'Accuracy:', sacc
      print 'Percentage known:', sp_kn
      print 'Percentage unknown:', sp_un
      print 'Accuracy over known words:', (sacc / sp_kn)   
Exemple #2
0
def demo2():
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    import tnt

    d = list(treebank.tagged())

    t = tnt.Tnt(N=1000, C=False)
    s = tnt.Tnt(N=1000, C=True)
    t.train(d[(11) * 100:])
    s.train(d[(11) * 100:])

    for i in range(10):
        tacc = tag.accuracy(t, d[i * 100:((i + 1) * 100)])
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        t.unknown = 0
        t.known = 0

        print 'Capitalisation off:'
        print 'Accuracy:', tacc
        print 'Percentage known:', tp_kn
        print 'Percentage unknown:', tp_un
        print 'Accuracy over known words:', (tacc / tp_kn)

        sacc = tag.accuracy(s, d[i * 100:((i + 1) * 100)])
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        s.unknown = 0
        s.known = 0

        print 'Capitalisation on:'
        print 'Accuracy:', sacc
        print 'Percentage known:', sp_kn
        print 'Percentage unknown:', sp_un
        print 'Accuracy over known words:', (sacc / sp_kn)
Exemple #3
0
def demo3():
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    from nltk_lite.corpora import brown
    import tnt

    d = list(treebank.tagged())
    e = list(brown.tagged())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):

        t = tnt.Tnt(N=1000, C=False)
        s = tnt.Tnt(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / tp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc

    print "brown: acc over words known:", 10 * tknacc
    print "     : overall accuracy:", 10 * tallacc
    print "     : words known:", 10 * tknown
    print "treebank: acc over words known:", 10 * sknacc
    print "        : overall accuracy:", 10 * sallacc
    print "        : words known:", 10 * sknown
def demo(numSents=100, max_rules=200, min_score=2, ruleFile="dump.rules",
         errorOutput = "errors.out", ruleOutput="rules.out",
         randomize=False, train=.8, trace=3):

    from nltk_lite.corpora import treebank
    from nltk_lite import tag
    from nltk_lite.tag import brill

    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print "Loading tagged data..."
    sents = list(treebank.tagged())
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    tagged_data = [t for s in sents[:numSents] for t in s]
    cutoff = int(len(tagged_data)*train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    testing_data = [t[0] for t in gold_data]

    # Unigram tagger

    print "Training unigram tagger:",
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger

    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)),
        ]

    #trainer = brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    printRules = file(ruleOutput, 'w')
    for rule in b.rules():
        print(str(rule))
        printRules.write(str(rule)+"\n\n")
    #b.saveRules(ruleFile)

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    errorFile = file(errorOutput, 'w')

    for e in el:
        errorFile.write(e+"\n\n")
    errorFile.close()
    print "Done; rules and errors saved to %s and %s." % (ruleOutput, errorOutput)
def _demo_tagger(tagger, gold):
    from nltk_lite.tag import accuracy
    acc = accuracy(tagger, gold)
    print 'Accuracy = %4.1f%%' % (100.0 * acc)
Exemple #6
0
def demo3():
   from nltk_lite import tag
   from nltk_lite.corpora import treebank
   from nltk_lite.corpora import brown
   import tnt

   d = list(treebank.tagged())
   e = list(brown.tagged())

   d = d[:1000]
   e = e[:1000]

   d10 = int(len(d)*0.1)
   e10 = int(len(e)*0.1)

   tknacc = 0
   sknacc = 0
   tallacc = 0
   sallacc = 0
   tknown = 0
   sknown = 0

   for i in range(10):

      t = tnt.Tnt(N=1000, C=False)
      s = tnt.Tnt(N=1000, C=False)

      dtest = d[(i*d10):((i+1)*d10)]
      etest = e[(i*e10):((i+1)*e10)]
      
      dtrain = d[:(i*d10)] + d[((i+1)*d10):]
      etrain = e[:(i*e10)] + e[((i+1)*e10):]     

      t.train(dtrain)
      s.train(etrain)
   
      tacc = tag.accuracy(t, dtest)
      tp_un = float(t.unknown) / float(t.known +t.unknown)
      tp_kn = float(t.known) / float(t.known + t.unknown)
      tknown += tp_kn
      t.unknown = 0
      t.known = 0
      
      sacc = tag.accuracy(s, etest)
      sp_un = float(s.unknown) / float(s.known + s.unknown)
      sp_kn = float(s.known) / float(s.known + s.unknown)
      sknown += sp_kn
      s.unknown = 0
      s.known = 0 

      tknacc += (tacc / tp_kn)
      sknacc += (sacc / tp_kn)
      tallacc += tacc
      sallacc += sacc

      #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc
      

   print "brown: acc over words known:", 10*tknacc
   print "     : overall accuracy:", 10*tallacc
   print "     : words known:", 10*tknown
   print "treebank: acc over words known:", 10*sknacc
   print "        : overall accuracy:", 10*sallacc
   print "        : words known:", 10*sknown