def nltkdemo18plus(): """ Return 18 templates, from the original nltk demo, and additionally a few multi-feature ones (the motivation is easy comparison with nltkdemo18) """ return nltkdemo18() + [ Template(Word([-1]), Pos([1])), Template(Pos([-1]), Word([1])), Template(Word([-1]), Word([0]), Pos([1])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-1]), Word([0]), Pos([1])), ]
def brill_rules_pos_wd_feats_offset_4(): """ Return 24 templates of the seminal TBL paper, Brill (1995) """ return [ Template(Word([-1])), Template(Word([-2])), Template(Word([-3])), Template(Word([-4])), Template(Word([0])), Template(Word([1])), Template(Word([2])), Template(Word([3])), Template(Word([4])), ]
def brill_rules_pos_bigram_feats_offset_4(): """ Return 24 templates of the seminal TBL paper, Brill (1995) """ return [ Template(Word([-1, 0])), Template(Word([-2, -1])), Template(Word([-3, -2])), Template(Word([-4, -3])), Template(Word([1, 0])), Template(Word([2, 1])), Template(Word([3, 2])), Template(Word([4, 3])) ]
def demo_multiposition_feature(): """ The feature/s of a template takes a list of positions relative to the current word where the feature should be looked for, conceptually joined by logical OR. For instance, Pos([-1, 1]), given a value V, will hold whenever V is found one step to the left and/or one step to the right. For contiguous ranges, a 2-arg form giving inclusive end points can also be used: Pos(-3, -1) is the same as the arg below. """ postag(templates=[Template(Pos([-3,-2,-1]))])
def demo_generated_templates(): """ Template.expand and Feature.expand are class methods facilitating generating large amounts of templates. See their documentation for details. Note: training with 500 templates can easily fill all available even on relatively small corpora """ wordtpls = Word.expand([-1,0,1], [1,2], excludezero=False) tagtpls = Pos.expand([-2,-1,0,1], [1,2], excludezero=True) templates = list(Template.expand([wordtpls, tagtpls], combinations=(1,3))) print("Generated {0} templates for transformation-based learning".format(len(templates))) postag(templates=templates, incremental_stats=True, template_stats=True)
def fntbl37(): """ Return 37 templates taken from the postagging task of the fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/ (37 is after excluding a handful which do not condition on Pos[0]; fntbl can do that but the current nltk implementation cannot.) """ return [ Template(Word([0]), Word([1]), Word([2])), Template(Word([-1]), Word([0]), Word([1])), Template(Word([0]), Word([-1])), Template(Word([0]), Word([1])), Template(Word([0]), Word([2])), Template(Word([0]), Word([-2])), Template(Word([1, 2])), Template(Word([-2, -1])), Template(Word([1, 2, 3])), Template(Word([-3, -2, -1])), Template(Word([0]), Pos([2])), Template(Word([0]), Pos([-2])), Template(Word([0]), Pos([1])), Template(Word([0]), Pos([-1])), Template(Word([0])), Template(Word([-2])), Template(Word([2])), Template(Word([1])), Template(Word([-1])), Template(Pos([-1]), Pos([1])), Template(Pos([1]), Pos([2])), Template(Pos([-1]), Pos([-2])), Template(Pos([1])), Template(Pos([-1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([1, 2, 3])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([-2, -1])), Template(Pos([1]), Word([0]), Word([1])), Template(Pos([1]), Word([0]), Word([-1])), Template(Pos([-1]), Word([-1]), Word([0])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Pos([1]), Pos([2]), Word([1])) ]
def nltkdemo18(): """ Return 18 templates, from the original nltk demo, in multi-feature syntax """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-3, -2, -1])), Template(Word([1, 2, 3])), Template(Word([-1]), Word([1])), ]
def brill24(): """ Return 24 templates of the seminal TBL paper, Brill (1995) """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-1, 0])), Template(Word([0, 1])), Template(Word([0])), Template(Word([-1]), Pos([-1])), Template(Word([1]), Pos([1])), Template(Word([0]), Word([-1]), Pos([-1])), Template(Word([0]), Word([1]), Pos([1])), ]
def demo_multifeature_template(): """ Templates can have more than a single feature. """ postag(templates=[Template(Word([0]), Pos([-2,-1]))])
print "Unigram accuracy: " print unigram_tagger.evaluate(evaulation_data) # Bigram tagger bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger) print "Bigram accuracy: " print bigram_tagger.evaluate(evaulation_data) # Trigram tagger trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger) print "Trigram accuracy: " print trigram_tagger.evaluate(evaulation_data) # Brill tagger templates templates = [ Template(brill.Pos([1, 1])), Template(brill.Pos([2, 2])), Template(brill.Pos([1, 2])), Template(brill.Pos([1, 3])), Template(brill.Word([1, 1])), Template(brill.Word([2, 2])), Template(brill.Word([1, 2])), Template(brill.Word([1, 3])), Template(brill.Pos([-1, -1]), brill.Pos([1, 1])), Template(brill.Word([-1, -1]), brill.Word([1, 1])), ] # First iteration trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates) brill_tagger = trainer.train(training_data, max_rules, min_score) print "Initial Brill accuracy:"