def compare_templates(self):
    """Run ``self.train`` once per built-in Brill template set.

    The template sets are tried in this order: ``nltkdemo18``,
    ``nltkdemo18plus``, ``brill24``, ``fntbl37``.  A numbered banner
    (starting at 0) is printed before each run so that whatever ``train``
    prints for one template set can be told apart from the next.
    """
    # All four template sets are built up front, before any training starts.
    template_sets = [
        brill.nltkdemo18(),
        brill.nltkdemo18plus(),
        brill.brill24(),
        brill.fntbl37(),
    ]
    for i, t in enumerate(template_sets):
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print("\nTEMPLATE {}==================\n".format(i))
        self.train(templates=t)
def test_brill_tagger(self):
    """Check that a trained Brill tagger survives an encode/decode round trip.

    A tagger is trained deterministically (at most 30 rules) on
    ``self.corpus``, pushed through ``self.encoder`` and ``self.decoder``,
    and the result is compared with the original on three points: the
    ``repr`` of the initial tagger, the learned rules, and the training
    statistics.
    """
    brill_trainer = BrillTaggerTrainer(
        self.default_tagger, nltkdemo18(), deterministic=True
    )
    original = brill_trainer.train(self.corpus, max_rules=30)

    # Round trip: encode the trained tagger, then decode it straight back.
    restored = self.decoder.decode(self.encoder.encode(original))

    # The initial taggers are compared through repr() rather than directly.
    self.assertEqual(
        repr(original._initial_tagger), repr(restored._initial_tagger)
    )
    self.assertEqual(original._rules, restored._rules)
    self.assertEqual(original._training_stats, restored._training_stats)
def train(self, templates=None, verbose=True):
    """Train a new Brill tagger on top of an n-gram backoff chain.

    ``self.tagged_data_list`` is shuffled in place and split into a
    training slice ``[:cutoff]`` and a test slice ``[cutoff:self.dev_size]``,
    where ``cutoff = int(self.dev_size * self.train_size)``.  A
    regexp -> unigram -> bigram -> trigram backoff chain is built from the
    training slice and used as the initial tagger of a
    ``BrillTaggerTrainer``.  After a first training pass, the data is
    reshuffled and the Brill rules are retrained ``self.num_groups`` more
    times with the same trainer.

    Args:
        templates: Brill rule templates handed to ``BrillTaggerTrainer``.
            ``None`` selects ``brill.nltkdemo18()``.
        verbose: When truthy, print the accuracy of every tagger on the
            current test slice.  Evaluation is skipped entirely otherwise.

    Returns:
        The Brill tagger produced by the last training pass (the initial
        pass when ``self.num_groups`` is 0).
    """
    if templates is None:
        templates = brill.nltkdemo18()

    def shuffled_split():
        # Shuffle the corpus in place and return (training, test) slices.
        # The seed is always the corpus length, so every run is
        # reproducible; calling this again applies the same permutation to
        # the already-shuffled list, which still yields a different order.
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)
        return (self.tagged_data_list[:cutoff],
                self.tagged_data_list[cutoff:self.dev_size])

    def report(heading, tagger, gold):
        # Print one accuracy report; evaluate only when output is wanted.
        if verbose:
            # Single-argument print(...) works on both Python 2 and 3.
            print("{}:\n{}\n".format(heading, tagger.evaluate(gold)))

    training_data, test_data = shuffled_split()

    # Very simple regular expression tagger: numerals get one tag,
    # everything else falls through to 'N'.  The decimal point is escaped
    # so that tokens such as "1a2" no longer count as numbers.
    # NOTE(review): numerals are tagged 'PUNCT' here -- confirm that this is
    # what the project's tagset intends.
    regex_tagger = RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'PUNCT'),
        (r'.*', 'N'),
    ])
    report("Regular expression tagger accuracy", regex_tagger, test_data)

    # Each n-gram tagger backs off to the next simpler one.
    unigram_tagger = UnigramTagger(train=training_data,
                                   backoff=regex_tagger)
    report("Unigram tagger accuracy", unigram_tagger, test_data)

    bigram_tagger = BigramTagger(train=training_data,
                                 backoff=unigram_tagger)
    report("Bigram tagger accuracy", bigram_tagger, test_data)

    trigram_tagger = TrigramTagger(train=training_data,
                                   backoff=bigram_tagger)
    report("Trigram tagger accuracy", trigram_tagger, test_data)

    # First iteration.
    trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                 templates=templates)
    brill_tagger = trainer.train(train_sents=training_data,
                                 max_rules=self.max_rules,
                                 min_score=self.min_score)
    report("Initial Brill tagger accuracy", brill_tagger, test_data)

    # Folding: reshuffle, resplit and retrain the Brill rules.
    # NOTE(review): the trainer keeps the trigram tagger fitted on the first
    # split, so later test slices may contain sentences that tagger was
    # trained on -- confirm this is acceptable for the reported accuracy.
    for i in range(self.num_groups):
        training_data, test_data = shuffled_split()
        # trainer.train returns a fresh BrillTagger object on every call.
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Brill tagger accuracy, fold {}".format(i + 1),
               brill_tagger, test_data)

    return brill_tagger
def train(self, templates=None, verbose=True):
    """Train a new Brill tagger on top of an n-gram backoff chain.

    ``self.tagged_data_list`` is shuffled in place and split into a
    training slice ``[:cutoff]`` and a test slice ``[cutoff:self.dev_size]``,
    where ``cutoff = int(self.dev_size * self.train_size)``.  A
    regexp -> unigram -> bigram -> trigram backoff chain is built from the
    training slice and used as the initial tagger of a
    ``BrillTaggerTrainer``.  After a first training pass, the data is
    reshuffled and the Brill rules are retrained ``self.num_groups`` more
    times with the same trainer.

    Args:
        templates: Brill rule templates handed to ``BrillTaggerTrainer``.
            ``None`` selects ``brill.nltkdemo18()``.
        verbose: When truthy, print the accuracy of every tagger on the
            current test slice.  Evaluation is skipped entirely otherwise.

    Returns:
        The Brill tagger produced by the last training pass (the initial
        pass when ``self.num_groups`` is 0).
    """
    if templates is None:
        templates = brill.nltkdemo18()

    def shuffled_split():
        # Shuffle the corpus in place and return (training, test) slices.
        # The seed is always the corpus length, so every run is
        # reproducible; calling this again applies the same permutation to
        # the already-shuffled list, which still yields a different order.
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)
        return (self.tagged_data_list[:cutoff],
                self.tagged_data_list[cutoff:self.dev_size])

    def report(heading, tagger, gold):
        # Print one accuracy report; evaluate only when output is wanted.
        if verbose:
            # Single-argument print(...) works on both Python 2 and 3.
            print("{}:\n{}\n".format(heading, tagger.evaluate(gold)))

    training_data, test_data = shuffled_split()

    # Very simple regular expression tagger: numerals get one tag,
    # everything else falls through to 'N'.  The decimal point is escaped
    # so that tokens such as "1a2" no longer count as numbers.
    # NOTE(review): numerals are tagged 'PUNCT' here -- confirm that this is
    # what the project's tagset intends.
    regex_tagger = RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'PUNCT'),
        (r'.*', 'N'),
    ])
    report("Regular expression tagger accuracy", regex_tagger, test_data)

    # Each n-gram tagger backs off to the next simpler one.
    unigram_tagger = UnigramTagger(train=training_data,
                                   backoff=regex_tagger)
    report("Unigram tagger accuracy", unigram_tagger, test_data)

    bigram_tagger = BigramTagger(train=training_data,
                                 backoff=unigram_tagger)
    report("Bigram tagger accuracy", bigram_tagger, test_data)

    trigram_tagger = TrigramTagger(train=training_data,
                                   backoff=bigram_tagger)
    report("Trigram tagger accuracy", trigram_tagger, test_data)

    # First iteration.
    trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                 templates=templates)
    brill_tagger = trainer.train(train_sents=training_data,
                                 max_rules=self.max_rules,
                                 min_score=self.min_score)
    report("Initial Brill tagger accuracy", brill_tagger, test_data)

    # Folding: reshuffle, resplit and retrain the Brill rules.
    # NOTE(review): the trainer keeps the trigram tagger fitted on the first
    # split, so later test slices may contain sentences that tagger was
    # trained on -- confirm this is acceptable for the reported accuracy.
    for i in range(self.num_groups):
        training_data, test_data = shuffled_split()
        # trainer.train returns a fresh BrillTagger object on every call.
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Brill tagger accuracy, fold {}".format(i + 1),
               brill_tagger, test_data)

    return brill_tagger
def compare_templates(self):
    """Run ``self.train`` once per built-in Brill template set.

    The template sets are tried in this order: ``nltkdemo18``,
    ``nltkdemo18plus``, ``brill24``, ``fntbl37``.  A numbered banner
    (starting at 0) is printed before each run so that whatever ``train``
    prints for one template set can be told apart from the next.
    """
    # All four template sets are built up front, before any training starts.
    template_sets = [
        brill.nltkdemo18(),
        brill.nltkdemo18plus(),
        brill.brill24(),
        brill.fntbl37(),
    ]
    for i, t in enumerate(template_sets):
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print("\nTEMPLATE {}==================\n".format(i))
        self.train(templates=t)