def compare_templates(self):
    """Train this tagger once per built-in NLTK Brill template set,
    printing a banner before each run so the results can be compared.

    Template sets tried, in order: nltkdemo18, nltkdemo18plus,
    brill24, fntbl37.
    """
    template_sets = [
        brill.nltkdemo18(),
        brill.nltkdemo18plus(),
        brill.brill24(),
        brill.fntbl37(),
    ]
    for i, templates in enumerate(template_sets):
        # print() replaces the original Python 2 print statement,
        # which is a syntax error under Python 3 and inconsistent
        # with the print(...) calls used elsewhere in this file.
        print("\nTEMPLATE {}==================\n".format(i))
        self.train(templates=templates)
def __init__(self):
    """Build and train a Brill tagger for this instance.

    The seed tagger comes from get_initial_tagger(); the rule
    templates are NLTK's fntbl37 set; training stops after at most
    20 rules have been learned.
    """
    base_tagger = get_initial_tagger()
    templates = brill.fntbl37()
    self.trainer = BrillTaggerTrainer(
        base_tagger, templates, deterministic=True, trace=0
    )
    train_sents, test_sents = utils.training_testing_dataset()
    self.tagger = self.trainer.train(train_sents, max_rules=20)
    print('Brill tagger training completed')
def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger on top of *initial_tagger*.

    :param initial_tagger: backoff tagger used to seed the Brill trainer
    :param train_sents: tagged sentences used for training
    :param end: kept for backward compatibility with existing callers;
        no longer used (the original assigned ``bounds = [(1, end)]``
        and never read it)
    :param trace: verbosity level passed to BrillTaggerTrainer
    :param kwargs: forwarded to ``BrillTaggerTrainer.train``
        (e.g. ``max_rules``)
    :return: the trained Brill tagger
    """
    # fntbl37(): "Return 37 templates taken from the postagging task of
    # the fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/"
    templates = brill.fntbl37()
    trainer = BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=trace
    )
    return trainer.train(train_sents, **kwargs)
def train_evaluate_brills(train_data, test_data):
    """Train a Brill tagger and return its accuracy on *test_data*.

    :param train_data: tagged sentences used for training
    :param test_data: tagged sentences used for evaluation
    :return: accuracy of the trained tagger on *test_data*
    """
    # Define templates for rules, provided by nltk.
    brill.Template._cleartemplates()
    templates = brill.fntbl37()
    # Initial tagger: tags each word with its most common tag.
    initial_tagger = UnigramTagger(train_data)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,
        templates=templates,
        trace=3,
        deterministic=True)
    tagger = trainer.train(train_data, max_rules=100)  # learn at most 100 rules
    # Evaluate exactly once; the original ran the (expensive)
    # evaluation twice — once for the print and once for the return.
    accuracy = tagger.evaluate(test_data)
    print("Accuracy:", accuracy)
    return accuracy
def makeBrillTagger():
    """Train a Brill tagger over a backoff chain on the Brown corpus,
    pickle it to data/braubt_tagger.dat, and return it.
    """
    import nltk.tag
    from nltk.tag import brill

    train_sents = nltk.corpus.brown.tagged_sents()

    # Regex fallbacks for otherwise-unknown words.  The digit pattern
    # now escapes the decimal point — the original r'(.[0-9]+)?' matched
    # ANY character before the fraction digits.  The duplicated
    # (r'.*ic$', 'JJ') entry was removed; the second copy was shadowed
    # by the first and never matched.
    word_patterns = [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'.*ould$', 'MD'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        (r'^a$', 'PREP'),
    ]

    # Backoff chain: affix -> unigram -> bigram -> trigram, falling
    # back to the regex tagger above.
    raubt_tagger = backoff_tagger(
        train_sents,
        [nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
         nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
        backoff=nltk.tag.RegexpTagger(word_patterns))

    templates = brill.fntbl37()
    trainer = nltk.BrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)

    # pickle requires a binary handle: the original opened the file in
    # text mode ("w"), which fails under Python 3, and never closed it.
    with open("data/braubt_tagger.dat", "wb") as fh:
        pickle.dump(braubt_tagger, fh)
    return braubt_tagger
def contextual_rules(wikicorpus_dir, context_file):
    """Learn Brill contextual rules from a Wikicorpus dump and write
    them to *context_file*, one rule per line.
    """
    sentences = wikicorpus(wikicorpus_dir, words=1000000)
    ANONYMOUS = "anonymous"
    # Collapse every proper noun into one anonymous placeholder
    # (NP = proper noun in the Parole tagset).
    for sentence in sentences:
        for idx, (word, pos) in enumerate(sentence):
            if pos == "NP":
                sentence[idx] = (ANONYMOUS, "NP")
    templates = fntbl37()
    base_tagger = UnigramTagger(sentences)
    trainer = BrillTaggerTrainer(base_tagger, templates, trace=0)
    trained = trainer.train(sentences, max_rules=100)
    # print trained.evaluate(wikicorpus(10000, start=1))
    with open(context_file, "w") as f:
        for rule in trained.rules():
            f.write("%s\n" % rule)
def customize_tagger(train_sets, test_sets=None, tagger_name='Brill_Tagger',
                     return_tagger=False):
    """Train a customized Brill tagger and persist it.

    :param train_sets: tagged sentences used for training
    :param test_sets: optional tagged sentences used for evaluation
    :param tagger_name: name under which the trained model is persisted
    :param return_tagger: when True, also return the trained tagger
    :return: the evaluation score (-1 when *test_sets* is None), or the
        tuple ``(score, tagger)`` when *return_tagger* is True
    """
    # Backoff chain: bigram -> unigram -> default 'NN'.
    tagger = nltk.DefaultTagger('NN')
    tagger = nltk.UnigramTagger(train_sets, backoff=tagger)
    tagger = nltk.BigramTagger(train_sets, backoff=tagger)
    templates = brill.fntbl37()
    brill_tagger = brill_trainer.BrillTaggerTrainer(tagger, templates, trace=3)
    brill_tagger = brill_tagger.train(train_sets, max_rules=300)
    TaggerUtil.persistenize_tagger_model(brill_tagger, tagger_name)
    # -1 is the "not evaluated" sentinel when no test set is supplied.
    score = -1
    if test_sets:
        score = brill_tagger.evaluate(test_sets)
    if return_tagger:
        return score, brill_tagger
    return score
# Script fragment: builds a unigram->bigram->trigram backoff chain,
# trains a Brill tagger on top of it, then re-splits the data for the
# next cross-validation round.  Relies on names defined elsewhere in
# the file: training_data, evaulation_data (sic), nn_cd_tagger,
# tagged_data_list, development_size, train, max_rules, min_score —
# TODO confirm against the full file.
unigram_tagger = tag.UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))
# Bigram tagger
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))
# Trigram tagger
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))
# Brill tagger templates
templates = brill.fntbl37()
# First iteration
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))
# 10 Folding
# NOTE(review): comment says "10 Folding" but range(1,5) runs only
# 4 iterations — confirm which is intended.
for i in range(1,5):
    # Random splitting
    # NOTE(review): seeding with len(tagged_data_list) produces the
    # same shuffle on every run, and the second argument to
    # random.shuffle is deprecated (removed in Python 3.11).
    random.seed(len(tagged_data_list))
    random.shuffle(tagged_data_list,random.random)
    cutoff = int(development_size*train)
    training_data = tagged_data_list[:cutoff]
# Script fragment (near-duplicate of the block above, whitespace
# variant): backoff chain -> Brill training -> data re-split.
# Relies on names defined elsewhere in the file: training_data,
# evaulation_data (sic), nn_cd_tagger, tagged_data_list,
# development_size, train, max_rules, min_score — TODO confirm.
unigram_tagger = tag.UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))
# Bigram tagger
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))
# Trigram tagger
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))
# Brill tagger templates
templates = brill.fntbl37()
# First iteration
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))
# 10 Folding
# NOTE(review): comment says "10 Folding" but range(1, 5) runs only
# 4 iterations — confirm which is intended.
for i in range(1, 5):
    # Random splitting
    # NOTE(review): seeding with len(tagged_data_list) produces the
    # same shuffle on every run, and the second argument to
    # random.shuffle is deprecated (removed in Python 3.11).
    random.seed(len(tagged_data_list))
    random.shuffle(tagged_data_list, random.random)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
def compare_templates(self):
    """Train this tagger once per built-in NLTK Brill template set,
    printing a banner before each run so the results can be compared.

    Template sets tried, in order: nltkdemo18, nltkdemo18plus,
    brill24, fntbl37.
    """
    template_sets = [
        brill.nltkdemo18(),
        brill.nltkdemo18plus(),
        brill.brill24(),
        brill.fntbl37(),
    ]
    for i, templates in enumerate(template_sets):
        # print() replaces the original Python 2 print statement,
        # which is a syntax error under Python 3 and inconsistent
        # with the print(...) calls used elsewhere in this file.
        print("\nTEMPLATE {}==================\n".format(i))
        self.train(templates=templates)