def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that refines the output of ``initial_tagger``.

    The rule templates cover part-of-speech context and word context at the
    same set of relative offsets, plus one "previous and next" template for
    each of the two feature types.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: tagged sentences passed to ``trainer.train``.
    :param kwargs: forwarded unchanged to ``trainer.train``.
    :return: whatever ``trainer.train`` returns.
    """
    # Relative offsets used for the single-feature templates, in the order
    # the templates are created.
    offsets = (
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    )

    templates = []
    for feature in (brill.Pos, brill.Word):
        templates.extend(
            brill.Template(feature(list(positions))) for positions in offsets
        )
        # Two-condition template: previous item and next item.
        templates.append(brill.Template(feature([-1]), feature([1])))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Build the rule templates and train a Brill tagger with tracing on.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: tagged sentences passed to ``trainer.train``.
    :param kwargs: forwarded unchanged to ``trainer.train``.
    :return: whatever ``trainer.train`` returns.
    """

    def context_templates(feature):
        # One template per offset window, then the "previous and next" pair.
        windows = [
            [-1], [1], [-2], [2],
            [-2, -1], [1, 2],
            [-3, -2, -1], [1, 2, 3],
        ]
        built = [brill.Template(feature(window)) for window in windows]
        built.append(brill.Template(feature([-1]), feature([1])))
        return built

    # POS-based templates are created first, word-based ones second.
    templates = context_templates(brill.Pos) + context_templates(brill.Word)
    # Ready-made alternatives noted in the original:
    # templates = nltkdemo18()  # nltkdemo18plus()  # fntbl37()  # brill24()

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )
    return trainer.train(train_sents, **kwargs)
def train_brill_tagger(initial_tagger, training, **kwargs):
    """Train a Brill tagger, which uses rules to correct a tagger's results.

    :param initial_tagger: tagger whose output the learned rules correct.
    :param training: tagged training data passed to ``trainer.train``.
    :param kwargs: forwarded unchanged to ``trainer.train``.
    :return: whatever ``trainer.train`` returns.
    """
    windows = (
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    )

    # Templates conditioned on surrounding part-of-speech tags.
    pos_rules = [brill.Template(brill.Pos(window)) for window in windows]
    pos_rules.append(brill.Template(brill.Pos([-1]), brill.Pos([1])))

    # Templates conditioned on surrounding words.
    word_rules = [brill.Template(brill.Word(window)) for window in windows]
    word_rules.append(brill.Template(brill.Word([-1]), brill.Word([1])))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, pos_rules + word_rules, deterministic=True
    )
    return trainer.train(training, **kwargs)
def test_pos_template(self):
    """Check the tags produced by a Brill tagger trained with one POS template.

    The baseline is a ``UnigramTagger`` trained on the first 1000 treebank
    sentences; the only rule template looks at the previous POS tag.
    """
    corpus = treebank.tagged_sents()[:1000]
    baseline = UnigramTagger(corpus)
    previous_tag = brill.Template(brill.Pos([-1]))
    trainer = brill_trainer.BrillTaggerTrainer(baseline, [previous_tag])
    brill_tagger = trainer.train(corpus)

    # Example from https://github.com/nltk/nltk/issues/769
    tokens = 'This is a foo bar sentence'.split()
    expected_tags = ['DT', 'VBZ', 'DT', None, 'NN', None]
    self.assertEqual(brill_tagger.tag(tokens), list(zip(tokens, expected_tags)))
def train_evaluate_brills(train_data, test_data):
    """Train a Brill tagger on ``train_data`` and score it on ``test_data``.

    The baseline is a ``UnigramTagger`` trained on ``train_data``; the rule
    templates come from ``brill.fntbl37()`` and at most 100 rules are learned.

    :param train_data: tagged sentences used for both the baseline tagger and
        the Brill trainer.
    :param test_data: tagged sentences the trained tagger is evaluated on.
    :return: the value of ``tagger.evaluate(test_data)``, which is also printed.
    """
    # Reset the template registry before asking NLTK for its template set.
    brill.Template._cleartemplates()
    templates = brill.fntbl37()

    # Baseline tagger that the learned rules will correct.
    initial_tagger = UnigramTagger(train_data)

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,
        templates=templates,
        trace=3,
        deterministic=True,
    )
    tagger = trainer.train(train_data, max_rules=100)  # learn at most 100 rules

    # Evaluate once and reuse the result: a second pass doubles the tagging
    # work and would see no data if test_data were a one-shot iterable.
    accuracy = tagger.evaluate(test_data)
    print("Accuracy:", accuracy)
    return accuracy
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger on top of ``initial_tagger``.

    Some suggested rules for the templates:

    * change the POS of a word, depending on the POS of the previous word
    * change the POS of a word, depending on the POS of any of the two
      previous words
    * change the POS of a word, depending on the POS of any of the three
      previous words
    * change the POS of a word, depending on the POS of the previous word and
      the POS of the next word
    * change the POS of a word, depending on the previous word
    * change the POS of a word, depending on any of the two previous words
    * change the POS of a word, depending on any of the three previous words
    * change the POS of a word, depending on the previous word and the next
      word

    :param initial_tagger: the baseline tagger.
    :param train_sents: tagged sentences to train on.
    :param kwargs: forwarded to ``trainer.train``. ``max_rules`` defaults to
        100 and ``min_score`` to 2 when the caller does not supply them
        (previously every keyword argument was silently ignored).
    :return: whatever ``trainer.train`` returns.
    """
    # Each template describes the context a learned transformation rule may
    # condition on.
    templates = [
        brill.Template(brill.Pos([-1])),          # POS tag of the previous word
        brill.Template(brill.Pos([1])),           # POS tag of the next word
        brill.Template(brill.Pos([-2])),          # POS tag two positions back
        brill.Template(brill.Pos([2])),           # POS tag two positions ahead
        brill.Template(brill.Pos([-2, -1])),      # POS tags of the two previous words
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]

    # BrillTaggerTrainer parameters, in order:
    #   initial_tagger: (Tagger) the baseline tagger
    #   templates:      (list of templates) templates to be used in training
    #   trace:          (int) verbosity level
    #   deterministic:  (bool) if True, adjudicate ties deterministically
    #   ruleformat:     (str) format of reported rules
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )

    # Keep the historical settings as defaults, but let callers override them
    # instead of discarding **kwargs.
    kwargs.setdefault('max_rules', 100)
    kwargs.setdefault('min_score', 2)
    return trainer.train(train_sents, **kwargs)
def test_pos_template(self):
    """Train with a single previous-POS template and verify a sample tagging."""
    sample = treebank.tagged_sents()[:1000]
    unigram = UnigramTagger(sample)
    rule_templates = [brill.Template(brill.Pos([-1]))]
    brill_tagger = brill_trainer.BrillTaggerTrainer(
        unigram, rule_templates
    ).train(sample)

    # Example from https://github.com/nltk/nltk/issues/769
    words = "This is a foo bar sentence".split()
    tags = ("DT", "VBZ", "DT", None, "NN", None)
    expected = [(word, tag) for word, tag in zip(words, tags)]

    result = brill_tagger.tag(words)
    self.assertEqual(result, expected)
def train_brill_tagger(tagged_sents):
    """Train a Brill tagger over a regexp/unigram/bigram/trigram backoff chain.

    A ``RegexpTagger`` is the last-resort backoff; unigram, bigram and
    trigram taggers are stacked on top of it, and the trigram tagger is the
    initial tagger handed to ``BrillTaggerTrainer``. Training uses the
    ``brill24()`` templates and ``max_rules=200``.

    :param tagged_sents: tagged sentences used to train every tagger in the
        chain as well as the Brill rules.
    :return: the object returned by ``BrillTaggerTrainer.train``.
    """
    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37
    # default_tagger = nltk.DefaultTagger('MORA_HAUPT')
    # NOTE(review): both consonant groups contain `n||p`, i.e. an empty
    # alternative, so the group can match the empty string -- confirm whether
    # that is intended. The whitespace between `e` and the second group is
    # part of the pattern.
    patterns = [(r'''(b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)e (b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)''', 'MORA'),
                (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
                (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)
    # Each n-gram tagger backs off to the previous, less specific one.
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)
    # tagger4 is the trainer; tagger5 is the trained Brill tagger.
    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)
    # NOTE(review): a bare `print` emits a blank line only under Python 2;
    # under Python 3 it is an expression with no effect.
    print
    return tagger5
def customize_tagger(train_sets, test_sets=None, tagger_name='Brill_Tagger', return_tagger=False):
    """Train, persist and optionally score a customized Brill tagger.

    :param train_sets: tagged sentences used for the backoff chain and for
        learning the Brill rules.
    :param test_sets: tagged sentences to evaluate on; when falsy no
        evaluation is run and the score is ``-1``.
    :param tagger_name: name passed to
        ``TaggerUtil.persistenize_tagger_model`` together with the tagger.
    :param return_tagger: when true, return ``(score, tagger)`` instead of
        just the score.
    :return: the trained tagger's score, or a ``(score, tagger)`` tuple.
    """
    # Baseline: bigram -> unigram -> constant 'NN' backoff chain.
    baseline = nltk.BigramTagger(
        train_sets,
        backoff=nltk.UnigramTagger(
            train_sets,
            backoff=nltk.DefaultTagger('NN'),
        ),
    )
    # Disabled alternatives kept from the original:
    # tagger = nltk.tag.PerceptronTagger()
    # os.environ['JAVAHOME'] = java_path
    # tagger = StanfordPOSTagger(stanford_tagger_model_path, stanford_tagger_jar_path)

    rule_templates = brill.fntbl37()
    trainer = brill_trainer.BrillTaggerTrainer(baseline, rule_templates, trace=3)
    trained = trainer.train(train_sets, max_rules=300)

    # The model is persisted before it is evaluated.
    # TaggerUtil.persistenize_tagger_model(tagger, 'Multigram_Tagger')
    TaggerUtil.persistenize_tagger_model(trained, tagger_name)

    score = trained.evaluate(test_sets) if test_sets else -1
    return (score, trained) if return_tagger else score
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Create the Brill rule templates and train on ``train_sents``.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: tagged sentences passed to ``trainer.train``.
    :param kwargs: forwarded unchanged to ``trainer.train``.
    :return: whatever ``trainer.train`` returns.
    """
    # Offset windows shared by the POS-based and the word-based templates.
    # For example, Pos([-1]) lets a rule be generated from the previous
    # part-of-speech tag, and Word([-2, -1]) looks at the two previous words.
    single_windows = (
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    )

    templates = []
    for make_feature in (brill.Pos, brill.Word):
        for window in single_windows:
            templates.append(brill.Template(make_feature(window)))
        # Combined condition on the previous and the next position.
        templates.append(
            brill.Template(make_feature([-1]), make_feature([1]))
        )

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
# NOTE(review): `media` is defined outside this fragment -- presumably an
# already opened corpus file; confirm against the preceding code.
sents_RNC = list(read_corpus_to_nltk(media))

# Read the LENTA subcorpus.
with open('LENTA_RNC.txt', encoding='utf-8') as LENTA:
    sents1 = list(read_corpus_to_nltk(LENTA))

# Read the VK subcorpus.
with open('VK_RNC.txt', encoding='utf-8') as VK:
    sents2 = list(read_corpus_to_nltk(VK))

# Read the JZ subcorpus.
with open('JZ_RNC.txt', encoding='utf-8') as JZ:
    sents3 = list(read_corpus_to_nltk(JZ))

# Choose the training corpus here: sents_RNC or sents_OC.
tagger = PMContextTagger(sents_RNC)
trainer = bt.BrillTaggerTrainer(tagger, templates, trace=3)
tagger = trainer.train(sents_RNC, max_rules=400)  # maximum number of rules
tagger.print_template_statistics(printunused=False)  # table of template statistics

# Read and tokenize the file to be tagged by the trained tagger (VK_TEST,
# without annotation). The with-block closes the handle, which the original
# bare open(...).read() never did.
with open('VK_TEST.txt', mode='r', encoding='utf-8') as test_file:
    inFile = nltk.word_tokenize(test_file.read())

# Write the tagged text to tagged_text.txt.
with open('tagged_text.txt', mode='w', encoding='utf-8') as tagged:
    print(tagger.tag(inFile), file=tagged)

# Write the evaluation results for each subcorpus to tagger_result.txt.
with open('tagger_result.txt', mode='w', encoding='utf-8') as result:
    print('Оценка результатов по выборке LENTA: ', tagger.evaluate(sents1), file=result)
    print('Оценка результатов по выборке VK: ', tagger.evaluate(sents2), file=result)
    print('Оценка результатов по выборке JZ: ', tagger.evaluate(sents3), file=result)
# Bigram tagger, backing off to the unigram tagger.
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger, backing off to the bigram tagger.
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))

# Brill tagger templates
templates = brill.fntbl37()

# First iteration: Brill rules learned on top of the trigram tagger.
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))

# Repeated random re-splits. The original comment said "10 Folding", but
# range(1, 5) runs four rounds.
for i in range(1, 5):
    # Random splitting. The list is shuffled in place, so each round starts
    # from the previous round's order even though the seed is the same.
    random.seed(len(tagged_data_list))
    # random.shuffle() lost its second `random` argument in Python 3.11;
    # the one-argument form works on every Python 3 version.
    random.shuffle(tagged_data_list)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
    evaulation_data = tagged_data_list[cutoff:development_size]
    print("Fold: ")
brill.Template(brill.Pos([1, 2])), brill.Template(brill.Pos([-3, -2, -1])), brill.Template(brill.Pos([1, 2, 3])), brill.Template(brill.Pos([-1]), brill.Pos([1])), brill.Template(brill.Word([-1])), brill.Template(brill.Word([1])), brill.Template(brill.Word([-2])), brill.Template(brill.Word([2])), brill.Template(brill.Word([-2, -1])), brill.Template(brill.Word([1, 2])), brill.Template(brill.Word([-3, -2, -1])), brill.Template(brill.Word([1, 2, 3])), brill.Template(brill.Word([-1]), brill.Pos([1])), ] train_sents = nltk.corpus.nps_chat.tagged_posts() initial_tagger = create_backoff_tagger() trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True) brill_tagger = trainer.train(train_sents) from pickle import dump output = open('brill_tagger.pk1', 'wb') dump(brill_tagger, output, -1) output.close() # import pickle # f = open('brill_tagger.pickle', 'wb') # pickle.dump(brill_tagger, f) # f.close()
# Hold out the current block for testing; every item of the other blocks
# goes into the training set.
test = bloques[iter]
train = [item for element in bloques if element != test for item in element]

# Tagger training
# Brill tagger
baseline_data = train
baseline = UnigramTagger(baseline_data)
# baseline = hmm.HiddenMarkovModelTagger.train(baseline_data)
templates = [
    Template(Pos([-1])),
    Template(Pos([-1]), Word([0])),
]
tagger_brill_tr = brill_trainer.BrillTaggerTrainer(
    initial_tagger=baseline,
    templates=templates,
    trace=3,
)
tagger_brill = tagger_brill_tr.train(train, max_rules=10)

# Disabled alternative: CRF tagger
# tagger_crf = crf.CRFTagger()
# tagger_crf.train(train, "model")
# print("CRF Fold", iter)

# Disabled alternative: Perceptron tagger
# tagger_perceptron = perceptron.PerceptronTagger(load = False)
# tagger_perceptron.train(train)

# Tagger evaluation