def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger on top of ``initial_tagger``.

    Eighteen rule templates are built: nine that look at neighbouring
    tags (``brill.Pos``) followed by nine that look at neighbouring words
    (``brill.Word``).  They are given to a deterministic
    ``BrillTaggerTrainer``.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: training sentences forwarded to ``trainer.train``.
    :param kwargs: extra keyword arguments forwarded unchanged to
        ``trainer.train``.
    :return: whatever ``trainer.train(train_sents, **kwargs)`` returns.
    """
    # Context windows, given as position offsets passed to the feature class.
    windows = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    templates = []
    for feature in (brill.Pos, brill.Word):
        for window in windows:
            # A fresh list per feature, exactly as a literal would create.
            templates.append(brill.Template(feature(list(window))))
        # One two-feature template: position -1 combined with position +1.
        templates.append(brill.Template(feature([-1]), feature([1])))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
Beispiel #2
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger with a hand-written set of 18 templates.

    The templates cover the same position windows twice: first over tags
    (``brill.Pos``), then over words (``brill.Word``).  The trainer is
    created with ``deterministic=True`` and ``trace=True``.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: training sentences forwarded to ``trainer.train``.
    :param kwargs: extra keyword arguments forwarded unchanged to
        ``trainer.train``.
    :return: whatever ``trainer.train(train_sents, **kwargs)`` returns.
    """

    def templates_for(feature):
        """Return the nine templates built from one feature class."""
        built = [
            brill.Template(feature(offsets))
            for offsets in (
                [-1], [1], [-2], [2],
                [-2, -1], [1, 2],
                [-3, -2, -1], [1, 2, 3],
            )
        ]
        # Two-feature template: position -1 together with position +1.
        built.append(brill.Template(feature([-1]), feature([1])))
        return built

    # Tag-based templates first, then word-based ones.
    # Alternative mentioned by the original author: a predefined set such as
    # nltkdemo18(), nltkdemo18plus(), fntbl37() or brill24().
    templates = templates_for(brill.Pos) + templates_for(brill.Word)

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
        trace=True,
    )
    return trainer.train(train_sents, **kwargs)
Beispiel #3
0
def train_brill_tagger(initial_tagger, training, **kwargs):
    """
        Train a Brill tagger, which learns rules that correct the output of
        ``initial_tagger``.

        :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
        :param training: training sentences forwarded to ``trainer.train``.
        :param kwargs: extra keyword arguments forwarded unchanged to
            ``trainer.train``.
        :return: whatever ``trainer.train(training, **kwargs)`` returns.
    """
    position_windows = [
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    ]

    # Nine tag-context templates ...
    tag_templates = [
        brill.Template(brill.Pos(list(window))) for window in position_windows
    ]
    tag_templates.append(brill.Template(brill.Pos([-1]), brill.Pos([1])))

    # ... followed by the same nine shapes over word context.
    word_templates = [
        brill.Template(brill.Word(list(window))) for window in position_windows
    ]
    word_templates.append(brill.Template(brill.Word([-1]), brill.Word([1])))

    rule_trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, tag_templates + word_templates, deterministic=True
    )
    return rule_trainer.train(training, **kwargs)
Beispiel #4
0
 def test_pos_template(self):
     """Train with a single ``Pos([-1])`` template and check one sentence.

     The baseline is a ``UnigramTagger`` trained on the first 1000 treebank
     sentences; the same sentences are used to train the Brill rules.
     """
     corpus = treebank.tagged_sents()[:1000]
     baseline = UnigramTagger(corpus)
     single_template = [brill.Template(brill.Pos([-1]))]
     trainer = brill_trainer.BrillTaggerTrainer(baseline, single_template)
     brill_tagger = trainer.train(corpus)

     # Example from https://github.com/nltk/nltk/issues/769
     tokens = 'This is a foo bar sentence'.split()
     tagged = brill_tagger.tag(tokens)
     self.assertEqual(
         tagged,
         [
             ('This', 'DT'),
             ('is', 'VBZ'),
             ('a', 'DT'),
             ('foo', None),
             ('bar', 'NN'),
             ('sentence', None),
         ],
     )
Beispiel #5
0
def train_evaluate_brills(train_data, test_data):
    """Train a Brill tagger on ``train_data`` and score it on ``test_data``.

    The baseline is a ``UnigramTagger`` trained on ``train_data`` (no backoff
    tagger is configured).  Rules are learned from NLTK's ``fntbl37``
    template set, capped at 100 rules.

    :param train_data: tagged sentences used for both the baseline tagger
        and the Brill rule learning.
    :param test_data: tagged sentences the trained tagger is evaluated on.
    :return: the value of ``tagger.evaluate(test_data)``, which is also
        printed once.
    """
    # Reset the template registry before loading a predefined template set
    # (presumably so repeated calls do not accumulate templates -- confirm
    # against the NLTK version in use).
    brill.Template._cleartemplates()
    templates = brill.fntbl37()

    # Baseline tagger trained on the same data the rules are learned from.
    initial_tagger = UnigramTagger(train_data)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,
        templates=templates,
        trace=3,
        deterministic=True)
    tagger = trainer.train(train_data,
                           max_rules=100)  # learn at most 100 rules

    # Evaluate once and reuse the result; the original tagged the whole
    # test set a second time just to produce the return value.
    accuracy = tagger.evaluate(test_data)
    print("Accuracy:", accuracy)
    return accuracy
Beispiel #6
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that learns rules correcting ``initial_tagger``.

    Each template describes the context a learned rule may inspect:
    the tags (``brill.Pos``) or the words (``brill.Word``) at the given
    positions.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: training sentences forwarded to ``trainer.train``.
    :param kwargs: keyword arguments forwarded to ``trainer.train``.
        ``max_rules`` defaults to 100 and ``min_score`` to 2 when not given.
        (Previously ``kwargs`` was accepted but silently ignored.)
    :return: whatever ``trainer.train`` returns.
    """
    templates = [
        brill.Template(brill.Pos([-1])),  # tag of the previous token
        brill.Template(brill.Pos([1])),  # tag of the next token
        brill.Template(brill.Pos([-2])),  # tag two positions back
        brill.Template(brill.Pos([2])),  # tag two positions ahead
        brill.Template(brill.Pos([-2, -1])),  # tags of the two previous tokens
        brill.Template(brill.Pos([1, 2])),  # tags of the two next tokens
        brill.Template(brill.Pos([-3, -2, -1])),  # tags of the three previous tokens
        brill.Template(brill.Pos([1, 2, 3])),  # tags of the three next tokens
        brill.Template(brill.Pos([-1]), brill.Pos([1])),  # previous tag and next tag
        brill.Template(brill.Word([-1])),  # previous word
        brill.Template(brill.Word([1])),  # next word
        brill.Template(brill.Word([-2])),  # word two positions back
        brill.Template(brill.Word([2])),  # word two positions ahead
        brill.Template(brill.Word([-2, -1])),  # the two previous words
        brill.Template(brill.Word([1, 2])),  # the two next words
        brill.Template(brill.Word([-3, -2, -1])),  # the three previous words
        brill.Template(brill.Word([1, 2, 3])),  # the three next words
        brill.Template(brill.Word([-1]), brill.Word([1])),  # previous word and next word
    ]

    # BrillTaggerTrainer parameters, in order (as listed by the original author):
    #   initial_tagger: the baseline tagger
    #   templates:      list of templates used in training
    #   trace:          verbosity level (int)
    #   deterministic:  if True, adjudicate ties deterministically
    #   ruleformat:     format of reported rules (str)
    # trace=True is kept from the original; as an int it equals 1.
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True, trace=True)

    # Caller-supplied options override the historical hard-coded defaults.
    train_options = {'max_rules': 100, 'min_score': 2}
    train_options.update(kwargs)
    return trainer.train(train_sents, **train_options)
 def test_pos_template(self):
     """Check tagging output after training with one ``Pos([-1])`` template.

     Both the ``UnigramTagger`` baseline and the Brill rules are trained on
     the first 1000 treebank sentences.
     """
     sample = treebank.tagged_sents()[:1000]
     unigram = UnigramTagger(sample)
     trainer = brill_trainer.BrillTaggerTrainer(
         unigram, [brill.Template(brill.Pos([-1]))])
     trained = trainer.train(sample)

     # Example from https://github.com/nltk/nltk/issues/769
     words = "This is a foo bar sentence".split()
     expected_tags = ["DT", "VBZ", "DT", None, "NN", None]
     observed = trained.tag(words)
     # Pair every input word with the tag it is expected to receive.
     self.assertEqual(observed, list(zip(words, expected_tags)))
Beispiel #8
0
def train_brill_tagger(tagged_sents):
    """Train a Brill tagger over a regexp -> unigram -> bigram -> trigram chain.

    :param tagged_sents: tagged training sentences; used to train each
        n-gram tagger in the backoff chain and to learn the Brill rules.
    :return: the tagger returned by ``BrillTaggerTrainer.train`` with
        ``max_rules=200``.
    """
    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37

    # Consonant alternatives, kept on ONE line.  The original wrote this
    # pattern as a triple-quoted string spanning two lines, which embedded a
    # literal newline plus indentation into the regex (no verbose flag is
    # requested here), so the 'MORA' pattern could never match a token.  It
    # also contained an empty alternative ("n||p") that made the consonant
    # optional.
    consonant = r'(b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z)'
    patterns = [
        (consonant + r'e' + consonant, 'MORA'),
        (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
        (r'.*', 'MORA_HAUPT'),  # default
    ]
    regex_tagger = nltk.RegexpTagger(patterns)

    # Each tagger falls back to the previous, less specific one.
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)
    # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)

    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)

    # The original had a bare `print`, which does nothing in Python 3;
    # emit the blank line after the trace output that it was meant to print.
    print()
    return tagger5
Beispiel #9
0
    def customize_tagger(train_sets,
                         test_sets=None,
                         tagger_name='Brill_Tagger',
                         return_tagger=False):
        """
        Train a Brill tagger over a default -> unigram -> bigram backoff
        chain, persist it, and optionally score it.

        :param train_sets: tagged sentences used to train the n-gram taggers
            and to learn the Brill rules
        :param test_sets: optional evaluation data; when falsy no evaluation
            is run
        :param tagger_name: name passed to
            ``TaggerUtil.persistenize_tagger_model`` together with the tagger
        :param return_tagger: when true, return ``(score, tagger)`` instead
            of just the score
        :return: the evaluation score (``-1`` when ``test_sets`` is falsy),
            or a ``(score, tagger)`` tuple when ``return_tagger`` is true
        """
        # Backoff chain: every tagger defers to the previous, simpler one.
        fallback = nltk.DefaultTagger('NN')
        unigram = nltk.UnigramTagger(train_sets, backoff=fallback)
        bigram = nltk.BigramTagger(train_sets, backoff=unigram)

        rule_templates = brill.fntbl37()
        trainer = brill_trainer.BrillTaggerTrainer(bigram,
                                                   rule_templates,
                                                   trace=3)
        trained = trainer.train(train_sets, max_rules=300)

        # Persist the Brill tagger under the requested name.
        TaggerUtil.persistenize_tagger_model(trained, tagger_name)

        # -1 marks "not evaluated".
        score = trained.evaluate(test_sets) if test_sets else -1

        return (score, trained) if return_tagger else score
Beispiel #10
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger using 18 explicit tag- and word-context templates.

    :param initial_tagger: baseline tagger handed to ``BrillTaggerTrainer``.
    :param train_sents: training sentences forwarded to ``trainer.train``.
    :param kwargs: extra keyword arguments forwarded unchanged to
        ``trainer.train``.
    :return: whatever ``trainer.train(train_sents, **kwargs)`` returns.
    """
    # Short local aliases keep the template table readable.
    make = brill.Template
    tag_at = brill.Pos
    word_at = brill.Word

    templates = [
        # -- templates conditioned on surrounding part-of-speech tags --
        make(tag_at([-1])),  # e.g. a rule keyed on the previous tag
        make(tag_at([1])),
        make(tag_at([-2])),
        make(tag_at([2])),
        make(tag_at([-2, -1])),
        make(tag_at([1, 2])),
        make(tag_at([-3, -2, -1])),
        make(tag_at([1, 2, 3])),
        make(tag_at([-1]), tag_at([1])),
        # -- templates conditioned on surrounding words --
        make(word_at([-1])),
        make(word_at([1])),
        make(word_at([-2])),
        make(word_at([2])),
        make(word_at([-2, -1])),  # e.g. a rule keyed on the two previous words
        make(word_at([1, 2])),
        make(word_at([-3, -2, -1])),
        make(word_at([1, 2, 3])),
        make(word_at([-1]), word_at([1])),
    ]

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
    )
    return trainer.train(train_sents, **kwargs)
    sents_RNC = list(read_corpus_to_nltk(media))

# Read the LENTA sub-corpus
with open('LENTA_RNC.txt', encoding='utf-8') as LENTA:
    sents1 = list(read_corpus_to_nltk(LENTA))

# Read the VK sub-corpus
with open('VK_RNC.txt', encoding='utf-8') as VK:
    sents2 = list(read_corpus_to_nltk(VK))

# Read the JZ sub-corpus
with open('JZ_RNC.txt', encoding='utf-8') as JZ:
    sents3 = list(read_corpus_to_nltk(JZ))

# Baseline tagger; the training corpus can be sents_RNC or sents_OC.
tagger = PMContextTagger(sents_RNC)
tagger = bt.BrillTaggerTrainer(tagger, templates, trace=3)
tagger = tagger.train(sents_RNC, max_rules=400)  # upper bound on learned rules
tagger.print_template_statistics(printunused=False)  # template statistics table

# Read and tokenize the file to be tagged by the trained tagger
# (VK_TEST, without annotation).  A context manager is used so the file
# handle is closed; the original left it open.
with open('VK_TEST.txt', mode='r', encoding='utf-8') as raw_text:
    inFile = nltk.word_tokenize(raw_text.read())

# Write the tagged text to tagged_text.txt
with open('tagged_text.txt', mode='w', encoding='utf-8') as tagged:
    print(tagger.tag(inFile), file=tagged)

# Write the evaluation results for each sub-corpus to tagger_result.txt
with open('tagger_result.txt', mode='w', encoding='utf-8') as result:
    print('Оценка результатов по выборке LENTA: ', tagger.evaluate(sents1), file=result)
    print('Оценка результатов по выборке VK: ', tagger.evaluate(sents2), file=result)
    print('Оценка результатов по выборке JZ: ', tagger.evaluate(sents3), file=result)
Beispiel #12
0
# Bigram tagger, backing off to the unigram tagger
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger, backing off to the bigram tagger
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))

# Brill tagger templates
templates = brill.fntbl37()

# First iteration: Brill rules learned on top of the trigram tagger
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))

# Repeated random re-splits (4 iterations: range(1, 5)), not a 10-fold split
for i in range(1, 5):

    # Random splitting.
    # NOTE(review): the generator is re-seeded with the same value on every
    # iteration; the splits still differ only because the list is shuffled
    # in place cumulatively -- confirm this is intended.
    random.seed(len(tagged_data_list))
    # The second argument of random.shuffle was removed in Python 3.11;
    # the default already draws from the module-level generator.
    random.shuffle(tagged_data_list)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
    evaulation_data = tagged_data_list[cutoff:development_size]

    print("Fold: ")
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),
    brill.Template(brill.Word([-2])),
    brill.Template(brill.Word([2])),
    brill.Template(brill.Word([-2, -1])),
    brill.Template(brill.Word([1, 2])),
    brill.Template(brill.Word([-3, -2, -1])),
    brill.Template(brill.Word([1, 2, 3])),
    brill.Template(brill.Word([-1]), brill.Pos([1])),
]

# Train a Brill tagger on the NPS chat corpus, starting from the project's
# backoff tagger.
train_sents = nltk.corpus.nps_chat.tagged_posts()
initial_tagger = create_backoff_tagger()
trainer = brill_trainer.BrillTaggerTrainer(initial_tagger,
                                           templates,
                                           deterministic=True)
brill_tagger = trainer.train(train_sents)

from pickle import dump

# Serialize the trained tagger.  The context manager closes the file even
# if dump() raises; protocol -1 selects the highest pickle protocol.
with open('brill_tagger.pk1', 'wb') as output:
    dump(brill_tagger, output, -1)

# import pickle
# f = open('brill_tagger.pickle', 'wb')
# pickle.dump(brill_tagger, f)
# f.close()
Beispiel #14
0
    # Hold out one block as the test set; `iter` is presumably the index of
    # the enclosing loop (not visible here) and shadows the builtin `iter`.
    test = bloques[iter]
    # Flatten every other block into a single training list.
    train = []
    for element in bloques:
        # NOTE(review): compared by value, so any block equal to the test
        # block is left out as well -- confirm blocks are distinct.
        if element != test:
            for item in element:
                train.append(item)

    # Tagger training

    # Brill tagger
    baseline_data = train
    baseline = UnigramTagger(baseline_data)
    #baseline = hmm.HiddenMarkovModelTagger.train(baseline_data)
    # Two templates: previous tag alone, and previous tag plus current word.
    templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
    tagger_brill_tr = brill_trainer.BrillTaggerTrainer(initial_tagger=baseline,
                                                       templates=templates,
                                                       trace=3)
    # Learn at most 10 rules from the training list.
    tagger_brill = tagger_brill_tr.train(train, max_rules=10)
    '''
    # CRF tagger
    tagger_crf = crf.CRFTagger()
    tagger_crf.train(train, "model")
    print("CRF Fold", iter)
    '''
    '''
    # Perceptron tagger
    tagger_perceptron = perceptron.PerceptronTagger(load = False)
    tagger_perceptron.train(train)
    '''

    # Tagger evaluation