コード例 #1
0
ファイル: brill.py プロジェクト: menzenski/Razmetka
 def compare_templates(self):
     """Run ``self.train`` once for each of four Brill template sets.

     The sets are the return values of ``brill.nltkdemo18()``,
     ``brill.nltkdemo18plus()``, ``brill.brill24()`` and
     ``brill.fntbl37()``, tried in that order. Before each training run a
     banner with the zero-based index of the set is printed.
     """
     # All four template sets are built up front, before any training starts.
     template_sets = [
         brill.nltkdemo18(),
         brill.nltkdemo18plus(),
         brill.brill24(),
         brill.fntbl37(),
     ]
     for i, t in enumerate(template_sets):
         # Single-argument print(...) parses identically on Python 2 and
         # Python 3; the bare print statement is a SyntaxError on Python 3.
         print("\nTEMPLATE {}==================\n".format(i))
         self.train(templates=t)
コード例 #2
0
ファイル: BrillTagger.py プロジェクト: AnandN5/qint_nlp
    def __init__(self):
        """Create a Brill trainer and immediately train a tagger with it.

        The trainer is stored on ``self.trainer`` and the trained tagger on
        ``self.tagger``. Training uses the fntbl37 template set, runs in
        deterministic mode with tracing off, and learns at most 20 rules.
        """
        baseline_tagger = get_initial_tagger()
        template_set = brill.fntbl37()

        self.trainer = BrillTaggerTrainer(
            baseline_tagger,
            template_set,
            deterministic=True,
            trace=0,
        )

        # The helper returns a pair; only the first element is used here.
        # (presumably the second element is the held-out test split)
        train_sents, _unused_split = utils.training_testing_dataset()
        self.tagger = self.trainer.train(train_sents, max_rules=20)
        print('Brill tagger training completed')
コード例 #3
0
def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger on ``train_sents`` with the fntbl37 templates.

    :param initial_tagger: tagger handed to ``BrillTaggerTrainer`` as the
        starting point for rule learning.
    :param train_sents: training sentences forwarded to ``trainer.train``.
    :param end: unused. It previously only fed a local ``bounds`` list that
        was never read; the parameter is kept so existing callers that pass
        it keep working.
    :param trace: trace level forwarded to ``BrillTaggerTrainer``.
    :param kwargs: forwarded unchanged to ``trainer.train``.
    :return: the value returned by ``trainer.train``.
    """
    # fntbl37() fetches a ready-made template set. Per the note left by the
    # original author, these are 37 templates taken from the postagging task
    # of the fntbl distribution: http://www.cs.jhu.edu/~rflorian/fntbl/
    templates = brill.fntbl37()

    trainer = BrillTaggerTrainer(initial_tagger, templates,
                                 deterministic=True, trace=trace)
    return trainer.train(train_sents, **kwargs)
コード例 #4
0
ファイル: brills_tagger.py プロジェクト: maobedkova/StatNLP
def train_evaluate_brills(train_data, test_data):
    """Train a Brill tagger on ``train_data`` and score it on ``test_data``.

    :param train_data: tagged sentences used both to build the initial
        ``UnigramTagger`` and to learn the Brill rules.
    :param test_data: tagged sentences passed to ``tagger.evaluate``.
    :return: the value returned by ``tagger.evaluate(test_data)``; it is
        also printed with an ``Accuracy:`` label.
    """
    # Presumably resets NLTK's global template registry so repeated calls
    # start from a clean state (private API; confirm against the NLTK
    # version in use).
    brill.Template._cleartemplates()
    # Rule templates provided by nltk.
    templates = brill.fntbl37()
    # Initial tagger: a unigram tagger trained on the same data.
    # Original author's note on this argument: "better unk words handling"
    # (presumably a TODO about unknown words; confirm).
    initial_tagger = UnigramTagger(train_data)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,
        templates=templates,
        trace=3,
        deterministic=True)
    # Learn at most 100 rules.
    tagger = trainer.train(train_data, max_rules=100)
    # Evaluate once and reuse the result; the test set was previously
    # scored twice (once for the print, once for the return value).
    accuracy = tagger.evaluate(test_data)
    print("Accuracy:", accuracy)
    return accuracy
コード例 #5
0
def makeBrillTagger():
    """Train a Brill tagger on the Brown corpus and pickle it to disk.

    The initial tagger is built by ``backoff_tagger`` from an
    Affix/Unigram/Bigram/Trigram chain that finally backs off to a
    ``RegexpTagger`` using the suffix patterns below. The Brill trainer
    then learns up to 100 rules (minimum score 3) from the fntbl37
    templates.

    Side effect: writes the trained tagger to ``data/braubt_tagger.dat``.

    :return: the trained Brill tagger.
    """
    import nltk.tag
    from nltk.tag import brill
    train_sents = nltk.corpus.brown.tagged_sents()
    # (regex, tag) pairs for the last-resort RegexpTagger.
    word_patterns = [
        # NOTE(review): the '.' in the number pattern is unescaped, so it
        # matches any separator character, not only a decimal point.
        # Left as-is; confirm whether that is intended.
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'.*ould$', 'MD'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        # The '.*ic$' pattern was listed twice; the duplicate is dropped.
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        (r'^a$', 'PREP'),
    ]

    raubt_tagger = backoff_tagger(
        train_sents,
        [
            nltk.tag.AffixTagger,
            nltk.tag.UnigramTagger,
            nltk.tag.BigramTagger,
            nltk.tag.TrigramTagger,
        ],
        backoff=nltk.tag.RegexpTagger(word_patterns))

    # An earlier hand-written list of SymmetricProximateTokensTemplate /
    # ProximateTokensTemplate entries was replaced by the stock fntbl37 set.
    templates = brill.fntbl37()
    trainer = nltk.BrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)

    # pickle writes bytes, so the file must be opened in binary mode (text
    # mode fails on Python 3). The with-block also guarantees the handle is
    # flushed and closed.
    with open("data/braubt_tagger.dat", "wb") as out_file:
        pickle.dump(braubt_tagger, out_file)
    return braubt_tagger
コード例 #6
0
ファイル: pattern_wikicorpus.py プロジェクト: jgsogo/lingwars
def contextual_rules(wikicorpus_dir, context_file):
    """Learn Brill rules from the wikicorpus and write them to a file.

    Loads sentences via ``wikicorpus`` (with ``words=1000000``), replaces
    the word of every token tagged ``"NP"`` with a fixed placeholder, and
    trains a Brill tagger (fntbl37 templates, at most 100 rules) on top of
    a ``UnigramTagger``. The learned rules are written one per line.

    :param wikicorpus_dir: location passed straight to ``wikicorpus``.
    :param context_file: path of the output file (opened with mode "w").
    """
    placeholder = "anonymous"
    sentences = wikicorpus(wikicorpus_dir, words=1000000)

    # Replace the word of every NP token in place.
    # NP = proper noun in Parole tagset.
    for sentence in sentences:
        for position, (_, pos_tag) in enumerate(sentence):
            if pos_tag == "NP":
                sentence[position] = (placeholder, "NP")

    templates = fntbl37()

    baseline = UnigramTagger(sentences)
    trainer = BrillTaggerTrainer(baseline, templates, trace=0)
    brill_tagger = trainer.train(sentences, max_rules=100)

    with open(context_file, "w") as out:
        out.writelines("%s\n" % rule for rule in brill_tagger.rules())
コード例 #7
0
    def customize_tagger(train_sets,
                         test_sets=None,
                         tagger_name='Brill_Tagger',
                         return_tagger=False):
        """
        Train a Brill tagger over an n-gram backoff chain and persist it.

        The initial tagger is Bigram -> Unigram -> Default('NN'), each
        n-gram stage trained on ``train_sets``. The Brill stage uses the
        fntbl37 templates and learns at most 300 rules.

        :param train_sets: tagged sentences used for every training stage
        :param test_sets: optional evaluation data; skipped when falsy
        :param tagger_name: name passed to
            ``TaggerUtil.persistenize_tagger_model`` with the trained tagger
        :param return_tagger: when true, also return the trained tagger
        :return: the evaluation score, or ``(score, tagger)`` when
            ``return_tagger`` is true; the score is -1 if no test data
            was given
        """
        # Build the backoff chain from the most general tagger upwards.
        fallback = nltk.DefaultTagger('NN')
        unigram = nltk.UnigramTagger(train_sets, backoff=fallback)
        bigram = nltk.BigramTagger(train_sets, backoff=unigram)

        templates = brill.fntbl37()
        trainer = brill_trainer.BrillTaggerTrainer(bigram, templates, trace=3)
        brill_tagger = trainer.train(train_sets, max_rules=300)

        # The model is persisted before it is evaluated.
        TaggerUtil.persistenize_tagger_model(brill_tagger, tagger_name)

        # -1 marks "not evaluated".
        score = brill_tagger.evaluate(test_sets) if test_sets else -1

        if not return_tagger:
            return score
        return score, brill_tagger
コード例 #8
0
ファイル: training_tagger.py プロジェクト: bugraoral/TextRank
# Backoff chain: each n-gram tagger falls back to the previous, simpler one.
# nn_cd_tagger, training_data and evaulation_data come from outside this
# excerpt.
unigram_tagger = tag.UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))

# Bigram tagger
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))

# Brill tagger templates
templates = brill.fntbl37()

# First iteration: Brill rules learned on top of the trigram tagger.
# max_rules and min_score are passed positionally, in that order.
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))

# Repeated random re-splitting of the data.
# NOTE(review): this was labelled "10 Folding", but range(1, 5) runs only
# 4 rounds. Confirm the intended number of folds.
for i in range(1, 5):

    # Random splitting
    # The generator is re-seeded with the same value on every round.
    random.seed(len(tagged_data_list))
    # random.shuffle's optional second argument was deprecated in Python 3.9
    # and removed in 3.11, where passing it raises TypeError. The default
    # source is the same module-level generator seeded above.
    random.shuffle(tagged_data_list)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
コード例 #9
0
ファイル: training_tagger.py プロジェクト: Temerrut/TextRank
# Backoff chain: each n-gram tagger falls back to the previous, simpler one.
# nn_cd_tagger, training_data and evaulation_data come from outside this
# excerpt.
unigram_tagger = tag.UnigramTagger(training_data, backoff=nn_cd_tagger)
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))

# Bigram tagger
bigram_tagger = tag.BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger
trigram_tagger = tag.TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))

# Brill tagger templates
templates = brill.fntbl37()

# First iteration: Brill rules learned on top of the trigram tagger.
# max_rules and min_score are passed positionally, in that order.
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data, max_rules, min_score)
print("Initial Brill accuracy:")
print(brill_tagger.evaluate(evaulation_data))

# Repeated random re-splitting of the data.
# NOTE(review): this was labelled "10 Folding", but range(1, 5) runs only
# 4 rounds. Confirm the intended number of folds.
for i in range(1, 5):

    # Random splitting
    # The generator is re-seeded with the same value on every round.
    random.seed(len(tagged_data_list))
    # random.shuffle's optional second argument was deprecated in Python 3.9
    # and removed in 3.11, where passing it raises TypeError. The default
    # source is the same module-level generator seeded above.
    random.shuffle(tagged_data_list)
    cutoff = int(development_size * train)
    training_data = tagged_data_list[:cutoff]
コード例 #10
0
ファイル: brill.py プロジェクト: menzenski/Razmetka
 def compare_templates(self):
     """Train with each of four Brill template sets in turn.

     The candidates are the results of ``brill.nltkdemo18()``,
     ``brill.nltkdemo18plus()``, ``brill.brill24()`` and
     ``brill.fntbl37()``. For each one, a banner carrying its zero-based
     position is printed and ``self.train(templates=...)`` is called.
     """
     # The list literal is evaluated first, so every template set exists
     # before the first training run begins.
     candidates = [brill.nltkdemo18(), brill.nltkdemo18plus(),
                   brill.brill24(), brill.fntbl37()]
     for i, t in enumerate(candidates):
         # The Python 2 print statement is a SyntaxError on Python 3;
         # print(...) with one argument behaves the same on both.
         print("\nTEMPLATE {}==================\n".format(i))
         self.train(templates=t)