Beispiel #1
0
def train(train_sentences):
    """Train a Brill tagger stacked on a unigram tagger with a default fallback.

    The chain is built bottom-up: a ``DefaultTagger`` constructed with the
    tag ``'NC'``, a ``UnigramTagger`` trained on ``train_sentences`` that
    backs off to it, and finally a ``BrillTaggerTrainer`` that learns
    transformation rules on top of the unigram tagger.

    Progress messages are written to stdout with single-argument
    ``print(...)`` calls, which behave the same on Python 2 and Python 3
    (the former ``print "..."`` statements are a syntax error on Python 3).

    :param train_sentences: training corpus handed unchanged to
        ``UnigramTagger`` and to ``BrillTaggerTrainer.train``.
    :return: the tagger returned by ``BrillTaggerTrainer.train``.
    """
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define which features the Brill tagger may use,
    # expressed relative to the position of the word being tagged.
    # Clear previously registered templates before creating new ones.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]

    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    # Learn at most 1000 transformation rules.
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")

    return tagger
Beispiel #2
0
    def __init__(self, args, model_name, load_model=False):
        """Initialise the tagger and, unless loading, prepare a Brill trainer.

        The parent initialiser always runs. When ``load_model`` is false, an
        ``HMM`` base tagger named ``"hmm"`` is instantiated with
        ``load_model=True`` and a ``BrillTaggerTrainer`` over its ``model``
        is stored in ``self.model``.

        :param args: forwarded to the parent initialiser and to ``HMM``.
        :param model_name: forwarded to the parent initialiser.
        :param load_model: when true, nothing beyond the parent
            initialisation is done here.
        :raises FileNotFoundError: if the base tagger reports that no saved
            model exists.
        """
        super().__init__(args, model_name, load_model)
        if load_model:
            return

        base_tagger = HMM(args, "hmm", load_model=True)
        if not base_tagger.saved_model_exists():
            raise FileNotFoundError(f"Brill base tagger '{base_tagger.model_name}' missing!")

        # Context windows (offsets relative to the current token) shared by
        # the tag-based and the word-based templates.
        context_windows = (
            (-1,),
            (1,),
            (-2,),
            (2,),
            (-2, -1),
            (1, 2),
            (-3, -2, -1),
            (1, 2, 3),
        )

        # Templates are created in the same order as a hand-written list:
        # all Pos windows, the Pos "surrounding" pair, then the same for Word.
        features = []
        for feature_cls in (Pos, Word):
            features.extend(
                Template(feature_cls(list(window))) for window in context_windows
            )
            features.append(Template(feature_cls([-1]), feature_cls([1])))

        self.model = nltk.BrillTaggerTrainer(base_tagger.model, features)
Beispiel #3
0
def meta_comparison(nb_iterations):
    """Evaluate Brill taggers trained with an increasing rule budget.

    For every ``step`` in ``range(1, nb_iterations)`` a fresh Brill tagger is
    trained on the module-level ``train_sentences`` with
    ``max_rules=step * 50`` and scored on the module-level
    ``test_sentences``.

    NOTE: because the range starts at 1 and stops before ``nb_iterations``,
    ``nb_iterations - 1`` taggers are trained (none for ``nb_iterations <= 1``).

    :param nb_iterations: exclusive upper bound of the step counter.
    :return: list with one evaluation score per trained tagger, in order.
    """
    # Suffix heuristics taken from http://www.nltk.org/book/ch05.html
    suffix_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    baseline = RegexpTagger(suffix_rules)

    scores = []
    step = 1
    while step < nb_iterations:
        # See https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        # Templates are cleared and recreated for every training run.
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        trainer = BrillTaggerTrainer(baseline, rule_templates, trace=3)
        trained = trainer.train(train_sentences, max_rules=step * 50)
        scores.append(trained.evaluate(test_sentences))
        step += 1

    return scores
Beispiel #4
0
def brill_rules_pos_wd_feats_offset_4():
    """
    Return nine single-word templates, one per offset.

    The offsets are listed in the order -1, -2, -3, -4, 0, 1, 2, 3, 4
    relative to the token being tagged; each template holds exactly one
    ``Word`` feature at that offset.
    """
    offsets = (-1, -2, -3, -4, 0, 1, 2, 3, 4)
    return [Template(Word([offset])) for offset in offsets]
Beispiel #5
0
def brill_rules_pos_bigram_feats_offset_4():
    """
    Return eight word-bigram templates around the current token.

    Each template holds a single ``Word`` feature over two adjacent
    offsets: four windows reaching back to offset -4 and four windows
    reaching forward to offset 4.
    """
    offset_pairs = (
        (-1, 0),
        (-2, -1),
        (-3, -2),
        (-4, -3),
        (1, 0),
        (2, 1),
        (3, 2),
        (4, 3),
    )
    return [Template(Word(list(pair))) for pair in offset_pairs]
Beispiel #6
0
def create_tagger(train_sents):
    """Build an n-gram backoff tagger, optionally refined by Brill rules.

    A default tagger constructed with the tag ``'S'`` is wrapped, in turn,
    by unigram, bigram and trigram taggers trained on ``train_sents``, each
    backing off to the previous one. When the module-level ``brill_value``
    is exactly ``True``, a Brill tagger is trained on top of the trigram
    tagger and returned instead.

    :param train_sents: training corpus passed to every tagger/trainer.
    :return: the trigram backoff tagger, or the trained Brill tagger.
    """
    ngram_tagger = nltk.DefaultTagger('S')
    for ngram_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
        ngram_tagger = ngram_cls(train_sents, backoff=ngram_tagger)

    # Identity check on purpose: only the literal True enables Brill training.
    if brill_value is not True:
        return ngram_tagger

    Template._cleartemplates()

    # Reduced template set: current word paired with its left/right neighbour.
    word_pair_templates = [
        Template(Word([0]), Word([-1])),
        Template(Word([0]), Word([1])),
    ]

    trainer = BrillTaggerTrainer(ngram_tagger, word_pair_templates, trace=3)
    return trainer.train(train_sents, max_rules=20, min_score=0, min_acc=None)
Beispiel #7
0
def demo_generated_templates():
    """
    Demonstrate bulk template generation.

    ``Feature.expand`` (here via ``Word.expand`` / ``Pos.expand``) and
    ``Template.expand`` are class methods for generating large numbers of
    features and templates; see their documentation for details.

    Note: training with several hundred templates can easily exhaust all
    available memory, even on relatively small corpora.
    """
    word_features = Word.expand([-1,0,1], [1,2], excludezero=False)
    tag_features = Pos.expand([-2,-1,0,1], [1,2], excludezero=True)
    generated = list(
        Template.expand([word_features, tag_features], combinations=(1,3))
    )
    report = "Generated {0} templates for transformation-based learning"
    print(report.format(len(generated)))
    postag(templates=generated, incremental_stats=True, template_stats=True)
def demo_generated_templates():
    """
    Generate a large template set programmatically and run the tagging demo.

    Word features and tag features are expanded with ``Word.expand`` and
    ``Pos.expand``, combined through ``Template.expand``, and the resulting
    templates are passed to ``postag`` with incremental and per-template
    statistics enabled. See the ``expand`` class methods for details.

    Note: training with several hundred templates can easily exhaust all
    available memory, even on relatively small corpora.
    """
    feature_sets = [
        Word.expand([-1,0,1], [1,2], excludezero=False),
        Pos.expand([-2,-1,0,1], [1,2], excludezero=True),
    ]
    all_templates = list(Template.expand(feature_sets, combinations=(1,3)))
    template_count = len(all_templates)
    print("Generated {0} templates for transformation-based learning".format(template_count))
    postag(templates=all_templates, incremental_stats=True, template_stats=True)
Beispiel #9
0
def Brill_recursion(nb_iterations):
    """Repeatedly train a Brill tagger, seeding each round with the last one.

    The first round starts from a ``RegexpTagger``; every later round uses
    the tagger produced by the previous round as its initial tagger. After
    each round the new tagger is scored on the module-level
    ``test_sentences``. Training uses the module-level ``train_sentences``.

    The score of the final round is printed (``0.0`` when no round ran).

    :param nb_iterations: number of training rounds.
    :return: list of evaluation scores, one per round.
    """
    # Suffix heuristics taken from http://www.nltk.org/book/ch05.html
    suffix_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    # The regexp tagger seeds the first round. (A CRFTagger trained into
    # 'model.crf.tagger' was an earlier, disabled alternative.)
    seed_tagger = RegexpTagger(suffix_rules)
    scores = []

    for _ in range(nb_iterations):
        # See https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        trainer = BrillTaggerTrainer(seed_tagger, rule_templates, trace=3)
        # The freshly trained tagger becomes the seed of the next round.
        seed_tagger = trainer.train(train_sentences)
        scores.append(seed_tagger.evaluate(test_sentences))

    print(scores[-1] if scores else 0.0)
    return scores
Beispiel #10
0
def demo_multifeature_template():
    """
    Run the tagging demo with one template that combines two features:
    the current word and the tags at offsets -2 and -1.
    """
    word_and_previous_tags = Template(Word([0]), Pos([-2,-1]))
    postag(templates=[word_and_previous_tags])
Beispiel #11
0
# The first 70% of the Brown corpus sentences serve as training material.
brown_sents = brown.sents()
size = int(0.7 * len(brown_sents))

training_data = brown.tagged_sents()[:size]

# Context windows (offsets relative to the current token) used for both the
# tag-based and the word-based templates.
_context_windows = (
    (-1,),
    (1,),
    (-2,),
    (2,),
    (-2, -1),
    (1, 2),
    (-3, -2, -1),
    (1, 2, 3),
)

# Creation order: all Pos windows, the Pos "surrounding" pair, then the same
# sequence again for Word.
templates = []
for _feature_cls in (Pos, Word):
    for _window in _context_windows:
        templates.append(Template(_feature_cls(list(_window))))
    templates.append(Template(_feature_cls([-1]), _feature_cls([1])))
# Drop the loop helpers so only the original module-level names remain.
del _context_windows, _feature_cls, _window

# Baseline tagger: a CRF tagger pointed at an existing model file.
baseline = CRFTagger()
baseline.set_model_file("model.crf.tagger")

# Training the Brill tagger
Beispiel #12
0
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cadinal numbers
    (r'.*', 'NN'),  # nouns (default)
]

# Regular-expression tagger built from the `patterns` rules.
regexp_tagger = nltk.RegexpTagger(patterns)

## Part 2: Transformation-based learning and tagging

# Define rule templates
templates = [
    ## original templates ##
    Template(Pos([-1])),  # previous  POS tag
    Template(Pos([-1]), Word([0])),  # previous POS tag + current word

    ## my new templates ##
    Template(Pos([-2]),
             Pos([-1])),  # previous two POS tags (conjunctive)   (0%)
    Template(Pos([-2, -1])),  # previous two POS tags (disjunctive)   (<2%)
    Template(Word([0]),
             Word([-1])),  # current word + previous word          (0%)
    Template(Pos([-2]),
             Word([0])),  # prev prev POS tag + current word      (<1%)
    Template(Word([-1])),  # previous word                         (<0.1%)
    Template(Pos([-1]),
             Word([-1])),  # previous POS tag + previous word      (0%)
    #Template(Word([0]), Word([1]))      # current word + next word              (<0%)
    Template(Pos([-1]),
             Pos([0])),  # previous POS tag + current POS tag    (0%)
Beispiel #13
0
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'),   # articles
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])

# The backoff tagger is the initial (baseline) tagger handed to the Brill trainer below.
baseline = backoff
# The score is computed but not stored or printed.
# NOTE(review): presumably left over from a doctest where the value was echoed - confirm.
baseline.evaluate(gold_data)

Template._cleartemplates() # clear any templates created in earlier tests
# Two templates: previous tag alone, and previous tag combined with the current word.
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

# One trainer instance is reused for both training runs below.
tt = BrillTaggerTrainer(baseline, templates, trace=3)

# First run: at most 10 rules, no accuracy threshold given.
tagger1 = tt.train(training_data, max_rules=10)
# Slice of the learned rules; the result is discarded here.
tagger1.rules()[1:3]
train_stats = tagger1.train_stats()

tagger1.print_template_statistics(printunused=False)

# Score is computed but discarded (see the note on baseline.evaluate above).
tagger1.evaluate(gold_data)
# Tag the test data while collecting statistics against the gold data.
tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

# Second run: same rule budget, additionally passing min_acc=0.99.
tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)

print(tagger2.evaluate(gold_data))  # doctest: +ELLIPSIS