Exemple #1
0
def template_comparison(nb_iterations):
    """Compare NLTK's built-in Brill template sets on the same data.

    One Brill tagger is trained per template set (nltkdemo18,
    nltkdemo18plus, fntbl37, brill24), each starting from the same
    regular-expression tagger, trained on the module-level
    ``train_sentences`` and scored on the module-level ``test_sentences``.

    :param nb_iterations: accepted for signature parity with the sibling
        comparison functions; it is not used by this function.
    :return: list with one evaluation score per template set, in the order
        listed above.
    """
    # Suffix-based baseline, taken from http://www.nltk.org/book/ch05.html
    regexp_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    baseline = RegexpTagger(regexp_rules)

    # All template sets are instantiated up front, before any training.
    template_sets = [
        nltk.tag.brill.nltkdemo18(),
        nltk.tag.brill.nltkdemo18plus(),
        nltk.tag.brill.fntbl37(),
        nltk.tag.brill.brill24()
    ]

    def _score(template_set):
        # Train on one template set and return its evaluation score.
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        trainer = BrillTaggerTrainer(baseline, template_set, trace=3)
        trained = trainer.train(train_sentences)
        return trained.evaluate(test_sentences)

    return [_score(template_set) for template_set in template_sets]
Exemple #2
0
def meta_comparison(nb_iterations):
    """Measure how the Brill tagger's score changes with the rule budget.

    For every ``i`` in ``1 .. nb_iterations - 1`` a Brill tagger is trained
    with ``max_rules = i * 50`` on the module-level ``train_sentences`` and
    scored on the module-level ``test_sentences``. Every run starts from the
    same regular-expression tagger and the same two templates.

    :param nb_iterations: exclusive upper bound of the multiplier ``i``;
        ``nb_iterations - 1`` taggers are trained.
    :return: list of evaluation scores, ordered by increasing rule budget.
    """
    # Suffix-based baseline, taken from http://www.nltk.org/book/ch05.html
    regexp_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    baseline = RegexpTagger(regexp_rules)

    scores = []
    # Budgets 50, 100, ..., (nb_iterations - 1) * 50.
    for rule_budget in range(50, nb_iterations * 50, 50):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        trainer = BrillTaggerTrainer(baseline, rule_templates, trace=3)
        trained = trainer.train(train_sentences, max_rules=rule_budget)
        scores.append(trained.evaluate(test_sentences))

    return scores
Exemple #3
0
def train(train_sentences):
    """Train a Brill tagger stacked on a unigram tagger.

    The initial tagger is a ``UnigramTagger`` trained on ``train_sentences``
    that backs off to a ``DefaultTagger`` assigning the tag ``'NC'``. A
    ``BrillTaggerTrainer`` then learns at most 1000 rules from five
    templates. Progress messages are printed to stdout.

    :param train_sentences: tagged sentences used for both the unigram
        tagger and the Brill rule learning.
    :return: the tagger returned by ``BrillTaggerTrainer.train``.
    """
    # print(...) with one string argument prints the same text on Python 2
    # and Python 3, whereas the bare print statement is a SyntaxError on 3.
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features to be used for the brill tagger
    # relatively to the word position.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")

    return tagger
Exemple #4
0
def Brill_recursion(nb_iterations):
    """Repeatedly stack Brill taggers, each one trained on top of the last.

    Round 0 trains a Brill tagger whose initial tagger is a
    regular-expression tagger; every later round uses the tagger produced by
    the previous round as its initial tagger. Training uses the module-level
    ``train_sentences`` and scoring uses the module-level ``test_sentences``.
    The score of the last round (``0.0`` when no round ran) is printed.

    :param nb_iterations: number of training rounds.
    :return: list of evaluation scores, one per round.
    """
    # Suffix-based baseline, taken from http://www.nltk.org/book/ch05.html
    regexp_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    # The tagger each round starts from; replaced after every round.
    stage_tagger = RegexpTagger(regexp_rules)
    last_score = 0.0
    scores = []

    for _ in range(nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        rule_templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        trainer = BrillTaggerTrainer(stage_tagger, rule_templates, trace=3)
        stage_tagger = trainer.train(train_sentences)
        last_score = stage_tagger.evaluate(test_sentences)
        scores.append(last_score)

    print(last_score)
    return scores
Exemple #5
0
def create_tagger(train_sents):
    """Build an n-gram backoff tagger, optionally refined by Brill rules.

    A trigram tagger backs off to a bigram tagger, then a unigram tagger,
    then a default tagger assigning ``'S'``. When the module-level
    ``brill_value`` is exactly ``True``, a Brill tagger with at most 20 rules
    is trained on top of that chain and returned instead.

    :param train_sents: tagged sentences used to train every stage.
    :return: the trigram backoff tagger, or the Brill tagger built on it.
    """
    # Build the backoff chain from the most general to the most specific.
    ngram_tagger = nltk.DefaultTagger('S')
    for tagger_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
        ngram_tagger = tagger_cls(train_sents, backoff=ngram_tagger)

    # Identity test on purpose: only the literal True enables Brill training.
    if brill_value is not True:
        return ngram_tagger

    Template._cleartemplates()

    # Reduced template set: current word paired with one neighbouring word.
    rule_templates = [
        Template(Word([0]), Word([-1])),
        Template(Word([0]), Word([1])),
    ]

    trainer = BrillTaggerTrainer(ngram_tagger, rule_templates, trace=3)
    return trainer.train(train_sents, max_rules=20, min_score=0, min_acc=None)
    def __init__(self, tagged_sents, anonProperNouns=False, initialTagger=None, max_rules=250, min_score=2,
                 min_acc=None, template='fntbl37'):
        """
        Construct a new MTEBrillTagger and train it with the sentences from tagged_sents.

        :param tagged_sents: Tagged sentences to train the tagger.
        :type tagged_sents: [[(word:str, tag:str)]]
        :param anonProperNouns: Set 'True' to replace every token whose tag starts with '#Np' by the
            pair ('anon', '#Np'). Currently only for MTE tags.
        :type anonProperNouns: bool
        :param initialTagger: If None or unset, use a UnigramTagger trained on the (possibly anonymised)
            sentences as initial tagger; use the specified one else.
        :type initialTagger: Tagger
        :param max_rules: tagger generates at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :param template: template set to use to train Brill Tagger. Either a list of templates, or one of
            the names 'fntbl37', 'brill24', 'nltkdemo18', 'nltkdemo18plus'. None is treated like 'fntbl37'.
            The name 'baseline' skips Brill training, so only the initial tagger is kept.
        :type template: str or list or None
        :raises ValueError: if template is an unknown name or is neither None, a list nor a str.
        """

        ANON = "anon"

        if anonProperNouns:
            # Collapse every proper noun onto one placeholder token and tag.
            self._tagged_sents = [
                [(ANON, "#Np") if tag.startswith("#Np") else (w, tag) for (w, tag) in s]
                for s in tagged_sents
            ]
        else:
            self._tagged_sents = tagged_sents

        Template._cleartemplates()

        # If 'template' parameter is 'None' default to fntbl37 template set
        if template is None:
            templates = fntbl37()

        # A list (or list subclass) is used directly as the template set
        elif isinstance(template, list):
            templates = template

        # A string (or str subclass) names one of the nltk template sets
        elif isinstance(template, str):
            if template == "fntbl37":
                templates = fntbl37()
            elif template == "brill24":
                templates = brill24()
            elif template == "nltkdemo18":
                templates = nltkdemo18()
            elif template == "nltkdemo18plus":
                templates = nltkdemo18plus()
            elif template == "baseline":
                # No templates: keep the initial tagger without Brill rules
                templates = None
            else:
                raise ValueError("Method returning templates not found!")

        # If it is any other type, raise error
        else:
            raise ValueError(
                "Please specify the name of a function that returns a list of templates or a list of templates directly!")

        if initialTagger is None:
            self._tagger = UnigramTagger(self._tagged_sents)
        else:
            self._tagger = initialTagger

        if templates is not None:
            trainer = BrillTaggerTrainer(self._tagger, templates, trace=3)
            self._tagger = trainer.train(self._tagged_sents, max_rules=max_rules, min_score=min_score,
                                         min_acc=min_acc)
Exemple #7
0
import pickle
import nltk.tag
from nltk.corpus import brown
from nltk.tag import CRFTagger
from nltk.tbl.template import Template
from nltk.tag.brill import Pos, Word
from nltk.tag import BrillTaggerTrainer

# Prepare the training data for the Brill tagger: the leading 70% of the
# Brown corpus' tagged sentences.
# NOTE(review): the original comment also mentioned preparing a baseline
# CRFTagger; CRFTagger is imported above but is not constructed in these lines.
brown_sents = brown.sents()
# Cut point of the 70% split, counted in sentences.
size = int(len(brown_sents) * 0.7)

training_data = brown.tagged_sents()[:size]

templates = [
    Template(Pos([-1])),
    Template(Pos([1])),
    Template(Pos([-2])),
    Template(Pos([2])),
    Template(Pos([-2, -1])),
    Template(Pos([1, 2])),
    Template(Pos([-3, -2, -1])),
    Template(Pos([1, 2, 3])),
    Template(Pos([-1]), Pos([1])),
    Template(Word([-1])),
    Template(Word([1])),
    Template(Word([-2])),
    Template(Word([2])),
    Template(Word([-2, -1])),
    Template(Word([1, 2])),
    Template(Word([-3, -2, -1])),
Exemple #8
0
# Regular-expression tagger used as the initial tagger for Brill training.
# The final catch-all pattern tags every remaining token as 'NN'.
backoff = RegexpTagger([
# NOTE(review): the '.' before the fractional digits is unescaped, so it
# matches any single character (e.g. '1,000' or '1x5'), not only '.'.
(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'),   # articles
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])

# 'baseline' is a second name for the same tagger object, not a copy.
baseline = backoff
# The evaluation result is computed but discarded here.
baseline.evaluate(gold_data)

Template._cleartemplates() #clear any templates created in earlier tests
# Two templates: previous tag alone, and previous tag plus current word.
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

# One trainer is created and reused for both training runs below.
tt = BrillTaggerTrainer(baseline, templates, trace=3)

# First run: at most 10 rules, no min_acc argument.
tagger1 = tt.train(training_data, max_rules=10)
# The slice of learned rules is computed but discarded here.
tagger1.rules()[1:3]
train_stats = tagger1.train_stats()

tagger1.print_template_statistics(printunused=False)

# The evaluation result is again discarded.
tagger1.evaluate(gold_data)
tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

# Second run with the same trainer, adding min_acc=0.99.
tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
Exemple #9
0
# Shuffle the blocks in place, then use each of the first 10 blocks in turn as
# the held-out test block while training on the items of the other blocks.
# NOTE(review): the loop variable 'iter' shadows the built-in iter(); the
# fixed range(10) presumably matches a 10-way split of 'bloques' - confirm.
random.shuffle(bloques)
for iter in range(10):
    test = bloques[iter]
    train = []
    # Blocks are excluded by value comparison (!=), not by index, so any block
    # that compares equal to the test block is left out as well.
    for element in bloques:
        if element != test:
            for item in element:
                train.append(item)

    # Tagger training

    # Brill tagger
    # The unigram baseline and the Brill rules are trained on the same data.
    baseline_data = train
    baseline = UnigramTagger(baseline_data)
    #baseline = hmm.HiddenMarkovModelTagger.train(baseline_data)
    # Two templates: previous tag alone, and previous tag plus current word.
    templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
    tagger_brill_tr = brill_trainer.BrillTaggerTrainer(initial_tagger=baseline,
                                                       templates=templates,
                                                       trace=3)
    tagger_brill = tagger_brill_tr.train(train, max_rules=10)
    # The two triple-quoted strings below are inert string expressions that
    # hold disabled CRF and perceptron tagger code.
    '''
    # CRF tagger
    tagger_crf = crf.CRFTagger()
    tagger_crf.train(train, "model")
    print("CRF Fold", iter)
    '''
    '''
    # Perceptron tagger
    tagger_perceptron = perceptron.PerceptronTagger(load = False)
    tagger_perceptron.train(train)
    '''
Exemple #10
0
    def __init__(self, args, model_name, load_model=False):
        """Set up the Brill trainer on top of a previously saved HMM tagger.

        The parent initialiser always runs first. When ``load_model`` is
        false, an ``HMM`` wrapper named ``"hmm"`` is created with
        ``load_model=True`` and its ``model`` becomes the initial tagger of a
        ``BrillTaggerTrainer`` stored in ``self.model``.

        :param args: forwarded to the parent initialiser and to ``HMM``.
        :param model_name: forwarded to the parent initialiser.
        :param load_model: forwarded to the parent initialiser; when true, no
            trainer is built here.
        :raises FileNotFoundError: if the HMM wrapper reports that no saved
            model exists.
        """
        super().__init__(args, model_name, load_model)
        if not load_model:
            base_tagger = HMM(args, "hmm", load_model=True)
            if not base_tagger.saved_model_exists():
                raise FileNotFoundError(f"Brill base tagger '{base_tagger.model_name}' missing!")

            # Context windows shared by the tag-based and word-based templates.
            windows = (
                [-1], [1], [-2], [2],
                [-2, -1], [1, 2],
                [-3, -2, -1], [1, 2, 3],
            )
            features = []
            # Tag (Pos) templates are created first, then word templates, each
            # group ending with the "previous and next" pair template.
            for feature_cls in (Pos, Word):
                for window in windows:
                    features.append(Template(feature_cls(window)))
                features.append(Template(feature_cls([-1]), feature_cls([1])))
            self.model = nltk.BrillTaggerTrainer(base_tagger.model, features)