Example #1
def meta_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)

    evaluations = []

    for i in range(1, nb_iterations):
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        template = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        tt = BrillTaggerTrainer(init_tagger, template, trace=3)
        currentTagger = tt.train(train_sentences, max_rules=i * 50)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)

    return evaluations
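Note: meta_comparison (like most snippets on this page) refers to module-level train_sentences and test_sentences and to NLTK imports that are not shown. A minimal sketch of that setup, assuming the Penn Treebank sample shipped with NLTK; the 90/10 split is an illustration, not part of the original:

import nltk
from nltk.corpus import treebank
from nltk.tag import RegexpTagger, BrillTaggerTrainer
from nltk.tag.brill import Pos, Word
from nltk.tbl import Template

# Hypothetical data preparation: a simple 90/10 train/test split.
nltk.download('treebank', quiet=True)
tagged = list(treebank.tagged_sents())
cutoff = int(0.9 * len(tagged))
train_sentences, test_sentences = tagged[:cutoff], tagged[cutoff:]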
Example #2
def train(train_sentences):
    print("- Default Tagger")
    default_tagger = DefaultTagger('NC')

    print("- Unigram Tagger")
    unigram_tagger = UnigramTagger(train_sentences, backoff=default_tagger)

    print("- Templates")
    # These templates define the features the Brill tagger may condition on,
    # relative to the position of the word being retagged.
    Template._cleartemplates()
    templates = [
        Template(Pos([-1])),
        Template(Pos([-1]), Word([0])),
        Template(Pos([-2])),
        Template(Pos([-2]), Word([0])),
        Template(Pos([1])),
    ]
    print("- Brill Tagger")
    tt = BrillTaggerTrainer(unigram_tagger, templates, trace=1)
    tagger = tt.train(train_sentences, max_rules=1000)

    print("- Done.")

    return tagger
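A usage sketch for train() above. The 'NC' default tag suggests a non-Penn tagset, so the CESS-ESP corpus is used here purely as an illustration; the corpus choice, slice size, and output path are assumptions:

import pickle
from nltk.corpus import cess_esp  # requires nltk.download('cess_esp')

# Illustrative call (names and sizes are assumptions, not from the original).
train_sentences = list(cess_esp.tagged_sents())[:3000]
tagger = train(train_sentences)

# Persist the trained tagger for later reuse.
with open('brill_tagger.pickle', 'wb') as f:
    pickle.dump(tagger, f)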
Example #3
def template_comparison(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    init_tagger = RegexpTagger(patterns)
    templates = [
        nltk.tag.brill.nltkdemo18(),
        nltk.tag.brill.nltkdemo18plus(),
        nltk.tag.brill.fntbl37(),
        nltk.tag.brill.brill24()
    ]
    evaluations = []

    for t in templates:
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()

        tt = BrillTaggerTrainer(init_tagger, t, trace=3)
        currentTagger = tt.train(train_sentences)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)

    return evaluations
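template_comparison iterates over NLTK's prebuilt template sets. To see what each set contains before running the (fairly slow) comparison, NLTK ships a small helper:

from nltk.tag.brill import describe_template_sets

# Prints a short description of each built-in template set
# (nltkdemo18, nltkdemo18plus, fntbl37, brill24).
describe_template_sets()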
Example #4
    def test_brill_tagger(self):
        trainer = BrillTaggerTrainer(self.default_tagger, nltkdemo18(),
                                     deterministic=True)
        tagger = trainer.train(self.corpus, max_rules=30)

        encoded = self.encoder.encode(tagger)
        decoded = self.decoder.decode(encoded)

        self.assertEqual(repr(tagger._initial_tagger),
                         repr(decoded._initial_tagger))
        self.assertEqual(tagger._rules, decoded._rules)
        self.assertEqual(tagger._training_stats, decoded._training_stats)
Example #5
def gen_ap_regex():
    print("============================================================")
    print("Generate Regex from learned Brill tagging rules.")
    # Parameters:
    training = my_corpus.tagged_sents()
    templates = nltk.tag.brill.fntbl37()
    n_rules = 30

    # Taggers:
    print("Initializing ...")
    regex_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])
    u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
    b_gram_tag = nltk.BigramTagger(training, backoff=u_gram_tag)
    t_gram_tag = nltk.TrigramTagger(training, backoff=b_gram_tag)

    print("Training brill tagger ...")
    tt = BrillTaggerTrainer(t_gram_tag, templates, trace=3)
    brill_tagger = tt.train(training, max_rules=n_rules)
    print("Training finished.")

    print("Template size:", len(templates))
    range_l, range_r = get_template_range(templates)
    print("Template range:", range_l, range_r)
    print("Total rules:", len(brill_tagger.rules()))
    print("Generating Regex for the AP ...")

    for rule in brill_tagger.rules():
        regex, report_tag = rule_to_regex(rule, range_l, range_r)
        print(report_tag, ":", regex)

    print("Done.")
Example #6
def create_tagger(train_sents):

    t0 = nltk.DefaultTagger('S')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)

    if brill_value:
        Template._cleartemplates()

        templates = [  # reduced template set
            Template(Word([0]), Word([-1])),
            Template(Word([0]), Word([1])),
        ]

        t4 = BrillTaggerTrainer(t3, templates, trace=3)

        tagger = t4.train(train_sents, max_rules=20, min_score=0, min_acc=None)
    else:
        tagger = t3

    return tagger
Example #7
def Brill_recursion(nb_iterations):
    # Taken from http://www.nltk.org/book/ch05.html
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]

    # init_tagger = CRFTagger(feature_func=feature_func)
    # init_tagger.train(train_sentences, 'model.crf.tagger')
    init_tagger = RegexpTagger(patterns)
    currentTagger = None
    current_evaluation = 0.0
    evaluations = []

    for i in range(nb_iterations):
        # BrillTaggerTrainer learns the rules; train() returns the BrillTagger.
        # https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.brill_trainer
        Template._cleartemplates()
        templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

        # The first pass starts from the regexp baseline; each later pass
        # retrains on top of the previously trained Brill tagger.
        base_tagger = init_tagger if i == 0 else currentTagger
        tt = BrillTaggerTrainer(base_tagger, templates, trace=3)
        currentTagger = tt.train(train_sentences)
        current_evaluation = currentTagger.evaluate(test_sentences)
        evaluations.append(current_evaluation)

    print(current_evaluation)
    return evaluations
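meta_comparison, template_comparison, and Brill_recursion all return a list of test-set accuracies. A sketch of plotting such a curve, assuming matplotlib is available (the labels are illustrative):

import matplotlib.pyplot as plt

# Any of the comparison functions above works here.
evaluations = Brill_recursion(10)
plt.plot(range(1, len(evaluations) + 1), evaluations, marker='o')
plt.xlabel('iteration')
plt.ylabel('accuracy on test_sentences')
plt.show()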
Example #8
def test_cross_validation(btagger, tmpl, nrule):
    print("============================================================")
    print("Cross Validation on corpus.")

    # Parameters:
    if tmpl == 'brill24':
        templates = nltk.tag.brill.brill24()
    elif tmpl == 'fntbl37':
        templates = nltk.tag.brill.fntbl37()
    else:
        assert False
    n_rules = nrule
    fold = 5
    ap_freq = 133000000.0  # AP clock rate (133 MHz): input bytes processed per second

    accuracy1 = []
    accuracy2 = []
    brill_time = []
    ap_time = []
    regex_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])
    for k in range(fold):
        # The split is identical for both supported corpora:
        # sentence i belongs to the held-out fold when i % fold == k.
        if my_corpus in (treebank, brown):
            training = [
                x for i, x in enumerate(my_corpus.tagged_sents())
                if i % fold != k
            ]
            validation = [
                x for i, x in enumerate(my_corpus.tagged_sents())
                if i % fold == k
            ]
            testing = [
                x for i, x in enumerate(my_corpus.sents()) if i % fold == k
            ]
            # Brown-only variant restricted to the 'news' category:
            #training = [x for i, x in enumerate(my_corpus.tagged_sents(categories='news')) if i % fold != k]
            #validation = [x for i, x in enumerate(my_corpus.tagged_sents(categories='news')) if i % fold == k]
            #testing = [x for i, x in enumerate(my_corpus.sents(categories='news')) if i % fold == k]
        else:
            assert False

        print("\n\nFold", k, "Initializing ...", tmpl, ",", btagger)
        if btagger == 'r':
            baseline = regex_tagger
        elif btagger == 'u':
            u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
            baseline = u_gram_tag
        elif btagger == 'b':
            u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
            b_gram_tag = nltk.BigramTagger(training, backoff=u_gram_tag)
            baseline = b_gram_tag
        elif btagger == 't':
            u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
            b_gram_tag = nltk.BigramTagger(training, backoff=u_gram_tag)
            t_gram_tag = nltk.TrigramTagger(training, backoff=b_gram_tag)
            baseline = t_gram_tag
        elif btagger == 's':  # stanford
            stagger = nltk.tag.stanford.StanfordPOSTagger(
                stanford_path + 'models/english-bidirectional-distsim.tagger',
                stanford_path + 'stanford-postagger.jar')
            baseline = stagger
        else:
            assert False

        print("Evaluating testing accuracy ...")
        begin = time.time()
        baseline_accuracy = baseline.evaluate(validation)
        end = time.time()
        print("Baseline Testing Time =", end - begin, "seconds")
        t1 = end - begin
        print("Baseline Accuracy =", baseline_accuracy)

        tt = BrillTaggerTrainer(baseline, templates, trace=3)
        print("Training Brill tagger ...")
        begin = time.time()
        brill_tagger = tt.train(training, max_rules=n_rules)
        end = time.time()
        print("Brill Tagger Training Time =", end - begin, "seconds")
        print("Found rules:", len(brill_tagger.rules()))

        print("Testing ...")
        begin = time.time()
        brill_accuracy = brill_tagger.evaluate(validation)
        end = time.time()
        print("Brill Tagger Test Time =", end - begin, "seconds")
        t2 = end - begin
        print("Brill Tagger Accuracy =", brill_accuracy)
        print("Accuracy improvement:", brill_accuracy - baseline_accuracy)

        accuracy1.append(baseline_accuracy)
        accuracy2.append(brill_accuracy)
        brill_time.append(end - begin)

        # analyze the AP running time
        print("Generating AP input string ...")
        baseline_tagged = baseline.tag_sents(testing)
        ap_input = ''
        for s in baseline_tagged:
            for w, t in s:
                ap_input += w + '/' + t + ' '
            ap_input += "./-NONE- ./-NONE- ./-NONE- "
        total_length = len(ap_input)
        print("AP input bytes:", total_length, "(", total_length / ap_freq, "seconds)")
        print("Speedup:", (t2 - t1) / (total_length / ap_freq))
        #print(ap_input)
        ap_time.append(total_length / ap_freq)

    print("\n\nCross Validation Results (baseline):")
    print(accuracy1)
    print("\nCross Validation Results (brill):")
    print(accuracy2)
    print("Brill running time:")
    print(brill_time)
    print("AP running time:")
    print(ap_time)
    print("Average Accuracy (baseline, brill):",
          sum(accuracy1) / len(accuracy1), sum(accuracy2) / len(accuracy2))

    avg_brill_time = sum(brill_time) / float(len(brill_time))
    avg_ap_time = sum(ap_time) / float(len(ap_time))
    print("Average Brill Time:", avg_brill_time)
    print("Average AP Time:", avg_ap_time)
    print("Speedup (ap over brill):", avg_brill_time / avg_ap_time)
Example #9
def gen_tagging_rules(nrule, my_corpus):
    print("============================================================")

    # Parameters:
    templates = nltk.tag.brill.fntbl37()
    n_rules = nrule
    fold = 5
    ap_freq = 133000000.0  # AP clock rate (133 MHz): input bytes processed per second
    do_evaluate = False
    input_string_file = "input.txt"
    regex_file = "regex.txt"
    out1 = open(input_string_file, "w+")
    out2 = open(regex_file, "w+")

    # Backoff tagger for the unigram tagger
    regex_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])

    # k-fold cross validation
    for k in range(fold):
        # Just do 1 fold here for generating regex
        if k != 0:
            continue

        print("\n== Preparing training data ...")
        # The split is identical for both supported corpora:
        # sentence i belongs to the held-out fold when i % fold == k.
        if my_corpus in (treebank, brown):
            training = [
                x for i, x in enumerate(my_corpus.tagged_sents())
                if i % fold != k
            ]
            validation = [
                x for i, x in enumerate(my_corpus.tagged_sents())
                if i % fold == k
            ]
            testing = [
                x for i, x in enumerate(my_corpus.sents()) if i % fold == k
            ]
            # Brown-only variant restricted to the 'news' category:
            #training = [x for i, x in enumerate(my_corpus.tagged_sents(categories='news')) if i % fold != k]
            #validation = [x for i, x in enumerate(my_corpus.tagged_sents(categories='news')) if i % fold == k]
            #testing = [x for i, x in enumerate(my_corpus.sents(categories='news')) if i % fold == k]
        else:
            assert False

        print("\n== Initializing the baseline tagger ...")
        u_gram_tag = nltk.UnigramTagger(training, backoff=regex_tagger)
        baseline = u_gram_tag

        if do_evaluate:
            print("\n== Evaluating testing accuracy of the baseline tagger ...")
            begin = time.time()
            baseline_accuracy = baseline.evaluate(validation)
            end = time.time()
            t_baseline = end - begin
            print("Baseline Testing Time =", t_baseline, "seconds")
            print("Baseline Accuracy =", baseline_accuracy)

        print("\n== Training the Brill tagger ...")
        tt = BrillTaggerTrainer(baseline, templates, trace=3)
        begin = time.time()
        brill_tagger = tt.train(training, max_rules=n_rules)
        end = time.time()
        t_brilltrain = end - begin
        print("Brill Tagger Training Time =", t_brilltrain, "seconds")
        print("Learned rules:", len(brill_tagger.rules()))

        if do_evaluate:
            print("\n== Testing the Brill Tagger ...")
            begin = time.time()
            brill_accuracy = brill_tagger.evaluate(validation)
            end = time.time()
            t_brilltest = end - begin
            print("Brill Tagger Testing Time =", t_brilltest, "seconds")
            print("Brill Tagger Accuracy =", brill_accuracy)
            print("Accuracy improvement:", brill_accuracy - baseline_accuracy)

        # analyze the AP running time
        print("\n== Generating AP input string ...")
        baseline_tagged = baseline.tag_sents(testing)
        ap_input = ' '
        for s in baseline_tagged:
            for w, t in s:
                ap_input += w + '/' + t + ' '
            ap_input += "/ / / "
        total_length = len(ap_input)
        print("AP input bytes:", total_length, "(", total_length / ap_freq, "seconds)")
        print("Write the input string to", input_string_file, "...")
        out1.write(ap_input)
        out1.write('\n')
        print("The input string is written to", input_string_file)

        print("\n== Generating Regex for the AP ...")
        print("Template size:", len(templates))
        range_l, range_r = get_template_range(templates)
        print("Template range:", range_l, range_r)
        print("Total rules:", len(brill_tagger.rules()))

        for rule in brill_tagger.rules():
            regex, report_tag = rule_to_regex(rule, range_l, range_r)
            print(report_tag, ":", regex)
            #out2.write(report_tag + " : " + regex + "\n")
            out2.write(regex + "\n")

        print("\nDone.")
        print("\n*************************************")
        print("The regexes are written to", regex_file)
        print("*************************************")
        out1.close()
        out2.close()
Example #11
class MTEBrillTagger:
    '''
    This is a BrillTagger for text annotated using the MTE tag set.
    It should not be used for other tag sets, as it works with MTE tags internally.
    '''

    def __init__(self, tagged_sents, anonProperNouns=False, initialTagger=None, max_rules=250, min_score=2,
                 min_acc=None, template='fntbl37'):
        '''
        Construct a new MTEBrillTagger and train it with the sentences from tagged_sents.

        :param tagged_sents: Tagged sentences to train the tagger.
        :type tagged_sents: [[(word:str, tag:str)]]
        :param anonProperNouns: Set 'True' to replace every proper noun with an anonymous string. Currently only for MTE tags.
        :type anonProperNouns: bool
        :param initialTagger: If None or unset, a UnigramTagger trained on tagged_sents is used as the initial tagger; otherwise the given tagger is used directly ('self._tagger = initialTagger')
        :type initialTagger: Tagger
        :param max_rules: tagger generates at most max_rules rules
        :type max_rules: int
        :param min_score: stop training when no rules better than min_score can be found
        :type min_score: int
        :param min_acc: discard any rule with lower accuracy than min_acc
        :type min_acc: float or None
        :param template: template set to use to train Brill Tagger. Can be the name of a function from nltk.tag.brill that returns a template set or a list of templates.
        :type template: str or list
 
        '''

        self._tagged_sents = []
        ANON = "anon"

        if anonProperNouns:
            for s in tagged_sents:
                tmp = []
                for (w, tag) in s:
                    if tag.startswith("#Np"):
                        tmp.append((ANON, "#Np"))
                    else:
                        tmp.append((w, tag))
                self._tagged_sents.append(tmp)
        else:
            self._tagged_sents = tagged_sents

        Template._cleartemplates()

        # If 'template' parameter is 'None' default to fntbl37 template set
        if template is None:
            templates = fntbl37()

        # Check if 'template' parameter is a list. If it is try to use it directly
        elif type(template) is list:
            templates = template

        # Check if 'template' is a string. If it is try to get the template set from nltk
        elif type(template) is str:
            if template == "fntbl37":
                templates = fntbl37()
            elif template == "brill24":
                templates = brill24()
            elif template == "nltkdemo18":
                templates = nltkdemo18()
            elif template == "nltkdemo18plus":
                templates = nltkdemo18plus()
            elif template == "baseline":
                templates = None
            else:
                raise ValueError("Method returning templates not found!")

        # If it is any other type, raise error
        else:
            raise ValueError(
                "Please specify the name of a function that returns a list of templates or a list of templates directly!")

        if initialTagger is None:
            self._tagger = UnigramTagger(self._tagged_sents)
        else:
            self._tagger = initialTagger

        if templates is not None:
            self._tagger = BrillTaggerTrainer(self._tagger, templates, trace=3)
            self._tagger = self._tagger.train(self._tagged_sents, max_rules=max_rules, min_score=min_score,
                                              min_acc=min_acc)

    def evaluate(self, test_sents):
        '''
        Gives the accuracy of the tagger on the given test sentences.
        *Use the metrics method for more output!*
        :param test_sents: The sentences to test the tagger with
        :type test_sents: [[(str, str)]]
        '''
        return self._tagger.evaluate(test_sents)

    def metrics(self, gold, printout=True, confusion_matrix=False, oov=True):
        '''
        A more sophisticated evaluation method that reports additional metrics.

        :param gold: The sentences to use for testing
        :type gold: [[(str, str)]]
        :param printout: Should I print the results or just return them?
        :type printout: bool
        :param confusion_matrix: Should I create a Confusion Matrix?
        :type confusion_matrix: bool
        :param oov: Should the out of vocabulary words be calculated
        :type oov: bool
        :return: (acc, prec, rec, fsc, aov, cfm): accuracy, precision, recall, f-score, and the percentage of out-of-vocabulary words; cfm is the ConfusionMatrix if requested, else None
        :rtype: (double, double, double, double, double, ConfusionMatrix or None)
        '''

        tagger_out = self._tagger.tag_sents(untag(sent) for sent in gold)
        gold_tokens = sum(gold, [])
        test_tokens = sum(tagger_out, [])
        gold_tokens_set = set(gold_tokens)
        test_tokens_set = set(test_tokens)

        gold_tags = [t for (_, t) in gold_tokens]
        test_tags = [t for (_, t) in test_tokens]

        # calculate the percentage of out-of-vocabulary words
        if oov:
            vocab = {word for sent in self._tagged_sents for (word, _) in sent}
            aov = sum(1 for (w, _) in gold_tokens if w not in vocab)
            aov = (aov * 100.0) / len(gold_tokens)
        else:
            aov = -1

        acc = accuracy(gold_tokens, test_tokens)
        prc = precision(gold_tokens_set, test_tokens_set)
        rec = recall(gold_tokens_set, test_tokens_set)
        fms = f_measure(gold_tokens_set, test_tokens_set)
        cfm = None

        if confusion_matrix:
            cfm = ConfusionMatrix(gold_tags, test_tags)

        if printout:
            print("accuracy:          " + str(acc))
            print("precision:         " + str(prc))
            print("recall:            " + str(rec))
            print("f-score:           " + str(fms))
            print("out of vocabulary: " + str(aov) + " %")
            if confusion_matrix:
                print(cfm)

        return acc, prc, rec, fms, aov, cfm
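A minimal usage sketch for MTEBrillTagger. The corpus variable mte_tagged_sents and the split point are hypothetical; any list of MTE-tagged sentences in [[(word, tag)]] form will do:

# mte_tagged_sents is assumed to exist: [[(word, tag), ...], ...] with MTE tags.
train_sents = mte_tagged_sents[:8000]
test_sents = mte_tagged_sents[8000:]

mte_tagger = MTEBrillTagger(train_sents, max_rules=250, template='fntbl37')
acc, prec, rec, fsc, oov_pct, _ = mte_tagger.metrics(test_sents, printout=True)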
Example #12
templates = [
    brill.Template(brill.Pos([1, 1])),
    brill.Template(brill.Pos([2, 2])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([1, 3])),
    brill.Template(brill.Word([-1, -1]))
]

trainer_initial_pos = BrillTaggerTrainer(initial_tagger=custom_pos_tagger,
                                         templates=templates,
                                         trace=3,
                                         deterministic=True)
brill_tagger = trainer_initial_pos.train(train_data, max_rules=10)

train_sentences = [
    ('Total runs scored by SC Ganguly in match 5?', 'runs'),
    ('SC Ganguly score in match 1?', 'runs'),
    ('how many runs did Ganguly score in match 2?', 'runs'),
    ("Sachin's score in 4th match?", 'runs'),
    ('how much did McCullum scored in match 3?', 'runs'),
    ('how much did McCullum scored in match 4?', 'runs'),
    ('McCullum runs in match 3?', 'runs'),
Example #13
backoff = RegexpTagger([
(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'),   # articles
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])

baseline = backoff
baseline.evaluate(gold_data)

Template._cleartemplates() #clear any templates created in earlier tests
templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

tt = BrillTaggerTrainer(baseline, templates, trace=3)

tagger1 = tt.train(training_data, max_rules=10)
tagger1.rules()[1:3]
train_stats = tagger1.train_stats()

tagger1.print_template_statistics(printunused=False)

tagger1.evaluate(gold_data)
tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)

tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)

print(tagger2.evaluate(gold_data))  # doctest: +ELLIPSIS
tagger2.rules()[2:4]
Example #14
def entrenar_bill(initial_tagger, tagger_name):  # "entrenar" = "train"
    brill_tagger = BrillTaggerTrainer(initial_tagger=initial_tagger, templates=brill.brill24())
    tagger1 = brill_tagger.train(train_reducido[:1000])
    evaluacion = tagger1.evaluate(test_reducido[:1000])
    xlabels.append("Brill Tagger " + tagger_name)
    accuracys.append(evaluacion)
Example #15
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration
    :param templates: templates to be used in training
    :type templates: list of Template

    :param tagged_data: the tagged corpus to draw training and testing data from
    :type tagged_data: list of tagged sentences

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy a rule must have in order to be applied
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, the training data is reused for both the
    baseline and the rule learner. This is fast and fine for a demo, but is likely to
    generalize worse on unseen data. It also cannot be sensibly used for learning curves
    on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))


    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w', encoding='utf-8') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            f.write('\n'.join(error_list(gold_data, taggedtest)) + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Example #16
templates = [
    Template(Pos([-1])),
    Template(Pos([1])),
    Template(Pos([-2])),
    Template(Pos([2])),
    Template(Pos([-2, -1])),
    Template(Pos([1, 2])),
    Template(Pos([-3, -2, -1])),
    Template(Pos([1, 2, 3])),
    Template(Pos([-1]), Pos([1])),
    Template(Word([-1])),
    Template(Word([1])),
    Template(Word([-2])),
    Template(Word([2])),
    Template(Word([-2, -1])),
    Template(Word([1, 2])),
    Template(Word([-3, -2, -1])),
    Template(Word([1, 2, 3])),
    Template(Word([-1]), Word([1]))
]

baseline = CRFTagger()

baseline.set_model_file("model.crf.tagger")

# Train a Brill tagger on top of the CRF baseline
tt = BrillTaggerTrainer(baseline, templates, trace=3)
taggerFinal = tt.train(training_data, max_rules=10)

pickle.dump(taggerFinal, open("BrillFinal.p", "wb"))
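A small follow-up sketch applying the trained tagger from above to tokenized input (the tokens are illustrative):

tokens = ["This", "is", "an", "example", "sentence", "."]
print(taggerFinal.tag(tokens))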
Example #17
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration
    :param templates: templates to be used in training
    :type templates: list of Template

    :param tagged_data: the tagged corpus to draw training and testing data from
    :type tagged_data: list of tagged sentences

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum accuracy a rule must have in order to be applied
    :type min_acc: C{float}

    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if False, the training data is reused for both the
    baseline and the rule learner. This is fast and fine for a demo, but is likely to
    generalize worse on unseen data. It also cannot be sensibly used for learning curves
    on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
       _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'wb') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "rb") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(),1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))


    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            for e in error_list(gold_data, taggedtest):
                f.write(e+'\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'wb') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "rb") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger_reloaded.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
Example #18
word_patterns = [
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*ould$', 'MD'),
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*ness$', 'NN'),
    (r'.*ment$', 'NN'),
    (r'.*ful$', 'JJ'),
    (r'.*ious$', 'JJ'),
    (r'.*ble$', 'JJ'),
    (r'.*ic$', 'JJ'),
    (r'.*ive$', 'JJ'),
    (r'.*est$', 'JJ'),
    (r'^a$', 'PREP'),
]

print("Initializing the train")

raubt_tagger = backoff_tagger(treebank_train, [nltk.tag.AffixTagger,
    nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
    backoff=nltk.tag.RegexpTagger(word_patterns))

templates = brill.fntbl37()
 
trainer = BrillTaggerTrainer(raubt_tagger, templates)
braubt_tagger = trainer.train(treebank_train, max_rules=100, min_score=3)

print("evaluate the model")
print("BRAUBT: ", braubt_tagger.evaluate(treebank_test))