Esempio n. 1
0
def create_tagger(train_sents):
    """
    Build a CombinedTagger, restore its "tresoldi" state and report on it.

    Two things are printed: the tagging of a fixed sample sentence, and
    the accuracy obtained when the tagger is scored against
    ``train_sents``.  The function does not return the tagger.

    ``train_sents`` is the tagged reference data; it is wrapped in a
    one-element list before being passed to ``tag.accuracy``.
    """
    tagger = CombinedTagger()
    # Training from train_sents is currently disabled
    # (tagger.example_train(train_sents, True)); the tagger state is
    # restored through unmarshal() instead.
    tagger.unmarshal("tresoldi")

    # Smoke test on a fixed sentence.
    sample = "Mauro viu o livro sobre a mesa".split()
    tagged_sample = list(tagger.tag(sample))
    print(tagged_sample)

    # Score the tagger against the supplied data and show it as a percentage.
    score = tag.accuracy(tagger, [train_sents])
    percent = 100 * score
    print('Accuracy = %4.2f%%' % percent)
Esempio n. 2
0
def create_tagger(train_sents):
    """
    Build a CombinedTagger, restore its "tresoldi" state and report on it.

    Prints the tagging of a fixed sample sentence, followed by the
    accuracy of the tagger scored against ``train_sents``.  Nothing is
    returned.

    ``train_sents`` is the tagged reference data; it is wrapped in a
    one-element list before being passed to ``tag.accuracy``.
    """
    ct = CombinedTagger()
    # Training from train_sents is currently disabled
    # (ct.example_train(train_sents, True)); the tagger state is restored
    # through unmarshal() instead.
    ct.unmarshal("tresoldi")

    tokens = "Mauro viu o livro sobre a mesa".split()
    # Single-argument, parenthesised print produces the same output under
    # Python 2 and is also valid syntax under Python 3.
    print(list(ct.tag(tokens)))

    # tests
    acc = tag.accuracy(ct, [train_sents])
    print('Accuracy = %4.2f%%' % (100 * acc))
Esempio n. 3
0
def _demo_tagger(tagger, gold):
    """
    Print the accuracy of ``tagger`` on ``gold`` as a percentage.

    Both arguments are handed unchanged to ``accuracy`` from
    ``en.parser.nltk_lite.tag``; the score is multiplied by 100 and shown
    with one decimal place.
    """
    from en.parser.nltk_lite.tag import accuracy

    score = accuracy(tagger, gold)
    percent = 100.0 * score
    print('Accuracy = %4.1f%%' % percent)
Esempio n. 4
0
def demo(num_sents=100,
         max_rules=200,
         min_score=2,
         error_output="errors.out",
         rule_output="rules.out",
         randomize=False,
         train=.8,
         trace=3):
    """
    Brill Tagger Demonstration

    Trains a unigram tagger (backing off to a regexp tagger) on part of
    the treebank data, trains a Brill tagger on top of it, prints the
    accuracy of both, and writes the learned rules and the tagging errors
    to C{rule_output} and C{error_output}.  Nothing is returned.

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    import sys

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import tag
    from en.parser.nltk_lite.tag import brill

    # NOTE: `random` and `errorList` are not imported here; they are
    # expected to be available at module level.

    # Tagger of last resort: numbers are tagged CD, everything else NN.
    # NOTE(review): the '.' in the first pattern is unescaped, so it matches
    # any single character, not only a decimal point -- confirm intended.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        # Seeding with the corpus size makes the shuffle repeatable for a
        # given corpus.
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the first num_sents sentences into a single token list and
    # split it at the requested training fraction.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    # First element of every held-out token, i.e. the input to be tagged.
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger

    # Written without a newline so that the accuracy printed below ends up
    # on the same line (this replaces the Python 2 trailing-comma print).
    sys.stdout.write("Training unigram tagger: ")
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger

    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                      (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                      (1, 1)),
    ]

    # Alternative trainer, currently not used:
    #     brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    # Blank separator line (a bare `print` would be a no-op on Python 3).
    print("")
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # The context manager closes the rules file even if writing fails; the
    # handle was previously never closed.
    with open(rule_output, 'w') as rules_file:
        for rule in b.rules():
            rule_text = str(rule)
            print(rule_text)
            rules_file.write(rule_text + "\n\n")

    # Tag the held-out words and record where the result differs from gold.
    tagged_test = list(b.tag(testing_data))
    el = errorList(gold_data, tagged_test)

    with open(error_output, 'w') as error_file:
        for e in el:
            error_file.write(e + "\n\n")

    print("Done; rules and errors saved to %s and %s." % (rule_output,
                                                          error_output))
Esempio n. 5
0
def demo(num_sents=100, max_rules=200, min_score=2, error_output = "errors.out",
         rule_output="rules.out", randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration

    Trains a unigram tagger (backing off to a regexp tagger) on part of
    the treebank data, trains a Brill tagger on top of it, prints the
    accuracy of both, and writes the learned rules and the tagging errors
    to C{rule_output} and C{error_output}.  Nothing is returned.

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    import sys

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import tag
    from en.parser.nltk_lite.tag import brill

    # NOTE: `random` and `errorList` are not imported here; they are
    # expected to be available at module level.

    # Tagger of last resort: numbers are tagged CD, everything else NN.
    # NOTE(review): the '.' in the first pattern is unescaped, so it matches
    # any single character, not only a decimal point -- confirm intended.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        # Seeding with the corpus size makes the shuffle repeatable for a
        # given corpus.
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the first num_sents sentences into a single token list and
    # split it at the requested training fraction.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data)*train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    # First element of every held-out token, i.e. the input to be tagged.
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger

    # Written without a newline so that the accuracy printed below ends up
    # on the same line (this replaces the Python 2 trailing-comma print).
    sys.stdout.write("Training unigram tagger: ")
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger

    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)),
        ]

    # Alternative trainer, currently not used:
    #     brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    # Blank separator line (a bare `print` would be a no-op on Python 3).
    print("")
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # The context manager closes the rules file even if writing fails; the
    # handle was previously never closed.
    with open(rule_output, 'w') as rules_file:
        for rule in b.rules():
            rule_text = str(rule)
            print(rule_text)
            rules_file.write(rule_text+"\n\n")

    # Tag the held-out words and record where the result differs from gold.
    tagged_test = list(b.tag(testing_data))
    el = errorList(gold_data, tagged_test)

    with open(error_output, 'w') as error_file:
        for e in el:
            error_file.write(e+"\n\n")

    print("Done; rules and errors saved to %s and %s." % (rule_output, error_output))
Esempio n. 6
0
def _demo_tagger(tagger, gold):
    """
    Print the accuracy of ``tagger`` on ``gold`` as a percentage.

    Both arguments are handed unchanged to ``accuracy`` from
    ``en.parser.nltk_lite.tag``; the score is multiplied by 100 and shown
    with one decimal place.
    """
    from en.parser.nltk_lite.tag import accuracy
    acc = accuracy(tagger, gold)
    # Single-argument, parenthesised print produces the same output under
    # Python 2 and is also valid syntax under Python 3.
    print('Accuracy = %4.1f%%' % (100.0 * acc))