def TrainSenseTagger(Pcfg,CFDist):
    """Assemble a backoff tagger whose unigram stage is seeded from CFDist.

    Pcfg is accepted but not used by this function.  CFDist is passed
    through invertConditionalFreqDist and the result is installed directly
    as the unigram tagger's frequency table; no train() pass is run.

    Returns a BackoffTagger built over [unigram tagger, default tagger],
    in that order, where the default tagger is constructed with 'APPEAR'.
    """
    logger.info("Training unigram tagger:")

    # Every tagger in the chain is constructed with the same TAG/TEXT keywords.
    props = dict(TAG='SENSE', TEXT='STEM')

    unigram = UnigramTagger(**props)
    # The frequency table is assigned rather than learned from tagged data.
    unigram._freqdist = invertConditionalFreqDist(CFDist)

    fallback = DefaultTagger('APPEAR', **props)
    return BackoffTagger([unigram, fallback], **props)
# Example no. 2
# 0
def TrainSenseTagger(Pcfg, CFDist):
    """Return a BackoffTagger: a CFDist-seeded unigram tagger, then a default.

    Pcfg is part of the signature but is never read here.  The unigram
    tagger is not trained; its ``_freqdist`` attribute is set to
    ``invertConditionalFreqDist(CFDist)``.  The second tagger in the chain
    is a DefaultTagger constructed with "APPEAR".  All three taggers are
    created with TAG="SENSE" and TEXT="STEM".
    """
    logger.info("Training unigram tagger:")

    stem_unigram = UnigramTagger(TAG="SENSE", TEXT="STEM")
    # Install the lookup table directly instead of calling train().
    inverted = invertConditionalFreqDist(CFDist)
    stem_unigram._freqdist = inverted

    appear_default = DefaultTagger("APPEAR", TAG="SENSE", TEXT="STEM")

    chain = [stem_unigram, appear_default]
    return BackoffTagger(chain, TAG="SENSE", TEXT="STEM")
# Example no. 3
# 0
def test(
    numFiles=100,
    max_rules=200,
    min_score=2,
    ruleFile="dump.rules",
    errorOutput="errors.out",
    ruleOutput="rules.out",
    randomize=False,
    train=0.8,
    trace=3,
):
    """Train a unigram backoff tagger and a Brill tagger, and report on both.

    The tagged data returned by getWSJTokens is split into a training part
    and a held-out part.  A UnigramTagger backed by a regexp tagger is
    trained and scored, then used as the initial tagger for a
    BrillTaggerTrainer.  The learned rules are printed and written to
    ``ruleOutput``; the entries returned by errorList for the held-out data
    are written to ``errorOutput``.

    Parameters:
        numFiles:    passed to getWSJTokens.
        max_rules:   passed to the Brill trainer's train().
        min_score:   passed to the Brill trainer's train().
        ruleFile:    currently unused (the saveRules call is disabled).
        errorOutput: path of the file that receives the error entries.
        ruleOutput:  path of the file that receives the learned rules.
        randomize:   passed to getWSJTokens.
        train:       proportion of the data used for training; the rest is
                     reserved for testing.
        trace:       passed to BrillTaggerTrainer.

    Returns the tagger produced by the Brill trainer.
    """
    import sys  # local import: only used for the unterminated progress line

    # Note: the '.' in the first pattern is unescaped, so any single
    # character may sit between the two digit groups.
    NN_CD_tagger = RegexpTagger([(r"^[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")], TAG="POS")

    print("Loading tagged data...")
    taggedData = getWSJTokens(numFiles, randomize)

    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    trainCutoff = int(len(taggedData) * train)
    trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff])
    goldData = Token(SUBTOKENS=taggedData[trainCutoff:])
    testingData = goldData.exclude("POS")

    # Unigram tagger

    # Written without a newline so the accuracy lands on the same line.
    sys.stdout.write("Training unigram tagger: ")
    u = UnigramTagger(TAG="POS")
    u.train(trainingData)
    backoff = BackoffTagger([u, NN_CD_tagger], TAG="POS")
    print("[accuracy: %f]" % tagger_accuracy(backoff, [goldData]))

    # Brill tagger

    templates = [
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
        ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
        #        ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
    ]

    # trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS')
    trainer = BrillTaggerTrainer(backoff, templates, trace, TAG="POS")
    b = trainer.train(trainingData, max_rules, min_score)

    print("")
    print("Brill accuracy: %f" % tagger_accuracy(b, [goldData]))

    print("\nRules: ")
    # The context manager guarantees the rules file is flushed and closed,
    # even if printing or formatting a rule raises.
    with open(ruleOutput, "w") as printRules:
        for rule in b.rules():
            ruleText = str(rule)
            print(ruleText)
            printRules.write(ruleText + "\n\n")
    # b.saveRules(ruleFile)

    b.tag(testingData)
    el = errorList(goldData, testingData)

    with open(errorOutput, "w") as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")

    print("Done.")
    return b
def test(numFiles=100,
         max_rules=200,
         min_score=2,
         ruleFile="dump.rules",
         errorOutput="errors.out",
         ruleOutput="rules.out",
         randomize=False,
         train=.8,
         trace=3):
    """Run the unigram and Brill tagger demo and write its rule/error files.

    Tagged data is loaded with getWSJTokens and split at ``train`` (the
    proportion used for training; the remainder is held out).  A
    UnigramTagger with a regexp fallback is trained and its accuracy
    printed.  That backoff tagger then seeds a BrillTaggerTrainer, whose
    resulting tagger is scored, has its rules printed and written to
    ``ruleOutput``, and is run over the held-out data.  The entries that
    errorList returns for that run are written to ``errorOutput``.

    Parameters:
        numFiles, randomize: forwarded to getWSJTokens.
        max_rules, min_score: forwarded to the trainer's train() call.
        ruleFile: not used at present (saveRules is commented out).
        errorOutput: destination path for the error entries.
        ruleOutput: destination path for the learned rules.
        train: fraction of the loaded data used for training.
        trace: forwarded to BrillTaggerTrainer.

    Returns the tagger returned by the Brill trainer.
    """
    import sys  # local import: needed only to print a line without a newline

    # Note: the '.' between the digit groups is unescaped, so it matches any
    # single character, not only a literal dot.
    NN_CD_tagger = RegexpTagger([(r'^[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')],
                                TAG='POS')

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print("Loading tagged data...")
    taggedData = getWSJTokens(numFiles, randomize)

    trainCutoff = int(len(taggedData) * train)
    trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff])
    goldData = Token(SUBTOKENS=taggedData[trainCutoff:])
    testingData = goldData.exclude('POS')

    # Unigram tagger

    # No newline here: the accuracy figure is appended to this line below.
    sys.stdout.write("Training unigram tagger: ")
    u = UnigramTagger(TAG='POS')
    u.train(trainingData)
    backoff = BackoffTagger([u, NN_CD_tagger], TAG='POS')
    print("[accuracy: %f]" % tagger_accuracy(backoff, [goldData]))

    # Brill tagger

    templates = [
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)),
        SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)),
        #        SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)),
        ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
        #        ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)),
    ]

    #trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS')
    trainer = BrillTaggerTrainer(backoff, templates, trace, TAG='POS')
    b = trainer.train(trainingData, max_rules, min_score)

    print("")
    print("Brill accuracy: %f" % tagger_accuracy(b, [goldData]))

    print("\nRules: ")
    # Opened via a context manager so the handle is always closed; the
    # original left this file open for the rest of the process.
    with open(ruleOutput, 'w') as printRules:
        for rule in b.rules():
            ruleText = str(rule)
            print(ruleText)
            printRules.write(ruleText + "\n\n")
    #b.saveRules(ruleFile)

    b.tag(testingData)
    el = errorList(goldData, testingData)

    # Closed on exit from the block even if one of the writes fails.
    with open(errorOutput, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")

    print("Done.")
    return b