Code example #1
File: q2.py Project: kyajmiller/LING-539
from nltk.tag import DefaultTagger

def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be 'NN', so we could really just
    # skip this altogether
    # I went with NN as it was the default value shown in the nltk DefaultTagger documentation, completely arbitrary
    defaultTagger = DefaultTagger("NN")
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == "NN":
                matches += 1

    accuracy = (matches / totalTags) * 100
    return accuracy
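For a quick cross-check, NLTK's built-in evaluation gives the same number, since DefaultTagger('NN') tags every token 'NN'. A minimal sketch, assuming NLTK and its treebank corpus are installed (the treebank split here is only an example test set, not necessarily the one used in the project):

from nltk.corpus import treebank
from nltk.tag import DefaultTagger

testingSet = treebank.tagged_sents()[3000:]  # example test set

# evaluate() returns a fraction, so scale by 100 to match getDefaultTaggerAccuracy
print(DefaultTagger('NN').evaluate(testingSet) * 100)
print(getDefaultTaggerAccuracy(testingSet))  # should print the same value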
Code example #2
File: crf.py Project: Batene/Bamanankan
    def tag_sents(self, sents):
        '''
        Tag a list of sentences. NB: before using this function, the user should specify the model_file, either by
            - training a new model with the ``train`` function, or
            - using a pre-trained model set via the ``set_model_file`` function.

        :param sents: list of sentences to tag
        :type sents: list(list(str))
        :return: list of tagged sentences
        :rtype: list(list(tuple(str, str)))
        '''
        if self._model_file == '':
            raise Exception('No model file found. Please use the train or set_model_file function first.')
        
        # We need a concrete list of sentences (not a generator) so that input and output can be matched up

################ added by Kathrin #########################################
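        # Pre-tag every token with a placeholder 'None' tag so that the feature
        # function below receives (token, tag) tuples instead of bare strings
        # (requires `from nltk.tag import DefaultTagger` at module level).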
        default = DefaultTagger('None')
        sents = default.tag_sents(sents)
###########################################################################
        result = []  
        for tokens in sents:
            features = [self._feature_func(tokens,i) for i in range(len(tokens))]
            labels = self._tagger.tag(features)
                
            if len(labels) != len(tokens):
                raise Exception('Predicted length does not match the number of tokens; expect errors!')
############### added by Kathrin ############################################
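            # Strip the placeholder tags again so that only the raw tokens are
            # paired with the predicted labels below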
            tokens = [i[0] for i in tokens]
#############################################################################
            tagged_sent = list(zip(tokens,labels))
            result.append(tagged_sent)
            
        return result 
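To make the added pre-tagging step concrete, here is a minimal standalone sketch of what DefaultTagger('None') produces (assuming only NLTK is installed):

from nltk.tag import DefaultTagger

sents = [['Hello', 'World'], ['How', 'are', 'you', '?']]
padded = DefaultTagger('None').tag_sents(sents)
print(padded)  # [[('Hello', 'None'), ('World', 'None')], [('How', 'None'), ...]]

# dropping the placeholder tag recovers the original tokens, as in the loop above
print([[word for word, _ in sent] for sent in padded])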
Code example #3
File: q2.py Project: kyajpauley/ling-539
from nltk.tag import DefaultTagger

def getDefaultTaggerAccuracy(testingSet):
    # gets the accuracy of the DefaultTagger

    # get untagged sentences and gold POS tags
    untaggedSentences = [[taggedWord[0] for taggedWord in sentence]
                         for sentence in testingSet]
    goldPOSTags = [[taggedWord[1] for taggedWord in sentence]
                   for sentence in testingSet]

    # declare tagger; honestly this is unnecessary, as every tag is going to be 'NN', so we could really just
    # skip this altogether
    # I went with NN as it was the default value shown in the nltk DefaultTagger documentation, completely arbitrary
    defaultTagger = DefaultTagger('NN')
    defaultTaggedSentences = defaultTagger.tag_sents(untaggedSentences)

    # calculate accuracy
    totalTags = 0
    matches = 0
    # iterate through sentences
    for sentencePOSTags in goldPOSTags:
        # iterate through tags
        for individualPOSTag in sentencePOSTags:
            totalTags += 1
            # if the gold tag is NN, then match
            if individualPOSTag == 'NN':
                matches += 1

    accuracy = (matches / totalTags) * 100
    return accuracy
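The counting loop can also be written more compactly; a hypothetical equivalent with the same logic (the accuracy is simply the share of gold tags that are 'NN'):

def getDefaultTaggerAccuracyCompact(testingSet):
    # hypothetical helper, equivalent to the loop-based version above
    goldPOSTags = [tag for sentence in testingSet for word, tag in sentence]
    return 100 * sum(tag == 'NN' for tag in goldPOSTags) / len(goldPOSTags)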
Code example #4
File: tutPosTagging01.py Project: bindaasamit/pycode
######### DEFAULT TAGGER ###############

# Assigning the default tag
from nltk.tag import DefaultTagger, untag
tagger = DefaultTagger('NN')

# tag() expects a single sentence (a flat list of tokens)
print(tagger.tag(['Hello', 'World']))

# tag_sents() expects a list of sentences
sentences = [['Hello', 'World'], ['How', 'are', 'you', '?']]
print(tagger.tag_sents(sentences))

# Untagging
tagged = tagger.tag(['Hello', 'World'])
print(untag(tagged))

# Evaluating the tagger accuracy
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
Code example #5
File: default_tagger.py Project: neuroph12/nlpy
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
print(tagger.tag(['Hello', 'World']))

# though it's very simple, we can still try to evaluate it
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# for sentences
print(tagger.tag_sents([['Hello', 'World', '.'], ['How', 'are', 'you', '?']]))

# untagging
from nltk.tag import untag

print(untag([('Hello', 'NN'), ('World', 'NN')]))
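As the comment above notes, DefaultTagger inherits from SequentialBackoffTagger, whose choose_tag() hook picks a tag for one position; a small sketch of calling it directly (signature as in NLTK's sequential tagger API):

from nltk.tag import DefaultTagger

tagger = DefaultTagger('NN')
# choose_tag(tokens, index, history) ignores the context here and always returns
# the default tag; backoff chains fall through to it when no other tagger decides
print(tagger.choose_tag(['Hello', 'World'], 0, []))  # 'NN'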
Code example #6
# POS Taggers
# module load python/3.5
# brown and treebank corpora

from nltk.corpus import treebank
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

# my first tagger
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
print(tagger.tag_sents([['Hello', '.'], ['My', 'name', 'is', 'Steve']]))
print(tagger.evaluate(test_sents))

# unigrams
from nltk.tag import UnigramTagger
unigram_tagger = UnigramTagger(train_sents)
tagger = UnigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# bigrams
from nltk.tag import BigramTagger
bigram_tagger = BigramTagger(train_sents)
tagger = BigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))

# trigrams
from nltk.tag import TrigramTagger
trigram_tagger = TrigramTagger(train_sents)
tagger = TrigramTagger(train_sents, cutoff=3)
print(tagger.evaluate(test_sents))
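With cutoff=3, n-gram contexts whose most frequent tag was seen too rarely in training are dropped, so many tokens end up untagged (None) and the scores above suffer. A common remedy, sketched here under the same train/test split, is to chain the taggers and let DefaultTagger('NN') catch whatever the n-gram taggers cannot handle:

from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

# each tagger falls back to the next one when it has no answer for a context
backoff = DefaultTagger('NN')
unigram = UnigramTagger(train_sents, backoff=backoff)
bigram = BigramTagger(train_sents, backoff=unigram)
trigram = TrigramTagger(train_sents, backoff=bigram)
print(trigram.evaluate(test_sents))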