コード例 #1
0
class Chunker:
    """Tag a sentence with a treebank-trained unigram tagger and report scores.

    ``self._tagger`` holds a ``DefaultTagger('NN')``; constructing a
    ``Chunker`` immediately runs ``tag_words`` on the given input.
    """

    # Class-level placeholder; __init__ rebinds it to a DefaultTagger instance.
    _tagger = DefaultTagger

    def __init__(self, words, sents):
        """Create the default tagger and run ``tag_words``.

        :param words: forwarded to ``tag_words`` (not used there)
        :param sents: sequence of tokenized sentences; only ``sents[0]`` is tagged
        """
        self._tagger = DefaultTagger('NN')
        self.tag_words(words, sents)

    def tag_words(self, words, sents):
        """Train a ``UnigramTagger`` on treebank, tag ``sents[0]``, print a score.

        :param words: unused; kept so existing callers keep working
        :param sents: sequence of tokenized sentences; only the first is tagged
        """
        train_sents = treebank.tagged_sents()
        tagger = UnigramTagger(train_sents)
        tagged_sent = tagger.tag(sents[0])
        # evaluate() expects a list of tagged sentences, so the single tagged
        # sentence is wrapped in a list.
        # NOTE(review): the "gold" data here is this tagger's own output, so
        # the printed score is not a real accuracy measure -- confirm intent.
        print(tagger.evaluate([tagged_sent]))

    def get_accuracy(self, sentences=None):
        """Print the accuracy of ``self._tagger`` on tagged sentences.

        :param sentences: gold-standard tagged sentences; when omitted or
            empty, ``treebank.tagged_sents()[6000:]`` is used instead
        """
        # None replaces the former mutable ``[]`` default; an explicit empty
        # list still selects the treebank slice, as before.
        # NOTE(review): the treebank sample bundled with NLTK may contain
        # fewer than 6000 sentences, which would leave this slice empty -- confirm.
        test_sents = sentences if sentences else treebank.tagged_sents()[6000:]
        print(self._tagger.evaluate(test_sents))
コード例 #2
0
 def test_default_tagger(self):
     """Print DefaultTagger('N') accuracy on a 90/10 split of test.tsv.

     The sentences come from ``make_sentence_list`` applied to ``test.tsv``
     inside ``self.test_dir``. The first 90% is scored first, then the rest.
     """
     sentences = make_sentence_list(path.join(self.test_dir, 'test.tsv'))
     cut = int(len(sentences) * .90)
     tagger = DefaultTagger('N')
     # Same order as before: leading 90% slice, then the trailing remainder.
     for portion in (sentences[:cut], sentences[cut:]):
         print(tagger.evaluate(portion))
コード例 #3
0
def find_accuracy(train_set, test_set):
    """Return the accuracy of a most-frequent-tag baseline on ``test_set``.

    The most frequent tag among the ``(word, tag)`` pairs of ``train_set`` is
    used to build a ``DefaultTagger``, which is then evaluated on ``test_set``.

    :param train_set: iterable of sentences, each an iterable of (word, tag) pairs
    :param test_set: gold-standard tagged sentences passed to ``evaluate``
    :return: the value returned by ``DefaultTagger.evaluate(test_set)``
    """
    # Open question left by the original author: should all of this be
    # computed on the test set instead?
    tag_counts = FreqDist(tag for sent in train_set for (_word, tag) in sent)
    baseline = DefaultTagger(tag_counts.max())
    return baseline.evaluate(test_set)
コード例 #4
0
ファイル: aeb_tagging.py プロジェクト: karenlmcneil/Final
def evaluate_nltk_pos_taggers(gold_standard_filename, num_folds=10, loo=False):
    """
    Evaluates the NLTK backoff taggers on the corpus data. Uses cross-validation.

    :param gold_standard_filename: tsv file with one "word<TAB>POS" pair per line
    :param num_folds: int: number of folds for cross-validation
    :param loo: bool: whether to use Leave One Out cross-validation
        (overrides ``num_folds`` with one fold per sentence)
    :return: dict mapping each tagger name to its mean score across folds
    :raises ValueError: if the corpus is empty or ``num_folds`` is not between
        1 and the number of sentences
    """
    tagged_sents = make_sentence_list(gold_standard_filename)
    if not tagged_sents:
        raise ValueError(
            "no tagged sentences found in {!r}".format(gold_standard_filename))
    backoff = DefaultTagger('N')
    tagger_classes = [UnigramTagger, BigramTagger, TrigramTagger]
    scores = {
        'DefaultTagger': [],
        'UnigramTagger': [],
        'BigramTagger': [],
        'TrigramTagger': [],
        'BrillTagger': [],
    }

    # k-fold cross-validation
    if loo:
        # Leave One Out: one fold per sentence, so every sentence is held out
        # exactly once (len - 1 folds would never test the last sentence).
        num_folds = len(tagged_sents)
    if num_folds < 1 or num_folds > len(tagged_sents):
        raise ValueError(
            "num_folds must be between 1 and {} (got {})".format(
                len(tagged_sents), num_folds))
    subset_size = len(tagged_sents) // num_folds
    for i in range(num_folds):

        # training and testing data for this round
        start, stop = i * subset_size, (i + 1) * subset_size
        X_test = tagged_sents[start:stop]
        X_train = tagged_sents[:start] + tagged_sents[stop:]

        # compute score for taggers
        # The default tagger has nothing to learn; it is scored on the held-out
        # fold so its number is comparable with the Brill score below.
        default_score = backoff.evaluate(X_test)
        trigram, tagger_scores = backoff_tagger(X_train, X_test,
                                                tagger_classes, backoff=backoff)
        uni_score, bi_score, tri_score = tagger_scores
        brill_tagger = train_brill_tagger(trigram, X_train)
        brill_score = brill_tagger.evaluate(X_test)
        brill_tagger.print_template_statistics(printunused=False)

        # save scores
        scores['DefaultTagger'].append(default_score)
        scores['UnigramTagger'].append(uni_score)
        scores['BigramTagger'].append(bi_score)
        scores['TrigramTagger'].append(tri_score)
        scores['BrillTagger'].append(brill_score)

    # Replace each per-fold list with its mean. Only existing keys are
    # reassigned, so the dict size does not change while iterating.
    for k, v in scores.items():  # average scores across folds
        if v:
            scores[k] = sum(v)/len(v)
            print(k, ": {:2.2%}".format(scores[k]))
    return scores
コード例 #5
0
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the ``test_set`` accuracy of several taggers built from ``train_set``.

    Reported in order: a most-frequent-tag default tagger, a regex tagger, a
    unigram tagger backed by the default tagger, and three bigram taggers
    (no backoff, regex backoff, unigram backoff). Nothing is returned.

    :param train_set: iterable of sentences, each an iterable of (word, tag) pairs
    :param test_set: gold-standard tagged sentences passed to ``evaluate``
    """
    # Baseline: tag everything with the most frequent training tag.
    tag_counts = FreqDist(tag for sent in train_set for (_word, tag) in sent)
    default_tagger = DefaultTagger(tag_counts.max())
    print("Default Tagger accuracy: ", default_tagger.evaluate(test_set))

    # Regex tagger driven by suffix / number patterns.
    suffix_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')  # nouns (default)
    ]
    regex_tagger = RegexpTagger(suffix_rules)
    print("Regex Tagger Accuracy: ", regex_tagger.evaluate(test_set))

    # Unigram tagger that falls back to the default tagger.
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_tagger.evaluate(test_set))

    # Bigram taggers: train all three first, then score, then report.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)

    report = [
        ("Bigram Tagger Accuracy: ",
         plain_bigram.evaluate(test_set)),
        ("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
         bigram_over_regex.evaluate(test_set)),
        ("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
         bigram_over_unigram.evaluate(test_set)),
    ]
    for label, score in report:
        print(label, score)
コード例 #6
0
ファイル: tutPosTagging01.py プロジェクト: bindaasamit/pycode
######### DEFAULT TAGGER ###############

#Assigning the default Tag
from nltk.tag import DefaultTagger, untag
tagger = DefaultTagger('NN')
tokens = [['Hello', 'World'], ['How', 'are', 'you', '?']]
# tag() takes ONE tokenized sentence. Passing the list of sentences would tag
# each whole sentence as if it were a single token.
print(tagger.tag(tokens[0]))

# tag_sents() is the batch form and takes the list of sentences.
print(tagger.tag_sents(tokens))

# Untagging: untag() strips the tags from one tagged sentence.
tagged = tagger.tag(tokens[0])
print(untag(tagged))

# Evaluating the tagger accuracy against gold-standard treebank sentences.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
コード例 #7
0

# building your own tagger

# preparing the data
from nltk.corpus import treebank
# Split the tagged treebank sentences: first 3500 for training, rest for testing.
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
# Single-argument print(...) calls run unchanged on Python 2 and Python 3.
print(train_data[0])

# default tagger: assigns 'NN' to every token
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

# accuracy of the default tagger on the held-out sentences
print(dt.evaluate(test_data))

# NOTE(review): `tokens` is not defined in the visible part of this script;
# presumably a tokenized sample sentence created earlier -- confirm.
print(dt.tag(tokens))


# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # simple past
        (r'.*es$', 'VBZ'),                # 3rd singular present
        (r'.*ould$', 'MD'),               # modals
        (r'.*\'s$', 'NN$'),               # possessive nouns
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
コード例 #8
0
ファイル: myPos.py プロジェクト: gangulyarin/NLP-Snippets
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

# Split the tagged treebank sentences: first 3500 for training, rest for testing.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

# Baseline: tag every token as 'NN' and report accuracy on the test split.
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

# Classifier-based tagger trained with a Naive Bayes classifier.
nt = ClassifierBasedPOSTagger(
    train=train_data,
    classifier_builder=NaiveBayesClassifier.train,
)
print(nt.evaluate(test_data))
コード例 #9
0
# Use the test data to evaluate taggers and see how they perform on the sample sentence.

# Build a custom tagger by extending class TaggerI, from the nltk.tag package and implementing the tag function.
# Use the evaluate function to assess the performance of the tagger.

# --- Tagger ---
# INPUT: Sentence tokens
# OUTPUT: List of pairs where each item corresponds to a token of the input with its POS tag

# 1. BACKOFF TAGGER (a tagger that is consulted by another when not able to tag a token):
#    Assigns the same tag to all tokens (tag specified as argument, NN in this case)
dt = DefaultTagger("NN")

# Score dt against the tagged test data (the gold standard): its tags are
# compared with the ones dt assigns. Then show dt's output for `tokens`.
default_accuracy = dt.evaluate(gold=test_data)
print(default_accuracy)
default_tagged = dt.tag(tokens=tokens)
print(default_tagged)

# 2. REGEX TAGGER:
#    Assigns tags to tokens by comparing their word strings to a series of regular expressions

# Define the regex patterns that determine the tags of tokens. Note that when tagging a token, patterns
# are tried in the order listed and the first match wins, so the last (catch-all) pattern acts as the default tag
patterns = [
    (r".*ing$", "VBG"),  # Gerunds
    (r".*ed$", "VBD"),  # Simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # Modals
    (r".*'s$", "NN$"),  # Possesive pronouns
    (r".*s$", "NNS"),  # Plural nouns
    (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # Cardinal numbers
コード例 #10
0
from nltk.corpus import treebank, wordnet
from nltk.probability import FreqDist
from nltk.tag.sequential import ClassifierBasedPOSTagger
from nltk.tag import brill, brill_trainer, tnt, SequentialBackoffTagger
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger
from samples import sample

# Test and training variables
# Treebank split: first 3000 tagged sentences train, the remainder test.
tagged_corpus = treebank.tagged_sents()
train_sents, test_sents = tagged_corpus[:3000], tagged_corpus[3000:]
# NOTE(review): word_tokenize is not among the imports visible above --
# confirm it is brought into scope elsewhere.
tk_sample = word_tokenize(sample)

# Default tagger - every token is tagged as a noun
df_tagger = DefaultTagger('NN')
tagged, accuracy = df_tagger.tag(tk_sample), df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger trained on the training split
ug_tagger = UnigramTagger(train_sents)
tagged, accuracy = ug_tagger.tag(tk_sample), ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: the unigram tagger defers to df_tagger when it has no tag
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
コード例 #11
0
# Print the tags `tagger` assigns to a three-word sample sentence.
# NOTE(review): the author marked this output "WRONG" without saying why;
# `tagger` is created outside the visible code -- confirm what was expected.
print(tagger.tag(['we', 'are','going']))# WRONG


#SequentialBackoffTagger implements the tag() method, which calls the
#choose_tag() method of the subclass for each index in the tokens list while accumulating
#a history of the previously tagged tokens


"""DefaultTagger is a subclass of SequentialBackoffTagger. Every subclass of
SequentialBackoffTagger must implement the choose_tag() method, which
takes three arguments:
    * The list of tokens
    * The index of the current token whose tag we want to choose
    * The history, which is a list of the previous tags
    SequentialBackoffTagger implements the tag() method, which calls the
choose_tag() method of the subclass for each index in the tokens list while accumulating
a history of the previously tagged tokens. This history is the reason for the Sequential in
SequentialBackoffTagger. We'll get to the backoff portion of the name in the Combining
taggers with backoff tagging recipe. """


#accuracy text
"""So, by just choosing NN for every tag, we can achieve 14 % accuracy testing on one-fourth
of the treebank corpus."""
from nltk.corpus import treebank
# Gold-standard treebank sentences from index 3000 onward form the test set.
test_sents = treebank.tagged_sents()[3000:]
accuracy = tagger.evaluate(test_sents)
print(accuracy)



コード例 #12
0
ファイル: default_tagger.py プロジェクト: anderscui/nlpy
# every tagger has a tag() method.
# DefaultTagger is a subclass of SequentialBackoffTagger which has a choose_tag() method.
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

tagger = DefaultTagger('NN')
greeting = ['Hello', 'World']
print(tagger.tag(greeting))

# Though it is very simple, the tagger can still be evaluated on gold data.
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# tag_sents() tags a batch of tokenized sentences
sentences = [['Hello', 'World', '.'], ['How', 'are', 'you', '?']]
print(tagger.tag_sents(sentences))

# untag() strips the tags from a tagged sentence
from nltk.tag import untag

tagged_greeting = [('Hello', 'NN'), ('World', 'NN')]
print(untag(tagged_greeting))
コード例 #13
0
"""
""" 1. create a tagger utilising: 
       n-gram, unigram, regexp and default taggers """
tag2_eval = {}
# train with backoff: default -> regexp -> unigram -> bigram -> trigram
tic()
tag2_input = create_regexp_list('Open_Word_Patterns.xlsx', RESOURCES_DIR)
tag2_tagger = RegexpTagger(tag2_input, backoff=DefaultTagger('NO'))
tag2_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag2_tagger)
for ngram_tagger_cls in (BigramTagger, TrigramTagger):
    # each n-gram tagger backs off to the chain built so far
    tag2_tagger = ngram_tagger_cls(train_sents, backoff=tag2_tagger)
tag2_eval['train_time'] = toc()
# test: accuracy on val_sents, timed with tic()/toc()
tic()
tag2_eval['test_accuracy'] = tag2_tagger.evaluate(val_sents)
tag2_eval['test_time'] = toc()
# display results
display_training_metrics(tag2_eval)
""" 2. create a tagger utilising: 
       n-gram, unigram, affix and default taggers """
tag1_eval = {}
# train with backoff: default -> affix taggers (-1 .. -5) -> unigram
tic()
tag1_tagger = DefaultTagger('NO')
for affix_size in range(1, 6):
    # negative affix_length values, applied in the order -1, -2, ..., -5
    tag1_tagger = AffixTagger(train_sents, affix_length=-affix_size,
                              backoff=tag1_tagger)
tag1_tagger = UnigramTagger(train_sents, cutoff=3, backoff=tag1_tagger)
コード例 #14
0
import nltk
from nltk.corpus import treebank  #import treebank corpus
from nltk.tag import DefaultTagger  #import DefaultTagger

tagger = DefaultTagger('NN')  # default tagger that assigns NN to every token
treebank_tagged_sents = treebank.tagged_sents()  # gold-standard tagged sentences
# evaluate() strips the gold tags and re-tags the sentences itself, so no
# separate tag() call is needed. Note that tag() expects one sentence of plain
# tokens, not a list of already-tagged sentences.
print('Accuracy %4.1f%%' %
      (100.0 * tagger.evaluate(treebank_tagged_sents)))  # accuracy as a percentage
コード例 #15
0
# Build a two-column ('Word', 'POS tag') DataFrame from nltk_pos_tagged and
# transpose it. NOTE(review): `pd` is presumably pandas and nltk_pos_tagged a
# list of (word, tag) pairs created earlier -- confirm.
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T

#%%
from nltk.corpus import treebank
# Split the tagged treebank sentences: first 3500 train, the rest test.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]
print(train_data[0])

#%%
# default tagger: assigns 'NN' to every token
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')

# accuracy on test data
dt.evaluate(test_data)

# tagging our sample headline
headline_tokens = nltk.word_tokenize(sentence)
dt.tag(headline_tokens)

#%%
# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NNS'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
コード例 #16
0
ファイル: indivTaggers.py プロジェクト: Batene/Bamanankan
def indivDefault(bambara):
    """Create a DefaultTagger('n'), print its accuracy, and return it.

    :param bambara: object exposing ``test_sents``, the tagged sentences
        passed to ``evaluate``
    :return: the ``DefaultTagger`` instance that was evaluated
    """
    tagger = DefaultTagger('n')
    accuracy = tagger.evaluate(bambara.test_sents)
    print(accuracy)
    return tagger