Example #1
0
def find_combined_taggers_accuracy(train_set, test_set):
    """Print the accuracy of several taggers, alone and chained via backoff.

    Builds, in order: a default tagger (most frequent training tag), a
    regular-expression tagger, a unigram tagger that backs off to the default
    tagger, and three bigram taggers (no backoff, unigram backoff, regex
    backoff). Every tagger is scored on ``test_set`` through its ``evaluate``
    method and the score is printed.

    Args:
        train_set: Tagged sentences, each an iterable of ``(word, tag)``
            pairs. It is traversed several times (tag counting plus one pass
            per trained tagger), so it must be re-iterable.
        test_set: Data handed unchanged to each tagger's ``evaluate``.

    Returns:
        None. The scores are only written to standard output.
    """
    # Baseline: tag every token with the most common tag of the training data.
    tag_counts = FreqDist(tag for sent in train_set for (_word, tag) in sent)
    default_tagger = DefaultTagger(tag_counts.max())
    default_score = default_tagger.evaluate(test_set)
    print("Default Tagger accuracy: ", default_score)

    # Rule-based tagger driven by word endings / token shape.
    suffix_rules = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),  # nouns (default)
    ]
    regex_tagger = RegexpTagger(suffix_rules)
    regex_score = regex_tagger.evaluate(test_set)
    print("Regex Tagger Accuracy: ", regex_score)

    # Unigram tagger that falls back to the default tagger.
    unigram_tagger = UnigramTagger(train_set, backoff=default_tagger)
    unigram_score = unigram_tagger.evaluate(test_set)
    print("Unigram Tagger accuracy (Backoff = Default Tagger): ",
          unigram_score)

    # Bigram taggers: one standalone, one per backoff strategy.
    plain_bigram = BigramTagger(train_set)
    bigram_over_unigram = BigramTagger(train_set, backoff=unigram_tagger)
    bigram_over_regex = BigramTagger(train_set, backoff=regex_tagger)

    bigram_reports = [
        ("Bigram Tagger Accuracy: ", plain_bigram),
        ("Bigram Tagger Accuracy (Backoff = Regex Tagger): ",
         bigram_over_regex),
        ("Bigram Tagger Accuracy (Backoff = Unigram Tagger): ",
         bigram_over_unigram),
    ]
    # All three scores are computed before any of them is printed.
    bigram_scores = [tagger.evaluate(test_set)
                     for _label, tagger in bigram_reports]
    for (label, _tagger), score in zip(bigram_reports, bigram_scores):
        print(label, score)
Example #2
0
# regex tagger
from nltk.tag import RegexpTagger
# define regex tag patterns
# (regex, tag) pairs intended for RegexpTagger. The specific endings are listed
# before the generic plural 's' rule, and the catch-all noun rule comes last.
patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        # The dot is escaped so only a literal decimal point is accepted;
        # unescaped, '.' let tokens such as '3x14' be tagged as numbers.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
]
# Build the regular-expression tagger from the pattern list.
rt = RegexpTagger(patterns)

# print(...) with a single argument produces the same output on Python 2 and
# is valid syntax on Python 3, unlike the bare print statement.
print(rt.evaluate(test_data))
print(rt.tag(tokens))


## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Train one tagger per n-gram order (1, 2 and 3) on train_data.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Only the unigram tagger is scored and applied here. print(...) with a single
# argument works on both Python 2 and Python 3.
print(ut.evaluate(test_data))
print(ut.tag(tokens))
Example #3
0
# Regex patterns that determine the tags of tokens, as (regex, tag) pairs for RegexpTagger.
# NLTK's RegexpTagger tries the patterns in the order listed and uses the first one that matches,
# so the specific endings come first and the final catch-all pattern supplies the default tag.
patterns = [
    (r".*ing$", "VBG"),  # Gerunds
    (r".*ed$", "VBD"),  # Simple past
    (r".*es$", "VBZ"),  # 3rd singular present
    (r".*ould$", "MD"),  # Modals
    (r".*'s$", "NN$"),  # Possessive nouns
    (r".*s$", "NNS"),  # Plural nouns
    # The dot is escaped so that only a literal decimal point is accepted; unescaped, "." matched
    # any character and tokens such as "3x14" were tagged as numbers.
    (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),  # Cardinal numbers
    (r".*", "NN"),  # Nouns (default)
]

# Build the regular-expression tagger from the pattern list.
rt = RegexpTagger(patterns)

# Report its score on the test data and show the tags it assigns to the sample tokens.
print(rt.evaluate(test_data))
print(rt.tag(tokens))

# 3. N-GRAM TAGGERS:
#    An n-gram is a contiguous sequence of n items taken from text or speech. The items may be
#    words, phonemes, letters, characters or syllables; n-grams whose items are words are also
#    called shingles.
#    Class hierarchy: UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Fit one tagger per n-gram order on the training data (pre-tagged tokens, i.e. labeled
# observations).
ut = UnigramTagger(train_data)
bt = BigramTagger(train=train_data)
tt = TrigramTagger(train=train_data)

# Score the unigram and bigram taggers against the test data.
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
    a las queridas expresiones regulares con un RegexpTagger
'''

from nltk.tag import RegexpTagger
# Rule-based tagger built from (regex, tag) pairs; the final '.*' rule is the
# catch-all that tags everything else as a noun.
regexp_tagger = RegexpTagger(
         [
          # The dot is escaped so only a literal decimal point is accepted;
          # unescaped, '.' let tokens such as '3x14' be tagged as numbers.
          (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
          (r'(The|the|A|a|An|an)$', 'AT'),   # articles
          (r'.*able$', 'JJ'),                # adjectives
          (r'.*ness$', 'NN'),                # nouns formed from adjectives
          (r'.*ly$', 'RB'),                  # adverbs
          (r'.*s$', 'NNS'),                  # plural nouns
          (r'.*ing$', 'VBG'),                # gerunds
          (r'.*ed$', 'VBD'),                 # past tense verbs
          (r'.*', 'NN'),                     # nouns (default)
          ])
print("Regexp Tagger: {}".format(regexp_tagger.evaluate(test_data)))

''' Visto lo anterior, podemos poner al tagger regexp como backoff
    de los N-gram creados anteriormente.

    O podríamos ponerlo 1º, pero me fio más de los preentrenados
    que de unas reglas puestas a capón.
'''

regexp_tagger = RegexpTagger(
         [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
          ( r'(The|the|A|a|An|an)$', 'AT'),   # articles
          ( r'.*able$', 'JJ'),                # adjectives
          ( r'.*ness$', 'NN'),         # nouns formed from adj
          ( r'.*ly$', 'RB'),           # adverbs
          ( r'.*s$', 'NNS'),           # plural nouns
Example #5
0
(r'.*able$', 'JJ'),                # adjectives
(r'.*ness$', 'NN'),                # nouns formed from adjectives
(r'.*ly$', 'RB'),                  # adverbs
(r'.*s$', 'NNS'),                  # plural nouns
(r'.*ing$', 'VBG'),                # gerunds
(r'.*ed$', 'VBD'),                 # past tense verbs
(r'.*', 'NN')                      # nouns (default)
])
entrenar_bill(tagger,"RegexpTagger")


# In[ ]:


# Train a unigram tagger on the first 1000 items of train_reducido.
tagger = UnigramTagger(train_reducido[:1000])
# Score it on the first 1000 items of test_reducido; the returned value is
# neither stored nor printed by this statement.
tagger.evaluate(test_reducido[:1000])
# NOTE(review): entrenar_bill is defined outside this excerpt; presumably it
# trains a Brill tagger on top of `tagger`, with the string as a label - confirm.
entrenar_bill(tagger,"UnigramTagger")


# In[ ]:


# Train a bigram tagger on the first 1000 items of train_reducido, rebinding
# the module-level name `tagger`.
tagger = BigramTagger(train_reducido[:1000])
# Score it on the first 1000 items of test_reducido; the returned value is
# neither stored nor printed by this statement.
tagger.evaluate(test_reducido[:1000])
# NOTE(review): entrenar_bill is defined outside this excerpt; presumably it
# trains a Brill tagger on top of `tagger`, with the string as a label - confirm.
entrenar_bill(tagger,"BigramTagger")


# In[ ]:


# Create a CRFTagger with its default constructor arguments.
# NOTE(review): no training or model loading is visible in this excerpt.
ct = CRFTagger()
Example #6
0
from nltk.tag import RegexpTagger
from tag_util import patterns, test_sents


# Build a regular-expression tagger from the patterns imported from tag_util,
# then print its score on the imported test sentences.
tagger = RegexpTagger(regexps=patterns)
print(tagger.evaluate(test_sents))