def regex_tag():
    """Tag two sample sentences with a suffix-based RegexpTagger and print the results.

    Builds a ``RegexpTagger`` from a fixed list of ``(regex, tag)`` rules,
    tags the tokenized sample sentences, prints both tagged sequences and
    finally prints the tagger's score on ``brown_tagged_sents``.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    raw_incorrect = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'
    # The tagger assigns the tag of the first rule whose regex matches a
    # token, so specific rules precede the catch-all at the end.
    patterns = [
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # simple past
        (r'.*es$', 'VBZ'),  # 3rd singular present
        (r'.*ould$', 'MD'),  # modals
        (r'.*\'s$', 'NN$'),  # possessive nouns
        (r'.*s$', 'NNS'),  # plural nouns
        # The decimal point is escaped: an unescaped '.' would accept any
        # character between the digit groups (e.g. "3x14").
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),  # nouns (default)
    ]
    regexp_tagger = RegexpTagger(patterns)
    tagged = regexp_tagger.tag(word_tokenize(raw))
    tagged_incorrect = regexp_tagger.tag(word_tokenize(raw_incorrect))
    print(tagged)
    print(tagged_incorrect)
    # brown_tagged_sents is not defined in this function; it must be
    # available at module level when regex_tag() is called.
    score = regexp_tagger.evaluate(brown_tagged_sents)
    print(score)
Beispiel #2
0
    (r'.*ould$', 'MD'),  #modal
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  #plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  #nouns (default)
]
# Regular-expression tagger built from the pattern list defined above.
regexp_tagger = RegexpTagger(patterns)

# Backoff chain trained on brownT90: trigram -> bigram -> unigram -> default.
uniB = UnigramTagger(brownT90, backoff=defaultTB90)
biB = BigramTagger(brownT90, backoff=uniB)
triB = TrigramTagger(brownT90, backoff=biB)

# Same chain trained on chatT50. The trigram tagger backs off to the bigram
# tagger (it previously skipped straight to uniC, leaving biC unused and
# making this chain inconsistent with the Brown chain above).
uniC = UnigramTagger(chatT50, backoff=defaultTChat50)
biC = BigramTagger(chatT50, backoff=uniC)
triC = TrigramTagger(chatT50, backoff=biC)

# NOTE(review): every tagger below is scored on the same data set it was
# trained on. The "50/50" and "90/10" labels suggest a train/test split was
# intended -- confirm whether held-out sets should be passed to evaluate().
print("Regextag50/50: ", regexp_tagger.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT50))

print("Bigram Brown 50/50: ",
      BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))

print("Bigram Brown 90/10: ",
      BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90))
print("Default: ", defaultTB90.evaluate(brownT90))

print("Unigram chat 50/50: ",
      UnigramTagger(chatT50, backoff=defaultTChat50).evaluate(chatT50))
print("Default: ", defaultTChat50.evaluate(chatT50))

print("Unigram chat 90/10: ",
Beispiel #3
0
# Apply the *RegexpTagger* to the sentence at index 3 (i.e. the fourth sentence) of the brown corpus.

# In[5]:


# The result is not assigned to a name; it is only visible as the output of this notebook cell.
regexp_tagger.tag(brown.sents()[3])


# Evaluate the tagger using category _news_ of the brown corpus. The `evaluate()`-method returns the accuracy (i.e. the rate of correct Tag-assignments) of the tagger on this test-corpus.

# In[6]:


# Tagged sentences of the _news_ category serve as the test corpus.
brown_tagged_sents=brown.tagged_sents(categories='news')
print(regexp_tagger.evaluate(brown_tagged_sents))


# ## Unigram Tagger

# In[7]:


from nltk import UnigramTagger, DefaultTagger, BigramTagger
from nltk import FreqDist,ConditionalFreqDist


# A UnigramTagger requires a tagged corpus. From the tagged corpus it learns a mapping from word to pos-tag by determining for each word the most frequent tag in the corpus. The trained tagger then assigns to each word the most frequent pos-tag as determined in the training corpus.
# 
# In this notebook the POS-tagged Brown Corpus is used. The tagset used in this corpus is quite sophisticated. It can be obtained by the following command:
dt.evaluate(test_data)

# Ordered (regex, tag) rules for the RegexpTagger: a token receives the tag
# of the first rule whose regex matches it, so the catch-all stays last.
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    # The decimal point is escaped: an unescaped '.' would accept any
    # character between the digit groups (e.g. "3x14").
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),  # nouns (default) ...
]

# Regular-expression tagger built from the pattern list above.
rt = RegexpTagger(patterns)

# Score the regex tagger on test_data. This is a bare expression: the value
# is only visible when run interactively (e.g. as a notebook cell).
rt.evaluate(test_data)

# N-gram taggers trained on train_data; none of them is given a backoff tagger.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Score only the unigram tagger here; bt and tt are not evaluated in this span.
ut.evaluate(test_data)


def combined_tagger(train_data, taggers, backoff=None):
    """Build a backoff chain by instantiating each tagger class in turn.

    Args:
        train_data: Training data passed as the first argument to every
            tagger constructor.
        taggers: Iterable of callables accepting ``(train_data, backoff=...)``,
            ordered from the innermost fallback to the outermost tagger.
        backoff: Tagger handed to the first constructor as its fallback;
            ``None`` means the chain starts without one.

    Returns:
        The tagger created last, which wraps all earlier ones through their
        ``backoff`` argument. If ``taggers`` is empty, ``backoff`` is
        returned unchanged.
    """
    chain = backoff
    for make_tagger in taggers:
        # Each new tagger falls back to the chain built so far.
        chain = make_tagger(train_data, backoff=chain)
    return chain


ct = combined_tagger(train_data=train_data,
Beispiel #5
0
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
    ]
    

# Extra (regex, tag) rules that the evaluation loop adds to `patterns` one at
# a time. The tags are Brown-corpus tags, matching the Brown "news" sentences
# the tagger is scored against.
additions = [
    (r'[.?;!:]', '.'),   # sentence-level punctuation
    ('\\($', '('),       # opening parenthesis
    # Brown tags adverbs as 'RB'. The earlier 'ADV' tag is not a Brown tag;
    # the recorded transcript in this file (produced with 'ADV') shows the
    # accuracy dropping when this rule was added.
    (r'.*ly$', 'RB'),
    ('n[o\']t$', '*'),   # negation: "not" / "n't"
    (r'^,$', ','),       # comma
]

# Evaluation corpus: tagged sentences of the Brown "news" category. Looked up
# once instead of on every loop iteration.
news_tagged_sents = brown.tagged_sents(categories='news')

# Baseline accuracy with the unmodified pattern list.
# print(...) with a single argument prints the same text on Python 2 and 3.
ret = RegexpTagger(patterns)
print(ret.evaluate(news_tagged_sents))

# Add the extra rules one at a time and report the accuracy after each step.
for pattern in additions:
    # insert(-1, ...) places the rule just before the last entry, so the
    # catch-all default rule stays at the end of the list.
    patterns.insert(-1, pattern)
    print("added pattern {}".format(pattern))
    ret = RegexpTagger(patterns)
    print(ret.evaluate(news_tagged_sents))
    
# 0.203263917895
# added pattern ('[.?;!:]', '.')
# 0.247538635957
# added pattern ('\\($', '(')
# 0.24901048193
# added pattern ('.*ly$', 'ADV')
# 0.248314338564
# added pattern ("n[o']t$", '*')