def default_tag():
    """Demonstrate a default tagger driven by the corpus' most frequent tag.

    The most frequent tag among the tagged words of the Brown corpus 'news'
    category is computed and used to build a ``DefaultTagger``. The tagger is
    applied to a fixed sample sentence, the tagged tokens are printed, and
    then the tagger's score on ``brown_tagged_sents`` is printed.

    The function takes no arguments and returns nothing; its only output is
    what it prints.
    """
    # Only the tags matter for the frequency count; the words are ignored.
    tags = [tag for (_, tag) in brown.tagged_words(categories='news')]
    most_freq_tag = FreqDist(tags).max()

    raw = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'
    tokens = word_tokenize(raw)

    # Build the tagger from the tag computed above instead of a hard-coded
    # literal, so every token really receives the corpus' most frequent tag.
    default_tagger = DefaultTagger(most_freq_tag)
    tagged = default_tagger.tag(tokens)
    print(tagged)

    # NOTE(review): brown_tagged_sents is not defined in this function;
    # presumably a module-level collection of tagged Brown sentences - confirm.
    score = default_tagger.evaluate(brown_tagged_sents)
    print(score)
# Example no. 2
# 0
# Find the tag that occurs most often across the tagged words of the
# Brown corpus.
tags = [pos for (_, pos) in brown.tagged_words()]
tag_distribution = nltk.FreqDist(tags)
most_common_tag = tag_distribution.max()
print(most_common_tag)

from nltk import DefaultTagger

# Sample paragraph used as input for the default tagger.
barack = """Barack Hussein Obama (born August 4, 1961) is an American politician 
who served as the 44th President of the United States from January 20, 2009, to January 20, 2017.
A member of the Democratic Party, he was the first African American to assume the presidency 
and previously served as a United States Senator from Illinois (2005–2008)."""

# The tagger is built from most_common_tag, computed earlier in the script.
default_tagger = DefaultTagger(most_common_tag)

# Tokenize the paragraph, run the tagger over the tokens, and show the result.
tokenized_barack = word_tokenize(barack)
def_tagged_barack = default_tagger.tag(tokenized_barack)
print(def_tagged_barack)

#Lookup Tagger
#Ngram tagger
message = "the quick brown fox jumped over the lazy dog"

# Tag the training sentence once with pos_tag; the result is reused below
# both as training data and as input to nltk.ngrams.
training_tag = pos_tag(word_tokenize(message))
print(training_tag)

# Train an n-gram tagger (n=2) on that single tagged sentence.
ngram_tagger = nltk.NgramTagger(n=2, train=[training_tag])

# Tag a second sentence that uses the same words in a different order.
message2 = "the lazy dog jumped over the quick brown fox"
message2_tags = ngram_tagger.tag(word_tokenize(message2))
print(message2_tags)

# Show the consecutive pairs of (word, tag) tuples from the training
# sentence. training_tag already holds pos_tag(word_tokenize(message)), so
# it is reused instead of tokenizing and tagging the same text a second time.
print(list(nltk.ngrams(training_tag, n=2)))
# Example no. 3
# 0
import nltk
nltk.download('brown')

from nltk.corpus import brown

# Both corpus views below share the same selection: the 'news' category,
# requested with the 'universal' tagset.
news_universal = dict(categories='news', tagset='universal')
brown_news_tagged = brown.tagged_sents(**news_universal)
brown_news_words = brown.tagged_words(**news_universal)

# The first TEST_SIZE sentences are held out as the test split; everything
# after them forms the training split.
TEST_SIZE = 100
brown_test = brown_news_tagged[:TEST_SIZE]
brown_train = brown_news_tagged[TEST_SIZE:]

from nltk.tag import untag

# Take the first test sentence, pass it through untag, and print the
# original and the untagged form with a blank line in between.
first_test_sentence = brown_test[0]
test_sent = untag(first_test_sentence)
print("Tagged: ", first_test_sentence)
print()
print("Untagged: ", test_sent)

# A default tagger gives every word the same tag ('NOUN' here).
from nltk import DefaultTagger
default_tagger = DefaultTagger('NOUN')

# The tagging result is not stored or printed; it would only be displayed
# in an interactive session.
sample_words = 'This is a test'.split()
default_tagger.tag(sample_words)

# Ask NLTK's tagset help for the tags matching the pattern 'VB.*'.
nltk.help.upenn_tagset('VB.*')

sentence = "I cannot bear the pain of bear"
text = nltk.word_tokenize(sentence)
# NOTE: this tagging result is overwritten by the next assignment to `out`
# before it is ever printed or read.
out = nltk.pos_tag(text)

# Build a tuple from the 'bear/NN' string and print it as a whole and as
# an explicit pair of its first two elements.
out = nltk.tag.str2tuple('bear/NN')
print(out)
token, pos = out[0], out[1]
print((token, pos))

# Turn the tuple back into its string form.
print(nltk.tag.tuple2str(out))

# Count how often each tag occurs in the treebank tagged words (requested
# with the 'universal' tagset) and print the counts via most_common().
treebank_tagged = treebank.tagged_words(tagset='universal')
tag = nltk.FreqDist(pair[1] for pair in treebank_tagged)
out = tag.most_common()
print(out)

# Rebind `tag` to a default tagger built with 'NN' and tag two sample words
# with it.
tag = DefaultTagger('NN')
out = tag.tag(['Beautiful', 'morning'])
print(out)
# The bare string below is a reference note written in Chinese: "The ten
# parts of speech in English" - noun (n.), pronoun (pron.), adjective (adj.),
# adverb (adv.), verb (v.), numeral (num.), article (art.),
# preposition (prep.), conjunction (conj.), interjection (interj.).
# It is a plain expression statement and is never printed or assigned.
"""英语的十大词类
1.名词noun n.
2.代词pronoun pron.
3.形容词adjective adj.
4.副词 adverb adv.
5.动词verb v.
6.数词numeral num.
7.冠词article art.
8.介词preposition prep.
9.连词conjunction conj.
10.感叹词interjection interj.
"""
# Visual separator in the printed output.
print("**********************************")
# Store the file ids reported by the NLTK `words` corpus in `out`.
out = nltk.corpus.words.fileids()