def default_tag():
    """Demonstrate a default (single-tag) POS tagger on a sample sentence.

    Finds the most frequent tag among the tagged words of the Brown
    'news' category, builds a ``DefaultTagger`` that assigns that tag to
    every token, prints the tagged sample sentence, and then prints the
    tagger's score on ``brown_tagged_sents``.

    Returns:
        None. Results are written to standard output.
    """
    # The most frequent tag in the corpus is the best single-tag guess.
    most_freq_tag = FreqDist(
        tag for (_, tag) in brown.tagged_words(categories='news')
    ).max()

    raw = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'
    tokens = word_tokenize(raw)

    # Use the tag computed above instead of a hard-coded literal, so the
    # tagger always agrees with the corpus statistics.
    # NOTE(review): the original hard-coded 'NN'; for Brown 'news' the most
    # frequent tag is expected to be 'NN' as well - confirm on your data.
    default_tagger = DefaultTagger(most_freq_tag)
    tagged = default_tagger.tag(tokens)
    print(tagged)

    # brown_tagged_sents is not defined in this function; it must exist at
    # module level (presumably tagged Brown sentences - verify).
    # NOTE(review): newer NLTK releases deprecate evaluate() in favour of
    # accuracy() - confirm against the installed version.
    score = default_tagger.evaluate(brown_tagged_sents)
    print(score)
from nltk import DefaultTagger

# --- Default tagger -------------------------------------------------------
# Find the single most common tag over all tagged words of the Brown corpus.
tags = [pos for (_, pos) in brown.tagged_words()]
most_common_tag = nltk.FreqDist(tags).max()
print(most_common_tag)

barack = (
    "Barack Hussein Obama (born August 4, 1961) is an American politician "
    "who served as the 44th President of the United States from "
    "January 20, 2009, to January 20, 2017. A member of the Democratic "
    "Party, he was the first African American to assume the presidency "
    "and previously served as a United States Senator from Illinois "
    "(2005–2008)."
)
tokenized_barack = word_tokenize(barack)

# Every token receives the corpus-wide most common tag.
default_tagger = DefaultTagger(most_common_tag)
def_tagged_barack = default_tagger.tag(tokenized_barack)
print(def_tagged_barack)

# --- Lookup tagger --------------------------------------------------------
# (heading only; no code follows it in this section)

# --- N-gram tagger --------------------------------------------------------
message = "the quick brown fox jumped over the lazy dog"
training_tag = pos_tag(word_tokenize(message))
print(training_tag)

# Train a bigram (n=2) tagger on that single tagged sentence.
ngram_tagger = nltk.NgramTagger(n=2, train=[training_tag])

# Tag a sentence made of the same words in a different order.
message2 = "the lazy dog jumped over the quick brown fox"
message2_tokens = word_tokenize(message2)
message2_tags = ngram_tagger.tag(message2_tokens)
print(message2_tags)

# Show the bigrams of (word, tag) pairs for the training sentence.
message_bigrams = nltk.ngrams(pos_tag(word_tokenize(message)), n=2)
print(list(message_bigrams))
import nltk
from nltk import DefaultTagger
from nltk.corpus import brown
from nltk.tag import untag

# Fetch the Brown corpus data before it is first accessed.
nltk.download('brown')

# Brown 'news' category, mapped to the universal tagset.
brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
brown_news_words = brown.tagged_words(categories='news', tagset='universal')

# The first 100 sentences are held out for testing; the rest are for training.
brown_test, brown_train = brown_news_tagged[:100], brown_news_tagged[100:]

# Strip the tags from the first test sentence and show both forms.
test_sent = untag(brown_test[0])
print("Tagged: ", brown_test[0])
print()
print("Untagged: ", test_sent)

# A default tagger assigns the same tag ('NOUN' here) to every word.
default_tagger = DefaultTagger('NOUN')
# The result is not stored or printed (it would only show in an
# interactive session).
default_tagger.tag('This is a test'.split())
# Print the Penn Treebank tagset documentation for tags matching 'VB.*'.
nltk.help.upenn_tagset('VB.*')

# Tokenize and POS-tag a sample sentence. The tagged result is overwritten
# by the next assignment to `out` without being displayed.
text = nltk.word_tokenize("I cannot bear the pain of bear")
out = nltk.pos_tag(text)

# Convert between the 'word/TAG' string form and the (word, tag) tuple form.
out = nltk.tag.str2tuple('bear/NN')
print(out)
print(tuple(out[:2]))
print(nltk.tag.tuple2str(out))

# Frequency of each universal tag over the treebank tagged words,
# listed from most to least common.
treebank_tagged = treebank.tagged_words(tagset='universal')
tag = nltk.FreqDist(pos for (_, pos) in treebank_tagged)
out = tag.most_common()
print(out)

# A default tagger labels every token with the same tag.
tag = DefaultTagger('NN')
out = tag.tag(['Beautiful', 'morning'])
print(out)

# English translation of the note below - "The ten parts of speech in
# English": 1. noun (n.), 2. pronoun (pron.), 3. adjective (adj.),
# 4. adverb (adv.), 5. verb (v.), 6. numeral (num.), 7. article (art.),
# 8. preposition (prep.), 9. conjunction (conj.), 10. interjection (interj.)
"""英语的十大词类 1.名词noun n. 2.代词pronoun pron. 3.形容词adjective adj. 4.副词 adverb adv. 5.动词verb v. 6.数词numeral num. 7.冠词article art. 8.介词preposition prep. 9.连词conjunction conj. 10.感叹词interjection interj. """

print("**********************************")

# File identifiers available in the NLTK `words` corpus.
out = nltk.corpus.words.fileids()