# A quick tour of the NLTK "book" texts (text2 = Sense and Sensibility,
# text4 = Inaugural Address Corpus).
from nltk.book import text2, text4


def nltk_base_books():
    # get the contexts in which a word appears
    text2.concordance("monstrous")
    text2.similar("monstrous")
    text2.common_contexts(["monstrous", "very"])

    # positions of words in the text - dispersion plot
    text4.dispersion_plot(
        ["citizens", "democracy", "freedom", "duties", "America"])

    # generate random text in the style of a book
    text2.generate()

    # counting vocabulary in a text
    len(text2)  # count all words and punctuation; i.e. count tokens
    len(set(text2))  # count uniques; i.e. count vocabulary or word types
    sorted(set(text4))  # sorted list of vocabulary or word types
    len(set(text2)) / len(text2)  # measure of the lexical richness of the text
    text2.count("pretty")  # count of a specific word
    text2.count("pretty") / len(text2)  # specific word count as a fraction of the full text

    # indexing -- equivalent to Python lists
    text2[123]
    text2.index("pretty")
    text2[0:15]

    # string operations work as usual
    "the quick brown fox".split()
    " === ".join(text2[:20])
    "Hello " * 3
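
# Usage sketch for the tour above. The `from nltk.book import ...` line needs
# the book collection on disk first; 'book' is NLTK's standard collection id,
# fetched once with nltk.download('book') before running this file.
nltk_base_books()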

# Word clouds and frequency exploration over NLTK corpora.
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

nltk.download('webtext')
nltk.download('treebank')
nltk.download('inaugural')  # added: the inaugural and brown corpora below
nltk.download('brown')      # also need to be fetched once

from nltk.corpus import inaugural

text = inaugural.raw()
wordcloud = WordCloud(max_font_size=60).generate(text)
plt.figure(figsize=(16, 12))
# plot the word cloud with matplotlib
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

from nltk.book import text4 as inaugural_speeches

plt.figure(figsize=(16, 5))
topics = ['sports', 'news', 'Government']
inaugural_speeches.dispersion_plot(topics)

from nltk.corpus import brown

stop_words = set(STOPWORDS)
# brown.words(categories=...) only accepts the corpus' own category names;
# brown has no 'sports' category, so 'hobbies' stands in for it here
topics = ['hobbies', 'news', 'government']
for topic in topics:
    words = [
        word for word in brown.words(categories=topic)
        if word.lower() not in stop_words and word.isalpha()
    ]
    freqdist = nltk.FreqDist(words)
    print(topic, 'most common:',
          ' , '.join(word.lower() for word, count in freqdist.most_common(5)))
    print(topic, 'least common:',
          ' , '.join(word.lower() for word, count in freqdist.most_common()[-5:]))
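
# If a category name is in doubt, the corpus can list its own: brown.categories()
# is part of NLTK's corpus reader API and shows the valid values for the
# categories= argument used above.
print(brown.categories())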

from nltk.book import text1, text4, text6

# concordance(), similar() and collocations() print their results and return
# None, so they are called directly rather than wrapped in print()
text1.concordance("monstrous")
text1.similar("monstrous")
text1.collocations()
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "duties", "America"])
print(text6.count("Very"))
print(text6.count('the') / float(len(text6)) * 100)
print(text4.count("bless"))
print(text4[100])
print(text4.index('the'))
print(text4[524])
print(text4.index('men'))
print(text4[0:len(text4)])
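
# To process concordance matches programmatically rather than just print them,
# newer NLTK releases (3.4+) provide Text.concordance_list(), which returns
# ConcordanceLine objects -- a sketch, assuming such a release is installed:
hits = text1.concordance_list("monstrous")
print(len(hits), hits[0].line)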

from nltk.book import text4

# examine just the contexts that are shared by two or more words
text4.common_contexts(["war", "freedom"])

# Counting vocabulary: the length of a text from start to finish,
# in terms of the words and punctuation symbols that appear. All tokens.
len(text4)

# How many distinct words does the Inaugural Address Corpus contain?
# The vocabulary of a text is just the set of tokens that it uses.
len(set(text4))  # types

len(text4) / len(set(text4))  # Each word is used on average x times: richness of the text.

# Location of a word in the text: how many words from the beginning does it appear?
# This positional information can be displayed with a dispersion plot.
# You need NumPy and Matplotlib.
text4.dispersion_plot(
    ["citizens", "democracy", "freedom", "war", "America", "vote"])

# Generate some random text in the styles we have just seen.
text4.generate()

# count how often a word occurs in a text
text4.count("democracy")

# compute what percentage of the text is taken up by a specific word
100 * text4.count('democracy') / len(text4)


# Define functions:
def lexical_diversity(text):
    return len(text) / len(set(text))
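
# Trying the helper on text4 -- a quick sanity check; the exact value depends
# on the corpus version shipped with NLTK:
print(lexical_diversity(text4))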

# Download the test data (run once)
# nltk.download()
import nltk as nk
from nltk.book import text1 as t1, text4 as t4

print('=============== find a keyword in context ==================')
t1.concordance("america")
print('=============== find words in similar contexts ===============')
t1.similar("america")
print('============= shared grammatical contexts =================')
t1.common_contexts(['in', 'of'])
print('================= lexical dispersion plot =================')
t4.dispersion_plot(['citizens', 'democracy', 'freedom', 'america'])
print('================= most frequent words ================')
freList = nk.FreqDist(t1)
freList.plot(50, cumulative=False)
print('================= words longer than 15 characters ===============')
v = set(t1)
long_words = list(filter(lambda x: len(x) > 15, v))[:10]  # filter() is lazy in Python 3
print(long_words)
print('================= common bigram pairings ===============')
bigram_pairs = nk.bigrams(['all', 'in', 'of', 'take', 'like'])
for x in bigram_pairs:
    print(x)
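
# FreqDist offers more than plot(): hapaxes() (a real NLTK method) returns
# the words that occur exactly once in the text -- a short sketch:
print(freList.hapaxes()[:10])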

from nltk.book import text4

text4.similar("vote")

# examine just the contexts that are shared by two or more words
text4.common_contexts(["war", "freedom"])

# Counting vocabulary: the length of a text from start to finish,
# in terms of the words and punctuation symbols that appear. All tokens.
len(text4)

# How many distinct words does the Inaugural Address Corpus contain?
# The vocabulary of a text is just the set of tokens that it uses.
len(set(text4))  # types

len(text4) / len(set(text4))  # Each word is used on average x times: richness of the text.

# Location of a word in the text: how many words from the beginning does it appear?
# This positional information can be displayed with a dispersion plot.
# You need NumPy and Matplotlib.
text4.dispersion_plot(["citizens", "democracy", "freedom", "war", "America", "vote"])

# Generate some random text in the styles we have just seen.
text4.generate()

# count how often a word occurs in a text
text4.count("democracy")

# compute what percentage of the text is taken up by a specific word
100 * text4.count('democracy') / len(text4)


# Define functions:
def lexical_diversity(text):
    return len(text) / len(set(text))


def percentage(count, total):
    return 100 * count / total
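
# Example calls for the two helpers; the figures vary with the NLTK data
# version, so treat this as a sketch rather than expected output:
print(lexical_diversity(text4))
print(percentage(text4.count('a'), len(text4)))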