from nltk import FreqDist
from nltk.book import text1

word_len = [len(w) for w in text1]
print(word_len)

# Example                       Description
# fdist = FreqDist(samples)     create a frequency distribution containing the given samples
# fdist[sample] += 1            increment the count for this sample
# fdist['monstrous']            count of the number of times a given sample occurred
# fdist.freq('monstrous')       frequency of a given sample
# fdist.N()                     total number of samples
# fdist.most_common(n)          the n most common samples and their frequencies
# for sample in fdist:          iterate over the samples
# fdist.max()                   sample with the greatest count
# fdist.tabulate()              tabulate the frequency distribution
# fdist.plot()                  graphical plot of the frequency distribution
# fdist.plot(cumulative=True)   cumulative plot of the frequency distribution
# fdist1 |= fdist2              update fdist1 with counts from fdist2
# fdist1 < fdist2               test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print(dict(fdlist))
print(fdlist.most_common(3))
print(fdlist.max())
print(fdlist[2])
fdlist.tabulate()               # tabulate() prints its own output and returns None
fdlist.plot()
fdlist.plot(cumulative=True)
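# The last two rows of the table above (fdist1 |= fdist2 and fdist1 < fdist2) are not
# exercised in the code, so here is a minimal sketch of how they behave; the toy word
# lists are invented for illustration, and note that, as with collections.Counter, the
# union keeps the larger of the two counts for each sample rather than summing them.
fdist1 = FreqDist(['the', 'the', 'cat'])
fdist2 = FreqDist(['the', 'dog'])

fdist1 |= fdist2          # union: fdist1 now covers every sample from either distribution
print('dog' in fdist1)    # True  -- 'dog' was absorbed from fdist2
print(fdist1 < fdist2)    # False -- fdist1 is no longer a strict "sub-distribution" of fdist2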
# (fragment of a larger script: args, text, stoplist and orig_freq_dist are set up earlier;
#  punctuation comes from string, word_tokenize / LancasterStemmer / FreqDist from nltk)
if args.stop_punctuation:
    # string.punctuation is already text in Python 3; no .decode('UTF8') needed
    stoplist += list(set(punctuation))
    stoplist += ['\u201d', '\u201c', '\u2019', '\u2014']  # curly quotes and the em dash
    stoplist.append('--')

words = [word for word in word_tokenize(text) if word not in stoplist]

if args.stem:
    st = LancasterStemmer()
    words = [st.stem(word) for word in words]

freq_dist = FreqDist(words)

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))
# B() gives the number of unique words
print('Unique words: ' + str(freq_dist.B()))
print('Unique words ratio: ' + str(float(freq_dist.B()) / float(freq_dist.N())))
print('\n')

if args.words:
    for word in args.words:
        print(word + ': ' + str(freq_dist[word]))
        print(word + ' freq: ' + str(freq_dist.freq(word)))
    print('\n')

# Show the top args.num_words words
print('Top ' + str(args.num_words) + ' words:')
freq_dist.tabulate(args.num_words)
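# orig_freq_dist above is the distribution over the unfiltered tokens, built earlier in the
# script. A minimal, self-contained sketch of that before/after comparison on a toy sentence
# (the sentence and the stoplist below are invented for illustration):
from string import punctuation
from nltk import FreqDist
from nltk.tokenize import word_tokenize

text = "The cat sat on the mat, and the cat slept."
stoplist = ['the', 'and', 'on'] + list(punctuation)

orig_tokens = [w.lower() for w in word_tokenize(text)]
orig_freq_dist = FreqDist(orig_tokens)          # before filtering

words = [w for w in orig_tokens if w not in stoplist]
freq_dist = FreqDist(words)                     # after filtering

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))
print('Unique words: ' + str(freq_dist.B()))    # B() counts distinct samples
print('Unique words ratio: ' + str(freq_dist.B() / freq_dist.N()))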
from nltk import FreqDist
from nltk.corpus import brown

fd = FreqDist(brown.words())
# Words that occur more than 7 times and are more than 7 characters long
rare_and_long = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)

### SUMMARY
# Other functions and attributes of a frequency distribution object
fd['County']            # count of a specific word
fd.freq('County')       # frequency of a specific word
fd.N()                  # total number of samples
fd.most_common(10)      # the 10 most common samples and their counts
for sample in fd:       # iterate over the samples
    print(sample)
fd.max()                # sample with the greatest count
fd.tabulate()           # tabulate the frequency distribution
fd.plot()               # plot the frequency distribution
# fd1 |= fd2            # update fd1 with counts from fd2
# fd1 < fd2             # test if samples in fd1 occur less frequently than in fd2

### IMPORTING TEXT
# NLTK comes with a collection of texts to get started. To import a specific text:
from nltk.book import text1
from nltk.book import sent7

### USING CONDITIONALS
# select words based on their length
[w for w in sent7 if len(w) < 4]
# select words based on other attributes
[w for w in sent7 if w.startswith('t')]   # w.startswith('t') is the same as w[0] == 't'
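# Conditionals combine naturally with FreqDist: a short sketch (assuming the Brown corpus
# is available locally) that ranks only the long words by how often they occur, plus the
# same kind of filter written as a comprehension over the full distribution.
long_word_counts = FreqDist(w for w in brown.words() if len(w) > 7)
print(long_word_counts.most_common(10))                       # most frequent long words

print(sorted(w for w in fd if len(w) > 7 and fd[w] > 200))    # long words occurring over 200 times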
#!/usr/bin/python
# coding: utf-8
# 2013/03/20

from nltk import FreqDist

fdist = FreqDist(samples)       # create a frequency distribution from the data given in samples
fdist[sample] += 1              # increment the count for sample (replaces the old fdist.inc(sample))
fdist['データ']                  # number of occurrences of the given sample
fdist.freq('データ')             # frequency of the given sample
fdist.N()                       # total number of samples
fdist.keys()                    # the samples (use fdist.most_common() for a frequency-sorted list in NLTK 3)
for sample in fdist:            # iterate over the samples
    pass
fdist.max()                     # sample with the greatest count
fdist.tabulate()                # display the frequency distribution as a table
fdist.plot()                    # plot the frequency distribution
fdist.plot(cumulative=True)     # plot the cumulative frequency distribution
fdist1 < fdist2                 # test if samples in fdist1 occur less frequently than in fdist2
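# fdist[sample] += 1 is the NLTK 3 replacement for the old fdist.inc(sample); a small
# sketch of building a distribution incrementally, one token at a time (toy data):
fdist = FreqDist()                     # start from an empty distribution
for token in ['red', 'green', 'red', 'blue', 'red']:
    fdist[token] += 1                  # increment the count for this sample

print(fdist['red'])                    # 3
print(fdist.max())                     # 'red'
print(fdist.most_common(1))            # [('red', 3)]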
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

# lemmatize (text_title and remove_stopwords are defined earlier in the script)
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))

# check the collocations of the text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()

fdist_title = FreqDist(nostop_title)      # frequency distribution of the text
fdist_title.most_common(50)               # 50 most common words
fdist_title['science']                    # count of a given word
fdist_title.max()                         # most frequent word
fdist_title.plot(50, cumulative=True)     # cumulative plot
fdist_title.plot(50)
fdist_title.tabulate(50)                  # tabulate

total_words = len(set(nostop_title))
print("The number of unique words in the titles of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the titles of dsc appears " + str(int(avg_words)) + " times")

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams

word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
bigrams_title.plot(50, cumulative=True)
trigrams_title.most_common(20)
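# bigrams_title and trigrams_title are ordinary frequency distributions keyed by tuples,
# so individual n-gram counts can be looked up directly; the word pair below is only an
# illustrative guess, not taken from the dsc titles (a missing pair simply counts as 0).
print(bigrams_title[('data', 'science')])                           # count of one specific word pair
print(trigrams_title.max(), trigrams_title[trigrams_title.max()])   # most frequent trigram and its count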
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg

# (inside a loop such as: for idx, word in enumerate(gutenberg.words('austen-persuasion.txt')): )
print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen

##################################################################
## Frequency of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]

##################################################################
## Counting English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # a generator works here; no need to wrap it in []
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()])  # the 26 letters ordered by frequency of use

##################################################################
## most_common(n): the n most frequent words, sorted by frequency
# fd is the FreqDist over the word tokens of Persuasion, built earlier
print(fd.most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
fd.tabulate()  # tabular form of most_common()
# the same result via collections.Counter
from collections import Counter
print(Counter(fd).most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]

# Jane Austen's novel Persuasion contains 98171 word tokens and 6141 unique words. The most common token is the comma, followed by the word "the".
# If you count a large corpus and record each word's count and frequency from highest to lowest, a clear relationship between word frequency and word rank emerges.
# Zipf showed that this relationship can be expressed mathematically: for any given word, f * r = k (a constant),
# where f is the word's frequency and r is its rank, i.e. its position in the frequency-sorted list.
# A more elaborate form is f * r = 1 / log(N), where N is the total number of words.
# For example, the fifth most frequent word should occur about twice as often as the tenth most frequent word. In the NLP literature this relationship is known as "Zipf's Law".
# Even if the mathematical relationship described by Zipf's law is not exactly accurate, it is still useful for characterizing how words are distributed in human language:
# low-rank words occur very often, words of somewhat higher rank occur less often, and words of very high rank hardly occur at all.
# The corresponding log-log relationship is shown in Figure 1, which makes the scaling behaviour of our corpus easy to see.

##################################################################
## Plotting Zipf's law with NLTK
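# The plotting code that belongs under this heading is not included above; a sketch of one
# common way to draw the log-log rank/frequency plot (assuming matplotlib is installed and
# the Gutenberg corpus has been downloaded):
import matplotlib.pyplot as plt

fd = FreqDist(gutenberg.words('austen-persuasion.txt'))

ranks = range(1, fd.B() + 1)                        # 1, 2, ..., number of distinct words
freqs = [count for _, count in fd.most_common()]    # counts in descending order

plt.loglog(list(ranks), freqs)                      # Zipf's law shows up as a roughly straight line
plt.xlabel('word rank')
plt.ylabel('word frequency')
plt.title("Zipf's law in Persuasion")
plt.show()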
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader

tweettknzr = TweetTokenizer()
file0 = nltk.corpus.gutenberg.fileids()[0]
emmatext = nltk.corpus.gutenberg.raw(file0)
emmatokens = nltk.word_tokenize(emmatext)
#emmatokens2 = tweettknzr.tokenize(emmatext)
emmawords = [w.lower() for w in emmatokens]
emmavocab = sorted(set(emmawords))
revisemmawords = [w for w in emmawords if w.isalpha()]
fdist = FreqDist(revisemmawords)
topkeys = fdist.most_common(50)
# total number of samples
print(fdist.N())
print(topkeys)
fdist.tabulate()            # tabulate() prints its table itself and returns None
fdist.plot(cumulative=True)
# for pair in topkeys:
#     print(pair)

mycorpus = PlaintextCorpusReader('.', r'.*\.txt')
print(mycorpus.fileids())
print(mycorpus.words())
labweekstring = mycorpus.raw('labweek.txt')
# print(mycorpus)
# print(labweekstring)
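# Once mycorpus is built, the same FreqDist machinery applies to the local file; a brief
# sketch (assuming labweek.txt really is present in the working directory, as the raw()
# call above already expects):
labweekwords = [w.lower() for w in mycorpus.words('labweek.txt') if w.isalpha()]
labweekfdist = FreqDist(labweekwords)
print(labweekfdist.most_common(20))
labweekfdist.tabulate(20)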
from nltk import FreqDist
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# (inside a loop that collects every product name into item; indexid is built in the same loop)
item.append(x)

# compute word frequencies
fre = FreqDist(item)
print(fre.most_common(10))

# turn the top 10 into parallel lists of labels and counts
list_fre = list(fre.most_common(10))
list_item = []
list_count = []
for x in list_fre:
    list_item.append(x[0])
    list_count.append(x[1])

# frequency distribution table and plot
fre.tabulate(10)
fre.plot(10)

# pie chart
plt.pie(x=list_count, labels=list_item)
plt.show()

# bar chart
plt.bar(list_item, list_count)
plt.show()

# bar chart with Seaborn (newer Seaborn versions require keyword arguments)
sns.barplot(x=list_item, y=list_count)
plt.show()

# rebuild the DataFrame from the items and their transaction indices
data = pd.DataFrame({'item': item, 'transaction': indexid})
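# Instead of unpacking most_common() into two parallel lists, the top counts could also go
# straight into a pandas Series, which both matplotlib and seaborn accept; a sketch of that
# alternative, reusing fre from above:
top10 = pd.Series(dict(fre.most_common(10)))    # labels become the index, counts the values

top10.plot(kind='bar')                          # bar chart directly from the Series
plt.show()

sns.barplot(x=top10.index, y=top10.values)      # same data through seaborn
plt.show()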