from nltk import FreqDist
from nltk.book import text1

word_len = [len(w) for w in text1]
print(word_len)

# Example                       Description
# fdist = FreqDist(samples)     create a frequency distribution containing the given samples
# fdist[sample] += 1            increment the count for this sample
# fdist['monstrous']            count of the number of times a given sample occurred
# fdist.freq('monstrous')       frequency of a given sample
# fdist.N()                     total number of samples
# fdist.most_common(n)          the n most common samples and their frequencies
# for sample in fdist:          iterate over the samples
# fdist.max()                   sample with the greatest count
# fdist.tabulate()              tabulate the frequency distribution
# fdist.plot()                  graphical plot of the frequency distribution
# fdist.plot(cumulative=True)   cumulative plot of the frequency distribution
# fdist1 |= fdist2              update fdist1 with counts from fdist2
# fdist1 < fdist2               test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print(dict(fdlist))
print(fdlist.most_common(3))
print(fdlist.max())
print(fdlist[2])
fdlist.tabulate()               # tabulate() prints its own output and returns None
fdlist.plot()
fdlist.plot(cumulative=True)
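# The last two rows of the table above (fdist1 |= fdist2 and fdist1 < fdist2) are not
# exercised in the code, so here is a minimal sketch of how they behave; the toy word
# lists are invented for illustration, and note that, as with collections.Counter, the
# union keeps the larger of the two counts for each sample rather than summing them.
fdist1 = FreqDist(['the', 'the', 'cat'])
fdist2 = FreqDist(['the', 'dog'])

fdist1 |= fdist2          # union: fdist1 now covers every sample from either distribution
print('dog' in fdist1)    # True  -- 'dog' was absorbed from fdist2
print(fdist1 < fdist2)    # False -- fdist1 is no longer a strict "sub-distribution" of fdist2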
# (fragment of a larger script: args, text, stoplist and orig_freq_dist are set up earlier;
#  punctuation comes from string, word_tokenize / LancasterStemmer / FreqDist from nltk)
if args.stop_punctuation:
    # string.punctuation is already text in Python 3; no .decode('UTF8') needed
    stoplist += list(set(punctuation))
    stoplist += ['\u201d', '\u201c', '\u2019', '\u2014']  # curly quotes and the em dash
    stoplist.append('--')

words = [word for word in word_tokenize(text) if word not in stoplist]

if args.stem:
    st = LancasterStemmer()
    words = [st.stem(word) for word in words]

freq_dist = FreqDist(words)

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))
# B() gives the number of unique words
print('Unique words: ' + str(freq_dist.B()))
print('Unique words ratio: ' + str(float(freq_dist.B()) / float(freq_dist.N())))
print('\n')

if args.words:
    for word in args.words:
        print(word + ': ' + str(freq_dist[word]))
        print(word + ' freq: ' + str(freq_dist.freq(word)))
    print('\n')

# Show the top args.num_words words
print('Top ' + str(args.num_words) + ' words:')
freq_dist.tabulate(args.num_words)
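# orig_freq_dist above is the distribution over the unfiltered tokens, built earlier in the
# script. A minimal, self-contained sketch of that before/after comparison on a toy sentence
# (the sentence and the stoplist below are invented for illustration):
from string import punctuation
from nltk import FreqDist
from nltk.tokenize import word_tokenize

text = "The cat sat on the mat, and the cat slept."
stoplist = ['the', 'and', 'on'] + list(punctuation)

orig_tokens = [w.lower() for w in word_tokenize(text)]
orig_freq_dist = FreqDist(orig_tokens)          # before filtering

words = [w for w in orig_tokens if w not in stoplist]
freq_dist = FreqDist(words)                     # after filtering

print('Total words: ' + str(orig_freq_dist.N()))
print('Total after filter: ' + str(freq_dist.N()))
print('Unique words: ' + str(freq_dist.B()))    # B() counts distinct samples
print('Unique words ratio: ' + str(freq_dist.B() / freq_dist.N()))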
from nltk import FreqDist
from nltk.corpus import brown

fd = FreqDist(brown.words())
# Words that occur more than 7 times and are more than 7 characters long
rare_and_long = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)

### SUMMARY
# Other functions and attributes of a frequency distribution object
fd['County']            # count of a specific word
fd.freq('County')       # frequency of a specific word
fd.N()                  # total number of samples
fd.most_common(10)      # the 10 most common samples and their counts
for sample in fd:       # iterate over the samples
    print(sample)
fd.max()                # sample with the greatest count
fd.tabulate()           # tabulate the frequency distribution
fd.plot()               # plot the frequency distribution
# fd1 |= fd2            # update fd1 with counts from fd2
# fd1 < fd2             # test if samples in fd1 occur less frequently than in fd2

### IMPORTING TEXT
# NLTK comes with a collection of texts to get started. To import a specific text:
from nltk.book import text1
from nltk.book import sent7

### USING CONDITIONALS
# select words based on their length
[w for w in sent7 if len(w) < 4]
# select words based on other attributes
[w for w in sent7 if w.startswith('t')]   # w.startswith('t') is the same as w[0] == 't'
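# Conditionals combine naturally with FreqDist: a short sketch (assuming the Brown corpus
# is available locally) that ranks only the long words by how often they occur, plus the
# same kind of filter written as a comprehension over the full distribution.
long_word_counts = FreqDist(w for w in brown.words() if len(w) > 7)
print(long_word_counts.most_common(10))                       # most frequent long words

print(sorted(w for w in fd if len(w) > 7 and fd[w] > 200))    # long words occurring over 200 times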
#!/usr/bin/python
# coding: utf-8
# 2013/03/20

from nltk import FreqDist

fdist = FreqDist(samples)       # create a frequency distribution from the data given in samples
fdist[sample] += 1              # increment the count for sample (replaces the old fdist.inc(sample))
fdist['データ']                  # number of occurrences of the given sample
fdist.freq('データ')             # frequency of the given sample
fdist.N()                       # total number of samples
fdist.keys()                    # the samples (use fdist.most_common() for a frequency-sorted list in NLTK 3)
for sample in fdist:            # iterate over the samples
    pass
fdist.max()                     # sample with the greatest count
fdist.tabulate()                # display the frequency distribution as a table
fdist.plot()                    # plot the frequency distribution
fdist.plot(cumulative=True)     # plot the cumulative frequency distribution
fdist1 < fdist2                 # test if samples in fdist1 occur less frequently than in fdist2
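# fdist[sample] += 1 is the NLTK 3 replacement for the old fdist.inc(sample); a small
# sketch of building a distribution incrementally, one token at a time (toy data):
fdist = FreqDist()                     # start from an empty distribution
for token in ['red', 'green', 'red', 'blue', 'red']:
    fdist[token] += 1                  # increment the count for this sample

print(fdist['red'])                    # 3
print(fdist.max())                     # 'red'
print(fdist.most_common(1))            # [('red', 3)]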
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

# lemmatize (text_title and remove_stopwords are defined earlier in the script)
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))

# check the collocations of the text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()

fdist_title = FreqDist(nostop_title)      # frequency distribution of the text
fdist_title.most_common(50)               # 50 most common words
fdist_title['science']                    # count of a given word
fdist_title.max()                         # most frequent word
fdist_title.plot(50, cumulative=True)     # cumulative plot
fdist_title.plot(50)
fdist_title.tabulate(50)                  # tabulate

total_words = len(set(nostop_title))
print("The number of unique words in the titles of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("On average, each word in the titles of dsc appears " + str(int(avg_words)) + " times")

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams

word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
bigrams_title.plot(50, cumulative=True)
trigrams_title.most_common(20)
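# bigrams_title and trigrams_title are ordinary frequency distributions keyed by tuples,
# so individual n-gram counts can be looked up directly; the word pair below is only an
# illustrative guess, not taken from the dsc titles (a missing pair simply counts as 0).
print(bigrams_title[('data', 'science')])                           # count of one specific word pair
print(trigrams_title.max(), trigrams_title[trigrams_title.max()])   # most frequent trigram and its count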
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg

# (inside a loop such as: for idx, word in enumerate(gutenberg.words('austen-persuasion.txt')): )
print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen

##################################################################
## Frequency of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]

##################################################################
## Counting English characters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # a generator works here; no need to wrap it in []
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()])  # the 26 letters ordered by frequency of use

##################################################################
## most_common(n): the n most frequent words, sorted by frequency
# fd is the FreqDist over the word tokens of Persuasion, built earlier
print(fd.most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
fd.tabulate()  # tabular form of most_common()
# the same result via collections.Counter
from collections import Counter
print(Counter(fd).most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]

# Jane Austen's novel Persuasion contains 98171 word tokens and 6141 unique words. The most common token is the comma, followed by the word "the".
# If you count a large corpus and record each word's count and frequency from highest to lowest, a clear relationship between word frequency and word rank emerges.
# Zipf showed that this relationship can be expressed mathematically: for any given word, f * r = k (a constant),
# where f is the word's frequency and r is its rank, i.e. its position in the frequency-sorted list.
# A more elaborate form is f * r = 1 / log(N), where N is the total number of words.
# For example, the fifth most frequent word should occur about twice as often as the tenth most frequent word. In the NLP literature this relationship is known as "Zipf's Law".
# Even if the mathematical relationship described by Zipf's law is not exactly accurate, it is still useful for characterizing how words are distributed in human language:
# low-rank words occur very often, words of somewhat higher rank occur less often, and words of very high rank hardly occur at all.
# The corresponding log-log relationship is shown in Figure 1, which makes the scaling behaviour of our corpus easy to see.

##################################################################
## Plotting Zipf's law with NLTK
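# The plotting code that belongs under this heading is not included above; a sketch of one
# common way to draw the log-log rank/frequency plot (assuming matplotlib is installed and
# the Gutenberg corpus has been downloaded):
import matplotlib.pyplot as plt

fd = FreqDist(gutenberg.words('austen-persuasion.txt'))

ranks = range(1, fd.B() + 1)                        # 1, 2, ..., number of distinct words
freqs = [count for _, count in fd.most_common()]    # counts in descending order

plt.loglog(list(ranks), freqs)                      # Zipf's law shows up as a roughly straight line
plt.xlabel('word rank')
plt.ylabel('word frequency')
plt.title("Zipf's law in Persuasion")
plt.show()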
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader

tweettknzr = TweetTokenizer()
file0 = nltk.corpus.gutenberg.fileids()[0]
emmatext = nltk.corpus.gutenberg.raw(file0)
emmatokens = nltk.word_tokenize(emmatext)
#emmatokens2 = tweettknzr.tokenize(emmatext)
emmawords = [w.lower() for w in emmatokens]
emmavocab = sorted(set(emmawords))
revisemmawords = [w for w in emmawords if w.isalpha()]
fdist = FreqDist(revisemmawords)
topkeys = fdist.most_common(50)
# total number of samples
print(fdist.N())
print(topkeys)
fdist.tabulate()            # tabulate() prints its table itself and returns None
fdist.plot(cumulative=True)
# for pair in topkeys:
#     print(pair)

mycorpus = PlaintextCorpusReader('.', r'.*\.txt')
print(mycorpus.fileids())
print(mycorpus.words())
labweekstring = mycorpus.raw('labweek.txt')
# print(mycorpus)
# print(labweekstring)
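# Once mycorpus is built, the same FreqDist machinery applies to the local file; a brief
# sketch (assuming labweek.txt really is present in the working directory, as the raw()
# call above already expects):
labweekwords = [w.lower() for w in mycorpus.words('labweek.txt') if w.isalpha()]
labweekfdist = FreqDist(labweekwords)
print(labweekfdist.most_common(20))
labweekfdist.tabulate(20)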
from nltk import FreqDist
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# (inside a loop that collects every product name into item; indexid is built in the same loop)
item.append(x)

# compute word frequencies
fre = FreqDist(item)
print(fre.most_common(10))

# turn the top 10 into parallel lists of labels and counts
list_fre = list(fre.most_common(10))
list_item = []
list_count = []
for x in list_fre:
    list_item.append(x[0])
    list_count.append(x[1])

# frequency distribution table and plot
fre.tabulate(10)
fre.plot(10)

# pie chart
plt.pie(x=list_count, labels=list_item)
plt.show()

# bar chart
plt.bar(list_item, list_count)
plt.show()

# bar chart with Seaborn (newer Seaborn versions require keyword arguments)
sns.barplot(x=list_item, y=list_count)
plt.show()

# rebuild the DataFrame from the items and their transaction indices
data = pd.DataFrame({'item': item, 'transaction': indexid})
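# Instead of unpacking most_common() into two parallel lists, the top counts could also go
# straight into a pandas Series, which both matplotlib and seaborn accept; a sketch of that
# alternative, reusing fre from above:
top10 = pd.Series(dict(fre.most_common(10)))    # labels become the index, counts the values

top10.plot(kind='bar')                          # bar chart directly from the Series
plt.show()

sns.barplot(x=top10.index, y=top10.values)      # same data through seaborn
plt.show()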