Example #1
from nltk import FreqDist
from nltk.book import text1

word_len = [len(w) for w in text1]  # length of every token in text1 (Moby Dick)
print(word_len)





# Example	Description
# fdist = FreqDist(samples)	create a frequency distribution containing the given samples
# fdist[sample] += 1	increment the count for this sample
# fdist['monstrous']	count of the number of times a given sample occurred
# fdist.freq('monstrous')	frequency of a given sample
# fdist.N()	total number of samples
# fdist.most_common(n)	the n most common samples and their frequencies
# for sample in fdist:	iterate over the samples
# fdist.max()	sample with the greatest count
# fdist.tabulate()	tabulate the frequency distribution
# fdist.plot()	graphical plot of the frequency distribution
# fdist.plot(cumulative=True)	cumulative plot of the frequency distribution
# fdist1 |= fdist2	update fdist1 with counts from fdist2
# fdist1 < fdist2	test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)  # frequency distribution of word lengths
print(dict(fdlist))           # word length -> count
print(fdlist.most_common(3))  # the three most common word lengths
print(fdlist.max())           # the single most common word length
print(fdlist[2])              # how many two-letter words there are
fdlist.tabulate()             # tabulate() prints the table itself and returns None
fdlist.plot()
fdlist.plot(cumulative=True)
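
# The |= and < operations from the table above are not used in the snippet; a minimal
# sketch with two small hand-made distributions (the sample data is illustrative only):
from nltk import FreqDist

fdist_a = FreqDist('abracadabra')   # counts of each character
fdist_b = FreqDist('banana')
fdist_a |= fdist_b                  # update fdist_a with counts from fdist_b (keeps the larger count)
print(fdist_a['n'])                 # 2: 'n' never occurs in 'abracadabra', twice in 'banana'
print(FreqDist('bar') < fdist_a)    # True: every sample in 'bar' occurs less often than in fdist_a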
Example #2
    if args.stop_punctuation:
        stoplist += list(set(punctuation))  # ASCII punctuation characters from string.punctuation
        stoplist += ['\u201d', '\u201c', '\u2019', '\u2014']  # curly quotes, apostrophe, em dash
        stoplist.append('--')

    words = [word for word in word_tokenize(text) if word not in stoplist]
    if args.stem:
        st = LancasterStemmer()
        words = [st.stem(word) for word in words]

    freq_dist = FreqDist(words)
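    # orig_freq_dist is presumably the unfiltered FreqDist built earlier in the
    # script, before the stopword/punctuation filtering above.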

    print('Total words: ' + str(orig_freq_dist.N()))
    print('Total after filter: ' + str(freq_dist.N()))
    # B() gives the number of unique words (bins)
    print('Unique words: ' + str(freq_dist.B()))
    print('Unique words ratio: ' +
          str(float(freq_dist.B()) / float(freq_dist.N())))
    print('\n')

    if args.words:
        for word in args.words:
            print(word + ': ' + str(freq_dist[word]))
            print(word + ' freq: ' + str(freq_dist.freq(word)))
            print('\n')

    # Show the top args.num_words words
    print('Top ' + str(args.num_words) + ' words:')
    freq_dist.tabulate(args.num_words)
Example #3
# Words that occur more than 7 times and are longer than 7 characters
from nltk import FreqDist
from nltk.corpus import brown

fd = FreqDist(brown.words())
long_and_frequent = sorted(w for w in set(brown.words()) if len(w) > 7 and fd[w] > 7)


### SUMMARY
# Other functions and attributes of the frequency distribution (FreqDist) object
fd = FreqDist(brown.words())
fd['County']  # count of a specific word
fd.freq('County')  # frequency of a specific word
fd.N()  # total number of samples
fd.most_common(10)
for sample in fd:
    print(sample)
fd.max()
fd.tabulate()
fd.plot()
fd1 |= fd2  # update fd1 with counts from fd2
fd1 < fd2  # test if samples in fd1 occur less frequently than in fd2


### IMPORTING TEXT
# NLTK comes with a collection of texts to get started. To import a specific text:
from nltk.book import text1
from nltk.book import sent7

### USING CONDITIONALS
# select words based on their length
[w for w in sent7 if len(w) < 4]
# select words based on other properties, e.g. words starting with 't'
[w for w in sent7 if w.startswith('t')]  # w.startswith('t') is the same test as w[0] == 't'
Example #4
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import FreqDist

fdist = FreqDist(samples) # build a frequency distribution from the data given in samples
fdist[sample] += 1 # increment the count for this sample (replaces the old fdist.inc(sample))
fdist['データ'] # number of occurrences of the given sample
fdist.freq('データ') # frequency of the given sample
fdist.N() # total number of samples
fdist.keys() # the samples (use most_common() for frequency order in NLTK 3)
for sample in fdist: # iterate over the samples
    pass
fdist.max() # sample with the greatest count
fdist.tabulate() # display the frequency distribution as a table
fdist.plot() # plot the frequency distribution
fdist.plot(cumulative=True) # cumulative plot of the frequency distribution
fdist1 < fdist2 # test if samples in fdist1 occur less frequently than in fdist2


Example #5
File: dsc.py Project: dmml/NLTK
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

# lemmatize each token (remove_stopwords and text_title are defined earlier in dsc.py)
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)  # Frequency distribution of text
fdist_title.most_common(50)  # most common 50
fdist_title['science']  # return count of a given word
fdist_title.max()  # the most frequent word
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
fdist_title.tabulate(50)  # tabulate
unique_words = len(set(nostop_title))  # number of distinct words
print("The number of unique words in the dsc titles is: " + str(unique_words))
avg_words = fdist_title.N() / unique_words  # average number of occurrences per word
print("Each word in the dsc titles appears on average " + str(int(avg_words)) + " times")

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
bigrams_title.plot(50, cumulative=True)
trigrams_title.most_common(20)
Example #6
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg

# enumerate the first few tokens of Persuasion
for idx, word in enumerate(gutenberg.words('austen-persuasion.txt')[:5]):
    print(idx, word)  # 0 [; 1 Persuasion; 2 by; 3 Jane; 4 Austen
##################################################################
## Frequency distribution of word lengths
fdist = FreqDist(len(w) for w in gutenberg.words('austen-persuasion.txt'))
print(fdist)  # <FreqDist with 16 samples and 98171 outcomes>
print(fdist.items())  # dict_items([(1, 16274), (10, 1615), (2, 16165), (4, 15613), (6, 6538), (7, 5714), (3, 20013), (8, 3348), (13, 230), (9, 2887), (5, 8422), (11, 768), (12, 486), (14, 69), (15, 25), (16, 4)])
print(fdist.most_common(3))  # [(3, 20013), (1, 16274), (2, 16165)]
##################################################################
## Frequency distribution of letters
fdist = nltk.FreqDist(ch.lower() for ch in gutenberg.raw('austen-persuasion.txt') if ch.isalpha())  # no need to wrap the generator in [] to make a list
print(fdist.most_common(5))  # [('e', 46949), ('t', 32192), ('a', 29371), ('o', 27617), ('n', 26718)]
print([char for (char, count) in fdist.most_common()])  # the 26 letters sorted by frequency of use
##################################################################
## most_common(n): the top n samples sorted by frequency
fd = FreqDist(gutenberg.words('austen-persuasion.txt'))  # word frequencies over the whole text
print(fd.most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
fd.tabulate()  # print most_common() as a table
# or do the same with collections.Counter
from collections import Counter
print(Counter(fd).most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
# Jane Austen's novel Persuasion contains 98171 word tokens in total and 6141 unique words. The most common token is the comma, followed by the word "the".
# If you run these statistics on a large corpus, recording each word's count and frequency from high to low in a table, the relationship between word frequency and word rank becomes easy to see.

# In fact, Zipf showed that this relationship can be expressed mathematically: for any given word, f * r = k (a constant);
# f is the word's frequency, r is its rank (its position in the sorted list), and k is a constant.
# A more elaborate form of the formula is f * r = 1 / log(N), where N is the total number of words.
# For example, the fifth most frequent word should occur about twice as often as the tenth most frequent word. In the NLP literature this relationship is known as "Zipf's Law".

# Even though the mathematical relationship described by Zipf's law is not exact, it is still useful for characterizing how words are distributed in human language: low-rank words occur very often,
# slightly higher-rank words occur somewhat less often, and very high-rank words hardly occur at all. The corresponding log-log relationship in Figure 1 makes this scaling in our corpus easy to see.
##################################################################
## Plotting Zipf's law with NLTK
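# The heading above announces a plot but the original snippet ends here. The sketch below is
# one way to draw the rank/frequency curve on log-log axes; it assumes matplotlib is
# available, and the variable names (word_freqs, counts, ranks) are illustrative, not part
# of the original script.
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.corpus import gutenberg

word_freqs = FreqDist(gutenberg.words('austen-persuasion.txt'))  # same text as above
counts = sorted(word_freqs.values(), reverse=True)  # frequencies ordered by rank
ranks = range(1, len(counts) + 1)
plt.loglog(ranks, counts)        # Zipf's law predicts a roughly straight line
plt.xlabel('rank (r)')
plt.ylabel('frequency (f)')
plt.title("Zipf's law for austen-persuasion.txt")
plt.show()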
Example #7
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader

tweettknzr = TweetTokenizer()

file0 = nltk.corpus.gutenberg.fileids()[0]
emmatext = nltk.corpus.gutenberg.raw(file0)
emmatokens = nltk.word_tokenize(emmatext)
#emmatokens2 = tweettknzr.tokenize(emmatext)
emmawords = [w.lower() for w in emmatokens]
emmavocab = sorted(set(emmawords))
revisemmawords = [w for w in emmawords if w.isalpha()]
fdist = FreqDist(revisemmawords)
topkeys = fdist.most_common(50)
# total number of samples
print(fdist.N())
print(topkeys)
fdist.tabulate()  # tabulate() prints the table itself and returns None
fdist.plot(cumulative=True)
# for pair in topkeys:
#     print(pair)
mycorpus = PlaintextCorpusReader('.', r'.*\.txt')
print(mycorpus.fileids())
print(mycorpus.words())
labweekstring = mycorpus.raw('labweek.txt')
# print(mycorpus)
# print(labweekstring)
Example #8
        item.append(x)
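# This snippet assumes the usual imports earlier in the script (FreqDist from nltk,
# matplotlib.pyplot as plt, seaborn as sns, pandas as pd); `item` is filled by the
# loop above, and `indexid` is defined elsewhere in the original script.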

# Compute word frequencies
fre = FreqDist(item)
print(fre.most_common(10))

# Build a list of the top 10 (item, count) pairs
list_fre = list(fre.most_common(10))
list_item = []
list_count = []
for x in list_fre:
    list_item.append(x[0])
    list_count.append(x[1])

# Frequency distribution: table and plot
fre.tabulate(10)
fre.plot(10)

# Pie chart
plt.pie(x=list_count, labels=list_item)
plt.show()

# Bar chart
plt.bar(list_item, list_count)
plt.show()
# Bar chart with Seaborn
sns.barplot(x=list_item, y=list_count)
plt.show()

# Recombine the transaction index and items into a DataFrame
data = pd.DataFrame({'item': item, 'transaction': indexid})