def test_plot(self):
    """Plotting a nonexistent condition must not create that condition."""
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    try:
        # plot() may raise (e.g. no matplotlib backend); we only care
        # that the distribution is not mutated by the lookup.
        empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt — narrowed to Exception.
        pass
    self.assertEqual(empty.conditions(), [])
# %%
from nltk import ConditionalFreqDist
from nltk.corpus import inaugural
from nltk.probability import FreqDist

# FreqDist accepts any iterable directly — no list comprehension needed.
fd3 = FreqDist(inaugural.words())
print(fd3.freq('freedom'))

# Count the frequency of word lengths per address, restricted to the
# addresses between 1980 and 2010.  Fileids start with the year
# (e.g. '1981-Reagan.txt'), so lexicographic comparison is valid.
# The fileid filter is hoisted to the outer loop: it only references
# `fileid`, and this avoids reading the words of excluded addresses.
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    if '1980' < fileid < '2010'
    for w in inaugural.words(fileid)
)
print(cfd.items())
cfd.plot()
# %%
import nltk
from nltk import ConditionalFreqDist
from nltk.corpus import PlaintextCorpusReader

wnl = nltk.WordNetLemmatizer()

# NOTE(review): hard-coded Windows path — adjust for your machine.
corpus = PlaintextCorpusReader(
    'C:/Data/Candidate_tweets/Processing_tweets/By_week_tweets/Cleaned_by_week/',
    '.*',
)
corpus.fileids()[0:3]
# Was Python-2 `print len(...)` (a SyntaxError in Python 3) — fixed.
print(len(corpus.words()))

# Frequency of each candidate-related word per weekly file.
cfd = ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['obama', 'romney', 'opponent']
    if w == target
)
cfd.plot()

cfd = nltk.ConditionalFreqDist(
    (target, fileid)
    for fileid in corpus.fileids()
    for w in corpus.words(fileid)
    for target in ['democrat', 'republican', 'independent']
    if w == target
)
cfd.plot()
#!/usr/bin/python
# coding: utf-8
# 2013/03/20
# Quick reference for nltk.ConditionalFreqDist.
# NOTE(review): `pairs`, `sample`, `samples` and `conditions` are
# placeholders — they are not defined in this file.
from nltk import ConditionalFreqDist

cfdist = ConditionalFreqDist(pairs)  # build a freq dist per condition from (condition, event) pairs
cfdist.conditions()                  # alphabetically sorted list of conditions
cfdist['条件']                        # FreqDist for the given condition
cfdist['条件'][sample]               # count of `sample` under that condition
cfdist.tabulate()                    # was misspelled `tablate` — the real method is tabulate()
cfdist.tabulate(samples, conditions)
cfdist.plot()
cfdist.plot(samples, conditions)
cfdist1 < cfdist2                    # compare two conditional frequency distributions
def how_many_words(letter_list, obligatory_letter, min_length=3):
    """Find English words of increasing length buildable from the letters.

    NOTE(review): the `def` header was missing from the pasted snippet
    (the body began at `leng=min_length` with a bare `return`); the
    signature is inferred from the call below — confirm against the
    original source.
    """
    leng = min_length
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    full = []
    while leng != 0:
        # Every (leng-1)-combination of the optional letters plus the
        # obligatory letter gives a candidate multiset of size `leng`.
        temp_let = [list(i) + [obligatory_letter]
                    for i in itertools.combinations(letter_list, leng - 1)]
        temp_words = list(set(
            [w for w in ["".join(j) for i in temp_let
                         for j in itertools.permutations(i)]
             if w in english_vocab]))
        full = full + temp_words
        # Stop as soon as a length yields no valid words.
        leng = leng + 1 if len(temp_words) != 0 else 0
    return full


wrds = how_many_words(['e', 'g', 'i', 'v', 'v', 'o', 'n', 'l'], 'r', min_length=3)

names = nltk.corpus.names
names.fileids()
# Names that appear in both the male and the female lists.
[w for w in names.words('male.txt') if w in names.words('female.txt')]
cfd = nltk.ConditionalFreqDist([(fileid, name[-1])
                                for fileid in names.fileids()
                                for name in names.words(fileid)])
cfd.plot()

entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[100:110]:
    print(entry)
# Words with a three-phone pronunciation starting with P and ending with T.
for word, pron in entries:
    if len(pron) == 3:
        p1, p2, p3 = pron
        if p1 == 'P' and p3 == 'T':
            print(word, pron)

from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
import nltk                                  # was missing: `nltk.` is used below
from nltk import ConditionalFreqDist         # was missing: used below
from nltk.corpus import brown                # was missing: `brown` is used below
from nltk.corpus import toolbox
from nltk.corpus import udhr
##################################################################
## Simple ConditionalFreqDist application: text sentiment analysis
word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical words are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; how many distinct labels each word received
print(len(tmp_Con['也好'].keys()))  # 1; duplicate labels are collapsed (set-like)
# Words that received more than one distinct label.
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1'] — the labels are the conditions here (the original comment wrongly repeated the word list)
##################################################################
## Brown corpus: modal-verb usage grouped by genre
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))  # categories=genre must not be dropped here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen modals
print(cfd.tabulate(conditions=genres, samples=modals))
# Observe that the most frequent modal in the news genre is will, while
# the most frequent modal in the romance genre is could:
#                  can could  may might must will
#            news   93    86   66    38   50  389
#        religion   82    59   78    12   54   71
#         hobbies  268    58  131    22   83  264
# science_fiction   16    49    4    12    8   16
#         romance   74   193   11    51   45   43
## Simple ConditionalFreqDist application: text sentiment analysis
import nltk                                  # was missing: `nltk.` is used below
from nltk import ConditionalFreqDist         # was missing: used below
from nltk.corpus import brown                # was missing: `brown` is used below

word = ['实惠', '快', '也好', '快', '也好']
anls = ['1', '1', '1', '-1', '1']
tmp_Con = ConditionalFreqDist(zip(word, anls))
print(tmp_Con)  # <ConditionalFreqDist with 3 conditions>; identical words are merged
print(tmp_Con.tabulate())
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; how many distinct labels each word received
print(len(tmp_Con['也好'].keys()))  # 1; duplicate labels are collapsed (set-like)
# Words that received more than one distinct label.
print([condition for condition in tmp_Con.conditions()
       if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['1', '-1'] — the labels are the conditions here (the original comment wrongly repeated the word list)
##################################################################
## Brown corpus: modal-verb usage grouped by genre
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))  # categories=genre must not be dropped here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen modals
# The snippet was truncated at a dangling `print(` — completed to match
# the identical tutorial above.
print(cfd.tabulate(conditions=genres, samples=modals))
A = nltk.corpus.brown.categories()
print(A)
B = nltk.corpus.brown.words(categories='adventure')
print(B)

# Using the state_union corpus reader, access the State of the Union
# addresses and count 'men', 'women' and 'people' in each document.
# How does usage change over time?
from nltk.corpus import state_union
from nltk import ConditionalFreqDist

cfd = ConditionalFreqDist(
    (target, fileid[:4])              # condition = word, sample = year
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower() == target)           # BUG FIX: the original had no filter,
                                      # so every word was counted once per
                                      # target; only actual matches count.
cfd.plot()

# Conditional frequency distribution over the names corpus showing which
# initial letters are more common for male vs. female names.
from nltk.corpus import names

cfd1 = ConditionalFreqDist(
    (fileid, w.lower()[0])
    for fileid in names.fileids()
    for w in names.words(fileid))
cfd1.plot()

## Find all words that occur at least 3 times in the Brown corpus.
from nltk.corpus import brown
from nltk import FreqDist

words = brown.words()
fd = FreqDist(w.lower() for w in words)
# The original stopped here without answering the exercise:
at_least_three = [w for w in fd if fd[w] >= 3]
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

# Distribution of first letters per gender file (gender -> initial), and
# the reverse mapping (initial -> gender), both built from the same data.
g2n = CondFreqDist(
    (gender, name[0])
    for gender in names.fileids()
    for name in names.words(gender)
)
n2g = CondFreqDist(
    (name[0], gender)
    for gender in names.fileids()
    for name in names.words(gender)
)

g2n.plot()
n2g.plot()
def test_plot(self):
    """Plotting an unknown condition must not add it to the distribution."""
    cfd = ConditionalFreqDist()
    self.assertEqual(cfd.conditions(), [])
    # Requesting a condition that was never counted must be a no-op on
    # the data (nonexistent keys shouldn't be added).
    cfd.plot(conditions=["BUG"])
    self.assertEqual(cfd.conditions(), [])