def demo(scorer=None, compare_scorer=None): """Finds trigram collocations in the files of the WebText corpus.""" from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores if scorer is None: scorer = BigramAssocMeasures.likelihood_ratio if compare_scorer is None: compare_scorer = BigramAssocMeasures.raw_freq from nltk.corpus import stopwords, webtext ignored_words = stopwords.words('english') word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words for file in webtext.fileids(): words = [word.lower() for word in webtext.words(file)] cf = BigramCollocationFinder.from_words(words) cf.apply_freq_filter(3) cf.apply_word_filter(word_filter) print(file) print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)]) print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, spearman_correlation( ranks_from_scores(cf.score_ngrams(scorer)), ranks_from_scores(cf.score_ngrams(compare_scorer)))))
def demo(scorer=None, compare_scorer=None): """Finds bigram collocations in the files of the WebText corpus.""" from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores if scorer is None: scorer = BigramAssocMeasures.likelihood_ratio if compare_scorer is None: compare_scorer = BigramAssocMeasures.raw_freq from nltk.corpus import stopwords, webtext ignored_words = stopwords.words('english') word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words for file in webtext.fileids(): words = [word.lower() for word in webtext.words(file)] cf = BigramCollocationFinder.from_words(words) cf.apply_freq_filter(3) cf.apply_word_filter(word_filter) print(file) print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)]) print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, spearman_correlation( ranks_from_scores(cf.score_ngrams(scorer)), ranks_from_scores(cf.score_ngrams(compare_scorer)))))
def ch03_42_wordnet_semantic_index():
    import nltk
    from nltk.corpus import webtext
    from nltk.corpus import wordnet as wn

    postings = []
    docids = {}
    for (pos, fileid) in enumerate(webtext.fileids()):
        docids[pos] = fileid
        wpos = 0
        words = webtext.words(fileid)
        for word in words:
            try:
                postings.append((word.lower(), (pos, wpos)))
                # also index the word's first synset and its first hypernym
                # (Synset.offset() is a method in NLTK 3)
                offset = wn.synsets(word)[0].offset()
                postings.append((offset, (pos, wpos)))
                poffset = wn.synsets(word)[0].hypernyms()[0].offset()
                postings.append((poffset, (pos, wpos)))
            except IndexError:
                continue
            wpos = wpos + 1
    index = nltk.Index(postings)

    query = "canine"
    qpostings = []
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
    try:
        offset = wn.synsets(query)[0].offset()
        qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
    except IndexError:
        pass

    for (pos, wpos) in qpostings:
        left = webtext.words(docids[pos])[wpos - 4:wpos]
        right = webtext.words(docids[pos])[wpos:wpos + 4]
        print(left, right)
def process_webtext():
    print('webtext')
    from nltk.corpus import webtext

    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_webtext_tmp.txt'
    for f in webtext.fileids():
        sents = webtext.sents(f)
        for i in range(len(sents)):
            sent = sents[i]
            if word in sent:
                # appendToFile / sentToStr are helpers defined elsewhere in the project
                appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
    print(count)
import sys
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
    # store word lengths per corpus
    brown_word_lens = []
    web_word_lens = []
    inaugural_word_lens = []
    gutenberg_word_lens = []
    genesis_word_lens = []

    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            gutenberg_word_lens.append(len(word))
    for file in brown.fileids():
        for word in brown.words(file):
            brown_word_lens.append(len(word))
    for file in webtext.fileids():
        for word in webtext.words(file):
            web_word_lens.append(len(word))
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            inaugural_word_lens.append(len(word))
    for file in genesis.fileids():
        for word in genesis.words(file):
            genesis_word_lens.append(len(word))

    with open("wordlens.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(genesis_word_lens), len(inaugural_word_lens),
                           len(web_word_lens), len(brown_word_lens),
                           len(gutenberg_word_lens))):
            for corpus in [genesis_word_lens, inaugural_word_lens, web_word_lens,
                           brown_word_lens, gutenberg_word_lens]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(corpus[i]) + ",")
            f.write("\n")
def demo(scorer_bam=None, compare_scorer_bam=None, scorer_tam=None, compare_scorer_tam=None):
    if scorer_bam is None:
        scorer_bam = BigramAssocMeasures.likelihood_ratio
    if compare_scorer_bam is None:
        compare_scorer_bam = BigramAssocMeasures.raw_freq
    if scorer_tam is None:
        scorer_tam = TrigramAssocMeasures.likelihood_ratio
    if compare_scorer_tam is None:
        compare_scorer_tam = BigramAssocMeasures.raw_freq

    regex = '^[A-Za-z]+$'  # regular expression matching purely alphabetic English words
    str_regex = re.compile(regex)
    for file in webtext.fileids():  # process the corpus file by file
        words_list = []
        for word in webtext.words(file):
            if not str_regex.match(word):  # skip tokens that are not purely alphabetic
                continue
            words_list.append(word)
        # collect bigram collocations (window sizes 3, 4, 5)
        for window_size in range(3, 4):
            bcf = BigramCollocationFinder.from_words(words_list, window_size)
            bcf.apply_freq_filter(window_size)
            for item in bcf.nbest(scorer_bam, 1000):
                get_collocation(item)  # record the collocation (helper defined elsewhere)
        # collect trigram collocations
        for window_size in range(3, 4):
            tcf = TrigramCollocationFinder.from_words(words_list, window_size)
            tcf.apply_freq_filter(window_size)
            # tcf.apply_word_filter(word_filter)
            # corr = spearman_correlation(ranks_from_scores(tcf.score_ngrams(scorer)),
            #                             ranks_from_scores(tcf.score_ngrams(compare_scorer)))
            for item in tcf.nbest(scorer_tam, 1000):
                get_collocation(item)
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))
    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')
    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
import nltk
from nltk.corpus import gutenberg

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    # avg word & sentence length and the diversity of words
    print(int(num_chars / num_words), int(num_words / num_sents))

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences  # load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len]  # find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])

from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
from nltk.corpus import webtext
from nltk.corpus import nps_chat

# webtext overview
for idx in webtext.fileids():
    print('file={} , words={}'.format(idx, len(webtext.words(idx))))
import sys
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
    # store the relative frequency of the 100 most common English words per corpus file
    brown_common_freq = []
    web_common_freq = []
    inaugural_common_freq = []
    gutenberg_common_freq = []
    genesis_common_freq = []

    common = ["the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
              "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
              "this", "but", "his", "by", "from", "they", "we", "say", "her",
              "she", "or", "an", "will", "my", "one", "all", "would", "there",
              "their", "what", "so", "up", "out", "if", "about", "who", "get",
              "which", "go", "me", "when", "make", "can", "like", "time", "no",
              "just", "him", "know", "take", "people", "into", "year", "your",
              "good", "some", "could", "them", "see", "other", "than", "then",
              "now", "look", "only", "come", "its", "over", "think", "also",
              "back", "after", "use", "two", "how", "our", "work", "first",
              "well", "way", "even", "new", "want", "because", "any", "these",
              "give", "day", "most", "us"]
    common.sort()

    for file in gutenberg.fileids():
        total_words = len(gutenberg.words(file))
        total_common = 0
        for word in gutenberg.words(file):
            if word.lower() in common:
                total_common += 1
        gutenberg_common_freq.append(float(total_common) / total_words)
    for file in brown.fileids():
        total_words = len(brown.words(file))
        total_common = 0
        for word in brown.words(file):
            if word.lower() in common:
                total_common += 1
        brown_common_freq.append(float(total_common) / total_words)
    for file in webtext.fileids():
        total_words = len(webtext.words(file))
        total_common = 0
        for word in webtext.words(file):
            if word.lower() in common:
                total_common += 1
        web_common_freq.append(float(total_common) / total_words)
    for file in inaugural.fileids():
        total_words = len(inaugural.words(file))
        total_common = 0
        for word in inaugural.words(file):
            if word.lower() in common:
                total_common += 1
        inaugural_common_freq.append(float(total_common) / total_words)
    for file in genesis.fileids():
        total_words = len(genesis.words(file))
        total_common = 0
        for word in genesis.words(file):
            if word.lower() in common:
                total_common += 1
        genesis_common_freq.append(float(total_common) / total_words)

    with open("common-words.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(genesis_common_freq), len(inaugural_common_freq),
                           len(web_common_freq), len(brown_common_freq),
                           len(gutenberg_common_freq))):
            for corpus in [genesis_common_freq, inaugural_common_freq, web_common_freq,
                           brown_common_freq, gutenberg_common_freq]:
                if i >= len(corpus):
                    f.write(",")
                else:
                    f.write(str(round(corpus[i], 5)) + ",")
            f.write("\n")
# Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus reader
# nltk.corpus.webtext.words() to access some sample text in two different genres.
from nltk.corpus import brown
from nltk.corpus import webtext

brown_genres = brown.categories()
print(brown_genres)
print(brown.sents(categories=brown_genres[-1]))

webtext_genres = webtext.fileids()
print(webtext_genres)
print(webtext.words(webtext_genres[1]))
inaugural.words(fileids='1933-Roosevelt.txt')

# # WEBTEXT CORPUS

# In[5]:

from nltk.corpus import webtext

# In[6]:

webtext.fileids()

# In[7]:

webtext.words(fileids='pirates.txt')[:10]

# In[16]:

k = webtext.fileids()

# In[46]:
# Read a particular document sentence by sentence.
n = 5
sentence = nltk.corpus.gutenberg.sents('austen-emma.txt')
for i in range(n):
    print(sentence[i])
print("number of sentences = ", len(sentence))

import nltk
from nltk.corpus import webtext
nltk.download('punkt')
nltk.download('webtext')

# Look up the file IDs of the Webtext corpus.
textId = webtext.fileids()
print(textId)

text = """
Natural language processing (NLP) is a subfield of computer science, information
engineering, and artificial intelligence concerned with the interactions between
computers and human (natural) languages, in particular how to program computers
to process and analyze large amounts of natural language data. Challenges in
natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.
"""

sent_tok = nltk.sent_tokenize(text)  # document -> sentences; the text above splits into two sentences (on '.')
print(len(sent_tok))
sent_tok[1]     # the second sentence
len(sent_tok)   # number of sentences
import nltk
from nltk.corpus import gutenberg

fileids = gutenberg.fileids()
emma = gutenberg.words("austen-emma.txt")
emmaTxt = nltk.Text(gutenberg.words('austen-emma.txt'))
print(fileids)
print(emma, len(emma), len(set(emma)))
print(emmaTxt, len(emmaTxt), len(set(emmaTxt)))

from nltk.corpus import webtext
fileids = webtext.fileids()
print("webtext fileids", fileids)

from nltk.corpus import brown
fileids = brown.fileids()
print("brown fileids", fileids)

news_text = brown.words(categories="news")
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ["can", "could", "may", "might", "must", "will"]
for m in modals:
    print("modals," + m + ":", fdist[m])

# -----------------------
from nltk.corpus import words
words_only_digits = [w for w in chat if w.isdigit()]
words_lower_unique = len(set([w.lower() for w in chat if w.isalpha()]))  # true vocab
# HERE: TO SOLVE SOME CHAPTER 1 EXERCISES
# ---------------------------------

## CHAPTER 2: Accessing Text Corpora
import nltk
# print(nltk.corpus.gutenberg.fileids())  # prints filenames for nltk.gutenberg
emma = nltk.corpus.gutenberg.words('austen-emma.txt')  # select text
# print(len(emma))
emma = nltk.Text(emma)  # to use previous functions as with nltk.book txts
print(emma.concordance('surprise'))
print(' '.join(emma[20:50]))  # LIST to STRING - comes out as text

# examples of corpus available in nltk
from nltk.corpus import webtext    # less formal text
print(webtext.fileids())           # filenames
from nltk.corpus import nps_chat   # predators
print(nps_chat.fileids())
from nltk.corpus import brown      # brown uni various texts
print(brown.fileids())
from nltk.corpus import reuters
print(reuters.fileids())
from nltk.corpus import inaugural
print(inaugural.fileids())
# page 72 for a variety of corpus functionality commands
def get_webtext_raw():
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
def webtext():
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
import sys
import string
from nltk import FreqDist
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
    # store FreqDist's of letter counts per corpus
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    brown_letters = FreqDist()
    web_letters = FreqDist()
    inaugural_letters = FreqDist()
    gutenberg_letters = FreqDist()
    genesis_letters = FreqDist()

    for file in gutenberg.fileids():
        for word in gutenberg.words(file):
            for character in word:
                if character in string.ascii_letters:
                    gutenberg_letters[character.upper()] += 1
    for file in brown.fileids():
        for word in brown.words(file):
            for character in word:
                if character in string.ascii_letters:
                    brown_letters[character.upper()] += 1
    for file in webtext.fileids():
        for word in webtext.words(file):
            for character in word:
                if character in string.ascii_letters:
                    web_letters[character.upper()] += 1
    for file in inaugural.fileids():
        for word in inaugural.words(file):
            for character in word:
                if character in string.ascii_letters:
                    inaugural_letters[character.upper()] += 1
    for file in genesis.fileids():
        for word in genesis.words(file):
            for character in word:
                if character in string.ascii_letters:
                    genesis_letters[character.upper()] += 1

    with open("genesis-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GENESIS\n")
        for let in samples:
            print(str(genesis_letters[let]))
    with open("gutenberg-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("GUTENBERG\n")
        for let in samples:
            print(str(gutenberg_letters[let]))
    with open("webtext-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("WEBTEXT\n")
        for let in samples:
            print(str(web_letters[let]))
    with open("inaugural-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("INAUGURAL\n")
        for let in samples:
            print(str(inaugural_letters[let]))
    with open("brown-letter-freq.txt", 'w') as f:
        sys.stdout = f
        f.write("BROWN\n")
        for let in samples:
            print(str(brown_letters[let]))

    with open("letter-freq.txt", 'w') as f:
        corpora = [gutenberg_letters, web_letters, inaugural_letters,
                   brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for let in samples:
            for corpus in corpora:
                f.write(str(corpus[let]) + ",")
            f.write("\n")
import nltk
nltk.download()

'''
Tokenization

What: Separate text into units such as sentences or words
Why:  Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
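# The original snippet stops at the import above; a possible completion of that
# step (an assumption, not part of the original code): keep only tokens whose
# first character is an ASCII letter.
tokens = [t for t in tokens if re.match(r'^[A-Za-z]', t)]
tokens[:100]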
'''
College is so hectic, I'm tired
'''

# In[9]:

# importing library from nltk.corpus
from nltk.corpus import webtext

# In[10]:

webtext.fileids()

# In[11]:

webtext.words('pirates.txt')[:20]

# In[12]:

# printing the file ids
file_ids = webtext.fileids()
for file in file_ids:
    print(file)
from nltk.corpus import brown, webtext

# Brown corpus
print('Categories:', list(brown.categories()))
print('Brown sample text:\n\t', ' '.join(brown.words(categories='adventure')[:50]))

# Webtext corpus
print()
print('Categories:', webtext.fileids())
print('Webtext sample text:\n\t', ' '.join(webtext.words('firefox.txt')[:50]))
def __init__(self):
    self.number_id = 28
    self.source_id = "webtext"
    self.titles = [name for name in webtext.fileids()]
    self.data = [webtext.raw(name) for name in self.titles]
if __name__ == '__main__':
    inputs1 = [
        'Spread the peanut butter.',
        'spread the Peanut butter with the knife.',
        'spread the Peanut butter on the bread.',
        'get two Slices. of bread.',
        'get a knife.'
    ]
    inputs2 = [
        'spread the peanut butter',
        'spread the peanut butter',
        'get a knife.'
    ]

    # runtime tests
    ff = webtext.fileids()[0]
    # the sentences we want to sample from
    ffs = webtext.raw(ff)
    ffx = unicodedata.normalize('NFKD', ffs).encode('ascii', 'ignore').decode('ascii').split('.')
    ffs = []
    for entry in ffx:
        if len(entry) < 10:
            ffs.append(entry)
    ffs = [x for x in ffs if x != '']

    final = []
    for i in range(1):
        this_round = [[]]
        for size in [1000]:
            inputs = numpy.random.choice(ffs, size=size)
@author: Ritwik Gupta
"""
# 20/12/19
from nltk.corpus import brown
brown.categories()
print(brown.words(categories='hobbies')[0:5])

from nltk.corpus import inaugural
inaugural.fileids()
inaugural.words(fileids='1933-Roosevelt.txt')[0:10]

from nltk.corpus import webtext
d1 = {}
for i in webtext.fileids():
    d1[i] = webtext.words(fileids=i)[:20]

# Downloaded the MASC data
import nltk
with open('tweets1.txt', 'r') as f:
    text = f.read().strip()
text1 = text.split()
text2 = nltk.Text(text1)
text2.concordance("good", 1)

# Project Gutenberg
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf-8')
of external corpora.

NLTK corpus readers. The modules in this package provide functions that can be
used to read corpus files in a variety of formats. These functions can be used
both for the corpus files distributed with the NLTK corpus package and for
corpus files that are part of external corpora.

Common Structures for Text Corpora:
The simplest kind of corpus is a collection of isolated texts with no particular
organization; some corpora are structured into categories like genre (Brown
Corpus); some categorizations overlap, such as topic categories (Reuters Corpus);
other corpora represent language use over time (Inaugural Address Corpus).

1- Gutenberg Corpus
2- Web and Chat Text
3- Brown Corpus
4- Reuters Corpus
5- Inaugural Address Corpus
6- Annotated Text Corpora
"""

from nltk.corpus import gutenberg, webtext, brown, reuters, inaugural

print("Gutenberg FileIds :", gutenberg.fileids())
print("Webtext FileIds :", webtext.fileids())
print("Brown FileIds :", brown.fileids())
print("Brown Categories :", brown.categories())
print("Reuters FileIds :", reuters.fileids())
print("Reuters Categories :", reuters.categories())
print("Inaugural FileIds :", inaugural.fileids())
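The docstring above describes four ways corpora are organized; a minimal sketch of how each organization surfaces in the corpus reader API (the specific category and fileid values are illustrative choices, not taken from the original snippet):

from nltk.corpus import gutenberg, brown, reuters, inaugural

# Isolated texts: addressed only by file id
print(gutenberg.words('austen-emma.txt')[:10])

# Categorized by genre: words can be selected per category
print(brown.words(categories='news')[:10])

# Overlapping topic categories: one file may belong to several categories
print(reuters.categories(reuters.fileids()[0]))

# Organized over time: the year is encoded in each file id
print([fileid[:4] for fileid in inaugural.fileids()][:5])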
# (fragment: tail of a vector-similarity method from a word2vec class, followed by a driver script)
        word_sim = {}
        for i in range(self.unique_word):
            vet_w2 = self.w1[i]
            theta_sum = np.dot(vet_w1, vet_w2)
            theta_den = np.linalg.norm(vet_w1) + np.linalg.norm(vet_w2)
            theta = theta_sum / theta_den
            word = self.index_word[i]
            word_sim[word] = theta
        sort_word = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
        for word, sim in sort_word[:top_n]:
            print(word, sim)


fx = webtext.raw(webtext.fileids()[0])
corpus = fx[:1000]
print(corpus)

settings = {"train": {"window": 2, "epoch": 3000, "lr": 0.01}}
w2 = word2vec(settings)
pre_pr = w2.pre_process(corpus, ispara=False)
# print(corpus)
training_Data = w2.gen_training_data(pre_pr)
w2.train(training_Data)

t_word = "phoenix"
print(w2.word_vec(t_word))
w2.vec_sim(t_word, 5)
# print(training_Data.size * training_Data.itemsize)
# Importing modules with datasets within nltk.corpus
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters

# Printing the list of all dataset names in each module
print('Printing the file IDs for each module...\n')
print('gutenberg:\n', gutenberg.fileids())
print('webtext:\n', webtext.fileids())
print('nps_chat:\n', nps_chat.fileids())
print('brown:\n', brown.fileids())
print('reuters:\n', reuters.fileids())

# Printing the categories of each module
# NOTE: gutenberg, webtext and nps_chat do not have "categories"
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Accessing the corpora
# NOTE: TXT files can be accessed through "raw" to get the full files
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Accessing sentences of a sample file
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of sentences from a chat:\n', nps_chat.posts("10-19-20s_706posts.xml"))
def fun04():
    """fun04"""
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:50])
def webtext_example():
    # Get the raw text of each file in the webtext corpus
    for fileid in webtext.fileids():
        raw_text = webtext.raw(fileid)
        for i, line in enumerate(raw_text.split('\n')):
            print('[{}] {} : {}'.format(fileid, i, line))
import nltk, matplotlib
from nltk.corpus import webtext

print(webtext.fileids())

fileid = 'singles.txt'
wbt_words = webtext.words(fileid)
fdist = nltk.FreqDist(wbt_words)

print('Most frequent token "', fdist.max(), '" occurs ', fdist[fdist.max()], ' times')
print('Total number of tokens in the corpus: ', fdist.N())
print('The 10 most common words in the corpus are:')
print(fdist.most_common(10))
print('Frequency distribution of the personals ads')
print(fdist.tabulate())
fdist.plot(cumulative=True)
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk

'''
☼ Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus
reader nltk.corpus.webtext.words() to access some sample text in two different
genres.
'''

from nltk.corpus import brown, webtext

romance_text = brown.words(categories='romance')
print(brown.categories())
print(webtext.fileids())
print(webtext.words('firefox.txt'))
# Return the max len of sentences
longest_len = max([len(s) for s in macbeth_sentences])
# Save the longest sentences
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]

# ********************************************************************************
# Web and Chat Text
# ********************************************************************************
'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

# ********************************************************************************
# Brown Corpus
# ********************************************************************************
from nltk.corpus import brown
def fun3():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
def exercise_webtext():
    # Print the file names of the web text corpus
    for file_id in webtext.fileids():
        print(file_id)
#!/usr/bin/env python
# coding: utf-8

# In[6]:

from nltk.corpus import webtext

# In[7]:

webtext.fileids()

# In[8]:

print(webtext.words(fileids='pirates.txt'))

# In[9]:

for file in webtext.fileids():
    print(webtext.words(fileids=file)[:20])
def webtext_example():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
    round(word_count / vocab_count), fileid)

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
# Notes: Display the longest sentence from the Macbeth text and its length
print("\n112th Macbeth sentence: ", macbeth_sentences[111])
print("\nNumber of sentences in Macbeth: ", len(macbeth_sentences))
longest_length = max(len(s) for s in macbeth_sentences)
longest_sentence = [
    sentence for sentence in macbeth_sentences if len(sentence) == longest_length
]
print("\nLength of longest sentence in Macbeth: ", longest_length)
print("\nLongest sentence in Macbeth: ", longest_sentence)

from nltk.corpus import webtext
# Notes: Web and Chat Corpus. Gutenberg contains formal literature, so it is
# important to consider less formal language as well.
for fileids in webtext.fileids():
    # Notes: From the web and chat corpus print the fileids with the first 65 characters
    print(fileids, webtext.raw(fileids)[:65], "\n")

from nltk.corpus import nps_chat
# Notes: Instant messaging chat session corpus. Contains over 10,000 'posts'.
chatroom = nps_chat.posts('10-19-20s_706posts.xml')  # Notes: format = dd_mm-age_numberofposts.xml
print(chatroom[123])

# Notes: Brown corpus and its categories
# Notes: Reuters corpus
# Notes: Inaugural Address Corpus
# Notes: Annotated Text Corpora
# Notes: Corpora in other languages
# Notes: Loading your own corpus

print(
    "\n--- 2.2 Conditional Frequency Distributions ---\n"
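The "Loading your own corpus" note above has no accompanying code in the snippet; a minimal sketch using NLTK's PlaintextCorpusReader, where the directory path and file pattern are placeholder assumptions:

from nltk.corpus import PlaintextCorpusReader

corpus_root = '/path/to/my/texts'  # assumption: directory holding your own .txt files
my_corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')  # match every .txt file under the root
print(my_corpus.fileids())
print(my_corpus.words(my_corpus.fileids()[0])[:20])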