def get_training_test_sentences(self):
    brown_cats = ",".join(brown.categories())
    self.news_text = brown.words(categories=brown.categories())
    self.news_tagged_sentences = brown.tagged_sents(categories=brown.categories())
    # hold out the final 10% of tagged sentences for testing
    size = int(len(self.news_tagged_sentences) * .9)
    brown_train = self.news_tagged_sentences[:size]
    brown_test = self.news_tagged_sentences[size:]
    self.train_sents = brown_train
    self.test_sents = brown_test
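# Hypothetical follow-up (not part of the original class): once the 90/10 split
# above has been built, a tagger could be trained and scored on it, e.g.:
# tagger = nltk.UnigramTagger(self.train_sents)
# print(tagger.evaluate(self.test_sents))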
def print_brown():
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news', 'reviews'])
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print m + ':', fdist[m]
def build_all_brown(subset_size=None):
    documents = []
    categories = []
    all_categories = set()
    try:
        fileids = brown.fileids()
        for fileid in fileids:
            if subset_size:
                if len(all_categories) > subset_size:
                    break
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]
            documents.append(words)
            categories.append(category)
            all_categories.add(category)
        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]
        documents = [" ".join(d) for d in documents]
    except LookupError:
        """ we don't have the Brown corpus via nltk on this machine """
        try:
            with open("brown_docs_cats.pickle") as f:
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")
        # documents = [' '.join(d) for d in documents]
    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model
    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """
    return documents, categories
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
            else:
                norm = sent  # assumed fallback: use the tagged tokens unchanged
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), file=log)
def fun08():
    """fun08"""
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
def makeWordSet(args=None):
    '''Use the Brown corpus to see how many words used'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
def brown_diversity():
    """calculate and display the lexical diversity score (tokens / token types) for each Brown corpus category"""
    cfd = nltk.ConditionalFreqDist((category, word)
                                   for category in brown.categories()
                                   for word in brown.words(categories=category))
    print "{0:15s} {1:10s}".format("CATEGORY", "DIVERSITY")
    for category in cfd.conditions():
        print "{0:15s} {1:10f}".format(category, (cfd[category].N() * 1.0 / cfd[category].B()))
def ex11():
    from nltk.corpus import brown
    modals = set(["can", "could", "may", "might", "shall", "should",
                  "will", "would", "must", "ought"])
    cfd = nltk.ConditionalFreqDist(
        (genre, modal)
        for genre in brown.categories()
        for modal in [w.lower() for w in brown.words(categories=genre) if w.lower() in modals])
    cfd.plot()
def exercise_brown2():
    """Conditional frequency distribution function"""
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]
    modals = ["can", "could", "may", "might", "must", "will"]
    cfd.tabulate(conditions=genres, samples=modals)
def ex16():
    from nltk.corpus import brown
    lex_div = {}
    for category in brown.categories():
        words = brown.words(categories=category)
        ld = len(words) / len(set(words))
        print category, ld
        lex_div[category] = ld
    print sorted(lex_div.iteritems(), key=operator.itemgetter(1))
def print_modals():
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
def exercise_brown():
    # print the categories of the Brown corpus
    print brown.categories()
    # print the words in the 'news' category
    print brown.words(categories="news")
    # print the text of file 'cg22'
    print brown.words(fileids=["cg22"])
    # print sentences
    print brown.sents(categories=["news", "reviews"])
    """Compare the use of modal verbs across genres"""
    # get the text
    news_text = brown.words(categories="news")
    # word frequency distribution
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    # define the list of modal verbs
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print m + ":", fdist[m]
def training_sentences(use=1.0, categories=[]):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    if len(categories) == 0:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        max = int((1 - TEST_PROPORTION) * use * total) - 1
        # use the first n sentences for training
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:max]
    return sents
def init_corpus():
    print 'init corpus.. ',
    global categories, category_sentences
    categories = brown.categories()
    half_cat = int(len(categories) * 0.5)
    categories = categories[:half_cat]
    for category in categories:
        sents = brown.tagged_sents(categories=category)
        category_sentences[category] = sents
    print 'done'
def test_sentences(categories=[]):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    if len(categories) == 0:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)
        # use the last k sentences for test
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:-1]
    return sents
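# A minimal usage sketch (hypothetical, not part of the helpers above): build the
# splits for one category and score a unigram tagger trained on them. Assumes
# TEST_PROPORTION is defined; note that recent NLTK releases no longer accept
# simplify_tags, so the helpers may need tagset='universal' there instead.
train = training_sentences(use=0.5, categories=['news'])
test = test_sentences(categories=['news'])
tagger = nltk.UnigramTagger(train)
print(tagger.evaluate(test))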
def ch03_29_reading_difficulty():
    sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    from nltk.corpus import brown
    for category in brown.categories():
        raw = brown.raw(categories=category)
        words = len(brown.words(categories=category))
        sentences = len(sent_tokenizer.tokenize(raw))
        letters_per_word = (len(raw) - words) / words  # raw chars minus one space char per word
        words_per_sentence = words / sentences
        # Automated Readability Index: 4.71*letters_per_word + 0.5*words_per_sentence - 21.43
        reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
        print category, reading_level
def brown_demo():  # renamed: a function called brown() would shadow the brown corpus module it uses
    brown.categories()
    brown.words(categories='news')
    brown.words(fileids=['cg22'])
    brown.sents(categories=['news', 'editorial', 'reviews'])
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print m + ':', fdist[m],
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
def main():
    # Assumption: use the word "heart" instead of "fun"
    words = [u'money', u'duty', u'love', u'heart']
    categories = [u'science_fiction', u'romance', u'government', u'humor', u'religion']
    # Your code
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    cfd.tabulate(conditions=categories, samples=words)
    print_min_max_for_all(cfd, words, categories)
def ex10():
    from nltk.corpus import brown
    from nltk.corpus import stopwords
    stopwords = stopwords.words("english")
    for genre in brown.categories():
        print genre
        words = map(lambda x: x.lower(), brown.words(categories=genre))
        fd = nltk.FreqDist([w for w in words if w.isalpha() and not (w in stopwords)])
        vocab_size = len(set(words))
        sum = 0
        for word in fd.keys():
            freq = fd[word]
            print "... %s (%f)" % (word, (freq * 100 / vocab_size))
            sum = sum + freq
            if (sum > (vocab_size / 3)):
                break
def exercise11():
    print
    print "Exercise 11"
    cfd = nltk.ConditionalFreqDist((genre, word)
                                   for genre in brown.categories()
                                   for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modal_verb = ['shall', 'should', 'may', 'had', 'have']
    print "Tabulation data for closed classes of words in English"
    print "For modal verbs:"
    cfd.tabulate(conditions=genres, samples=modal_verb)
    print
    print "For prepositions:"
    prepositions = ['for', 'from', 'onto', 'to', 'with']
    cfd.tabulate(conditions=genres, samples=prepositions)
    print
    print "For pronouns:"
    pronoun = ['me', 'she', 'her', 'I', 'we']
    cfd.tabulate(conditions=genres, samples=pronoun)
    print
def tabulate(cfdist, words, categories):
    print '%-16s' % 'Category',
    for word in words:                             # column headings
        print '%6s' % word,
    print
    for category in categories:
        print '%-16s' % category,                  # row heading
        for word in words:                         # for each word
            print '%6d' % cfdist[category][word],  # print table cell
        print                                      # end the row

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
def get_brown_data(self):
    count = {}
    self.len_list = []
    for cats in brown.categories():
        for sent in brown.sents(categories=cats):
            l = len(sent)
            # if l < 3:
            #     continue
            self.len_list.append(l)
            if l in count:
                count[l] += 1
            else:
                count[l] = 1
    total = len(self.len_list)
    for i in range(100):
        if i in count.keys():
            self.probs.append(count[i] / (total + 0.0))
        else:
            self.probs.append(0)
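# Hypothetical usage of the distribution built above: self.probs[i] is the empirical
# probability that a Brown sentence has exactly i tokens (lengths >= 100 are ignored),
# so a length could be sampled from it, e.g.:
# import random
# sampled_length = random.choices(range(100), weights=self.probs)[0]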
def countWords():
    """counting words by genre"""
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
    genre_word = [(genre, word)
                  for genre in ['news', 'romance']
                  for word in brown.words(categories=genre)]
    print len(genre_word)
    print genre_word[:4]
    print genre_word[-4:]
    cfd = nltk.ConditionalFreqDist(genre_word)
    print cfd.conditions()
    cfd['news']
    cfd['romance']
    list(cfd['romance'])
    cfd['romance']['could']
'''
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cfd = nltk.ConditionalFreqDist(
    ('news', word)
    for word in brown.words(categories='news'))
cfd.tabulate(samples=days)
'''

# Checking basic FreqDist
fdist1 = FreqDist(text1)
vocabulary1 = fdist1.keys()
vocabulary1[:50]
fdist1['whale']

# Checking basic ConditionalFreqDist
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

# The Lookup Tagger
fd = nltk.FreqDist(brown.words(categories='news'))
# One word can have multiple tags. brown.tagged_words() returns an object of class ConcatenatedCorpusView.
# We are creating the baseline tagger by tagging each word from the brown news corpus
brown_sents = brown.sents(categories='news')
# brown_tagged_sents is of class ConcatenatedCorpusView
brown_tagged_sents = brown.tagged_sents(categories='news')
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# cfd['of'] is a FreqDist with 3 samples and 2849 outcomes. The entries are {('IN', 2716), ('IN-TL', 128), ('IN-HL', 5)}.
most_freq_words = fd.keys()[:100]
# word comes from most_freq_words, which comes from fd, which comes from the freq distn of brown.words in the news corpus.
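# Possible continuation of the lookup-tagger sketch above (an assumption, following
# the standard NLTK book recipe): map each of the 100 most frequent news words to its
# most likely tag and use that table as a lookup (unigram) tagger.
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)  # accuracy of the lookup baseline on the news sentences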
import nltk
from nltk.corpus import brown

whwords = ['what', 'which', 'why', 'when', 'where', 'who']
genre_news = brown.words(categories='news')
fdist = nltk.FreqDist(genre_news)
count = 0
for wh in whwords:
    count = count + fdist[wh]
print("\n", count)

# q2
categories = list(brown.categories())
modals = ['can', 'could', 'may', 'might', 'must', 'will', 'would']
for c in categories:
    print(c)
    words = brown.words(categories=c)
    fdist = nltk.FreqDist([w.lower() for w in words])
    for m in modals:
        print(m + ':', fdist[m])
    print()

# q3
import nltk
from nltk.corpus import inaugural

print(inaugural.fileids())
year = [fileid[:4] for fileid in inaugural.fileids()]
for category in categories:
    if 'stopwords' not in kwargs:
        tokens = [w for w in brown.words(categories=category)]
    else:
        if kwargs['stopwords'] == 'english':
            tokens = [w for w in brown.words(categories=category)
                      if w not in FileOps.get_stopwords('stopwords.txt')]
    token_count = len(tokens)
    type_count = len(set(tokens))
    diversity = "%.3f" % (type_count / token_count)
    tmp = category.split('_')
    category = ' '.join(tmp)
    category_info = (category, token_count, type_count, diversity)
    print(category_info)
    corpus_info.append(category_info)
return corpus_info


if __name__ == "__main__":
    plurals_info = get_plurals_info(brown.categories())
    FileOps.write_plurals_table(plurals_info)
    corpus_info = get_corpus_info(brown.categories())
    stopwords_info = get_corpus_info(brown.categories(), stopwords='english')
    FileOps.write_table(corpus_info, stopwords_info)
    print_corpus_info(brown.categories(), FileOps.get_stopwords('stopwords.txt'))
print(tmp_Con.conditions())         # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())         # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))    # 2; shows how many tags each word has...
print(len(tmp_Con['也好'].keys()))  # 1; duplicates have already been deduplicated via set()
print([condition for condition in tmp_Con.conditions() if len(tmp_Con[condition].keys()) > 1])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())       # ['实惠', '快', '也好']

##################################################################
## Brown corpus: grouping words by category
print(brown.categories())
# ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned',
#  'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))  # categories=genre must not be omitted here
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']  # picked from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen words
print(cfd.tabulate(conditions=genres, samples=modals))
# Observe that the most frequent modal in the news genre is will, while the most frequent modal in the romance genre is could
#           can  could  may  might  must  will   # count of each word per category
# news       93     86   66     38    50   389
# religion   82     59   78     12    54    71
# hobbies   268     58  131     22    83   264
# -*- coding: utf-8 -*-
"""
"""
# BROWN CORPUS DEMO
from nltk.corpus import brown
import nltk

print 'Total Categories:', len(brown.categories())
print brown.categories()

# tokenized sentences
brown.sents(categories='mystery')

# POS tagged sentences
brown.tagged_sents(categories='mystery')

# get sentences in natural form
sentences = brown.sents(categories='mystery')

# get tagged words
tagged_words = brown.tagged_words(categories='mystery')

# get nouns from tagged words
nouns = [(word, tag) for word, tag in tagged_words
         if any(noun_tag in tag for noun_tag in ['NP', 'NN'])]

print nouns[0:10]  # prints the first 10 nouns
    soundex = fc + soundex[1:]
    soundex = soundex.replace('0', '')
    return (soundex + size * '0')[:size]

print("1")
print(SOUNDEX('abcdefghijklmnopqrstuvwxyz'))
print(SOUNDEX('Tim'))
print(SOUNDEX('Trump'))
print(SOUNDEX('Einstein'))

# 2.1
# nltk.download('brown')
# use the following categories
id = 3018216005
categories = brown.categories()[id % len(brown.categories())]
brown_tagged_sents = brown.tagged_sents(categories=categories)
brown_sents = brown.sents(categories=categories)
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print("2.1")
print('categories:', categories)
print('size:', size)
print('train size:', len(train_sents))
print('test size:', len(test_sents))
print(unigram_tagger.evaluate(test_sents))
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk
'''
☼ Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus reader
nltk.corpus.webtext.words() to access some sample text in two different genres.
'''
from nltk.corpus import brown, webtext

romance_text = brown.words(categories='romance')
print brown.categories()
print webtext.fileids()
print webtext.words('firefox.txt')
import nltk
from nltk.corpus import brown

for category in brown.categories():
    words = brown.words(categories=category)
    fdist = nltk.FreqDist([w.lower() for w in words])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    print category + ' '
    for m in modals:
        print m + ":", fdist[m],
    print "\n"
def word_category(self, word):
    # print "Finding category"
    if (word.name in stopwords.words('english')):
        return 'stopwords'
    categories = brown.categories()
    return max(categories, key=lambda cat: self.cond_freq[cat][word.name])
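# Hypothetical setup for the method above: cond_freq is assumed to be a
# ConditionalFreqDist of (category, word) pairs over the Brown corpus, as in the
# other snippets in this section. Given such a distribution, the same lookup works
# standalone:
import nltk
from nltk.corpus import brown

cond_freq = nltk.ConditionalFreqDist(
    (category, w)
    for category in brown.categories()
    for w in brown.words(categories=category))
print(max(brown.categories(), key=lambda cat: cond_freq[cat]['love']))  # category where 'love' is most frequent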
# Us: average number of words per sentence
from nltk.corpus import brown as brown

dict = {}
# -----------------------------
for cat in brown.categories():
    Uw = len("".join(list(brown.words(categories=cat)))) / len(brown.words(categories=cat))
    Us = len(brown.words(categories=cat)) / len(brown.sents(categories=cat))
    dict[cat] = 4.71 * Uw + 0.5 * Us - 21.43
# -----------------------------
dict = {cat: (4.71 * (len("".join(list(brown.words(categories=cat)))) / len(brown.words(categories=cat)))
              + 0.5 * (len(brown.words(categories=cat)) / len(brown.sents(categories=cat)))
              - 21.43)
        for cat in brown.categories()}

Uw = lambda brown, cat: len("".join(list(brown.words(categories=cat)))) / len(brown.words(categories=cat))
Us = lambda brown, cat: len(brown.words(categories=cat)) / len(brown.sents(categories=cat))
d = {cat: (4.71 * Uw(brown, cat) + 0.5 * Us(brown, cat) - 21.43) for cat in brown.categories()}

# 4 -----------------------------------------------------
["".join([char for char in word if char in "aeiou"]) for word in words]
from nltk.corpus import brown, movie_reviews, reuters

print(brown.categories())  # doctest: +NORMALIZE_WHITESPACE
print(movie_reviews.categories())
print(reuters.categories())  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
print(brown.categories('ca01'))
print(brown.categories(['ca01', 'cb01']))
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))  # doctest: +ELLIPSIS
print(brown.tagged_words(categories='news'))
print(brown.sents(categories=['editorial', 'reviews']))  # doctest: +NORMALIZE_WHITESPACE
def brown_counts():  # renamed: a function called brown() would shadow the brown corpus module it uses
    print brown.categories()
    print len(brown.sents())
    print len(brown.words())
assert i not in guten_dev
assert i not in guten_test
assert j not in guten_train
assert j not in guten_test
assert k not in guten_dev
assert k not in guten_train

#############################
#Generating for Brown Corpus#
#############################
random.seed(22)
brown_train = []
brown_dev = []
brown_test = []
for cat in brown.categories():
    brown_train.append(
        random.sample(list(brown.fileids(categories=cat)),
                      int((0.6) * len(brown.fileids(categories=cat)))))
brown_train = [x for y in brown_train for x in y]

# Generating Dev set for Brown Corpus
for cat in brown.categories():
    brown_dev_1 = [
        i for i in brown.fileids(categories=cat) if i not in brown_train
    ]
    brown_dev.append(random.sample(brown_dev_1, int(0.5 * len(brown_dev_1))))
brown_dev = [x for y in brown_dev for x in y]

# Generating Test set for Brown Corpus
for cat in brown.categories():
def nltk(): """ nltk,文本处理简介 :return: """ # 1.语料库 # 查看语料库包含的类别 print(brown.categories()) # 查看brown语料库 print('共有{}个句子'.format(len(brown.sents()))) print('共有{}个单词'.format(len(brown.words()))) # 2.分词 sentence = "Python is a widely used high-level programming language for general-purpose programming." tokens = nltk.word_tokenize(sentence) # 需要下载punkt分词模型 print(tokens) # 3.结巴分词 seg_list = jieba.cut("欢迎来到小象学院", cut_all=True) print("全模式: " + "/ ".join(seg_list)) # 全模式 seg_list = jieba.cut("欢迎来到小象学院", cut_all=False) print("精确模式: " + "/ ".join(seg_list)) # 精确模式 # 4.词形处理 # 词干提取(stemming) # PorterStemmer from nltk.stem.porter import PorterStemmer porter_stemmer = PorterStemmer() print(porter_stemmer.stem('looked')) print(porter_stemmer.stem('looking')) # SnowballStemmer from nltk.stem import SnowballStemmer snowball_stemmer = SnowballStemmer('english') print(snowball_stemmer.stem('looked')) print(snowball_stemmer.stem('looking')) # LancasterStemmer from nltk.stem.lancaster import LancasterStemmer lancaster_stemmer = LancasterStemmer() print(lancaster_stemmer.stem('looked')) print(lancaster_stemmer.stem('looking')) # 词形归并(lemmatization) from nltk.stem import WordNetLemmatizer # 需要下载wordnet语料库 wordnet_lematizer = WordNetLemmatizer() print(wordnet_lematizer.lemmatize('cats')) print(wordnet_lematizer.lemmatize('boxes')) print(wordnet_lematizer.lemmatize('are')) print(wordnet_lematizer.lemmatize('went')) # 指明词性可以更准确地进行lemma # lemmatize 默认为名词 print(wordnet_lematizer.lemmatize('are', pos='v')) print(wordnet_lematizer.lemmatize('went', pos='v')) #词性标注 (Part-Of-Speech) words = nltk.word_tokenize('Python is a widely used programming language.') print(nltk.pos_tag(words)) # 需要下载 averaged_perceptron_tagger # 去除停用词 from nltk.corpus import stopwords # 需要下载stopwords filtered_words = [ word for word in words if word not in stopwords.words('english') ] print('原始词:', words) print('去除停用词后:', filtered_words) # 5.典型的文本预处理流程 # 原始文本 raw_text = 'Life is like a box of chocolates. You never know what you\'re gonna get.' # 分词 raw_words = nltk.word_tokenize(raw_text) # 词形归一化 wordnet_lematizer = WordNetLemmatizer() words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words] # 去除停用词 filtered_words = [ word for word in words if word not in stopwords.words('english') ] print('原始文本:', raw_text) print('预处理结果:', filtered_words)
>>> from nltk.corpus import brown
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> cfd = nltk.ConditionalFreqDist()
import nltk
nltk.download('brown')
from nltk.corpus import brown
from nltk.probability import FreqDist

res = []
for i in ["what", "when", "where", "who", "why"]:
    aux = []
    for j in brown.categories():
        aux.append(j)
        aux.append(FreqDist(brown.words(categories=j)).get(i, 0))  # count of the wh-word i in category j
    res.append(i)
    res.append(aux)
print(res)
def conditional_frequencies(self):
    # slow... maybe use a smaller subset?
    # print "Making conditional frequencies..."
    cfd = ConditionalFreqDist((genre, word)
                              for genre in brown.categories()
                              for word in brown.words(categories=genre))
    return cfd
    print(fileid, webtext.raw(fileid)[:100], '...')

#%%
from nltk.corpus import brown

news_text = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    # print(m + ':', fdist[m], )
    print(m, ':', fdist[m], ' ', end='')

#%%
# conditional frequencies
cdf = ConditionalFreqDist((genre, word)
                          for genre in brown.categories()
                          for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cdf.tabulate(conditions=genres, samples=modals)

#%%
from nltk.corpus import inaugural

cdf = ConditionalFreqDist((target, fileid[:4])
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          for target in ['america', 'citizen']
                          if w.lower().startswith(target))
cdf.plot()

#%%
sentences[1037]
# the longest sentence
long = max([len(s) for s in sentences])
[s for s in sentences if len(s) == long]

#%%
# web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:60])

#%%
# Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')
news_words = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_words])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, fdist[m])

#%%
# conditional frequency distribution
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
def main(): # levenshtein distance for lists of tokens #----------------------------------------- #PCL Solutions #------------- print "~~~~~~~~~Beispiel)~~~~~~~~~~~~~~" l1 = "Vladimir Levenshtein uebernahm dies im Jahre 1960 .".split() l2 = "Vladimir Iosifovich Levenshtein entwickelte dies im Jahre 1965 .".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "ld=3+1.3+4=8.3, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" print "~~~~~~~~~2a und 2b)~~~~~~~~~~~~~" l1 = "Computerlinguistik 2 ist spannend .".split() l2 = "Computerlinguistik macht Spass und ist spannend !".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "ld= nicht 22.1, ->%s" % (ld) print "\nWeshalb ist 22.1 nicht die Optimale Lösung?\n" \ "Unsere Version geht wie folgt vor:\n" \ "delete: 2 (Kosten 1 * 3)\n" \ "insert: macht Spass und (Kosten 3 * 3.0)\n" \ "substitute: . und ! (Kosten 1 * 0.1)\n" \ "Totale Kosten von 12.1 nicht 22.1\n" \ "Die substitution (go diagonal) mit Kosten 16\n" \ "wird immer umgangen\n" \ "und durch ein delete (go down) und \n" \ "ein insert (go right) ersetzt." print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" print "~~~~~~~~~2c)~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" l1 = "spricht Yoda in immer Raetseln .".split() l2 = "Yoda spricht immer in Raetseln !".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "ld=0.9, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "das ist !".split() l2 = "ist das !".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "ld=0.4, ->%s" % (ld) print "~~~~~~~~~2d)~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" #load brown corpus sents print "Brown categories:\n %s\n" %brown.categories() brown_sents=brown.sents(categories = "romance") #find minimal distance between l1 and l2 l1 = "I wish you loved me .".split() ld_min=1000 #Startwert #iterate through sent in brown corpus print"\nUm die beste Lösung zu finden:\n" \ "Setze den print_matrix flag auf 0 " \ "und iteriere über die gesamte Kategorie " \ "romance (entferne auf nachfolgender Zeile [0:2])\n" for sent in brown_sents[0:2]: #calculate distance l2 = sent ld = levenshtein_on_tokens(l1, l2) #print out best match so far print l1, "\n", l2 print "ld= , ->%s" % (ld) if ld<ld_min: ld_min=ld l2_min=l2 print "ld_min= ", ld_min print "l2_min= ", l2_min print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" # Additional: # Test cases print"\nZusätzliche Testfälle:\n" print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" # ---------------------------------------- l1 = "a b c d . a a".split() l2 = "a b c d . a c".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "1) ld= 1.3, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b c d . a 2".split() l2 = "a b c d . 
a 4".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "2) ld= 4.0, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b .".split() l2 = "a b !".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "3) ld= 0.1, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b c 4".split() l2 = "a b c .".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "4) ld= nicht 16.0, 3.1, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b c d".split() l2 = "a e".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "5) ld= 7.3, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b c d .".split() l2 = "a e i o".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "6) ld= 4.0, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "a b c d !".split() l2 = "a b c d .".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "7) ld= 0.1, ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "!".split() l2 = "b".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "8) ld= nicht 16, 3.1 , ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "! 3 4".split() l2 = "b ? a".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "9) ld=12.1 , ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" l1 = "l e v e n s h t e i n".split() l2 = "m e i l e n s t e i n".split() ld = levenshtein_on_tokens(l1, l2) print l1, "\n", l2 print "10) ld=7.8 , ->%s" % (ld) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" # levenshtein distance for strings of characters # ----------------------------------------------- print"\nZusätzlich:" print "levenshtein distance for strings of chars\n" \ "Dieser Algorithmus war das Grundgerüst der\n" \ "gesamten Aufgabe" print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" s1 = "kitten" s2 = "sitting" ld = levenshtein_on_characters(s1, s2) print "\nld= ", ld print s1, "\n", s2
Sl. No.- 14542
@@@-----------S1: Train: D1-Train, Test: D1-Test----------------@@@
"""

"""
###### IMPORT BROWN CORPUS #######
"""
import nltk
import re
nltk.download('brown')
from nltk.corpus import brown

categories = brown.categories()
brown_corpus_sents = {}
for cat in categories:
    a = []
    for sents in brown.sents(categories=cat):
        sents.insert(0, '<s>')
        sents.append('<e>')
        a.append(sents)
    brown_corpus_sents[cat] = a

BROWN = {}
for cat in categories:
weight1 = np.random.uniform(low=0.0, high=0.0, size=(34, 17)) weight2 = np.random.uniform(low=0.0, high=0.0, size=(17, 17)) weight3 = np.random.uniform(low=0.0, high=0.0, size=(17, 15)) # print np.matrix(weight1) for i in range(34): for j in range(17): weight1[i, j] = float(1 / (1.0 * 34)) for i in range(17): for j in range(17): weight2[i, j] = float(1 / (1.0 * 17)) for i in range(17): for j in range(15): weight3[i, j] = float(1 / (1.0 * 17)) classes = ["" for x in range(len(brown.categories()))] count = 0 for i in brown.categories(): classes[count] = i count += 1 print classes output1 = [0.0] * 17 output2 = [0.0] * 17 output3 = [0.0] * 15 error1 = [0.0] * 17 error2 = [0.0] * 17 error3 = [0.0] * 15 index = -1 for f in brown.fileids(): print "Fileids ", f
from nltk.corpus import brown


def average_num_words_per_sentence(text, category):
    """returns the average number of words per sentence in a text by dividing the
    number of words in a corpus by the number of sentences in the corpus."""
    sent_num = len(text.sents(categories=category))
    word_num = len(text.words(categories=category))
    return word_num / sent_num


def average_num_letters(text, category):
    """finds the average number of letters per word in a corpus."""
    word_num = len(text.words(categories=category))
    smash_text = ''.join(text.words(categories=category))
    letters_len = len(smash_text)
    return letters_len / word_num


def ari(text, category):
    """Calculates the automated readability of a text."""
    uw = average_num_letters(text, category)
    us = average_num_words_per_sentence(text, category)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari


for category in brown.categories():
    print(category + ': ' + str(ari(brown, category)))
from nltk.corpus import webtext  # more casual language sets
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat  # instant messaging ... bit awk
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
print(brown.categories)

from nltk.corpus import brown
print(brown.categories())  # displays categories
print(brown.words(categories='news'))
print(brown.words(fileids=['cg23']))  # accessing specific documents
print(brown.sents(categories=['news', 'editorial', 'reviews']))  # different categories

import nltk
from nltk.corpus import brown
fiction_text = brown.words(categories='fiction')  # to easily access fiction
fdist = nltk.FreqDist(w.lower() for w in fiction_text)
qwords = ['what', 'when', 'where', 'who', 'why']
for i in qwords:
    print(i + ':', fdist[i], end=' ')  # counts how many occur in this category; end=' ' prints all on one line
for fileid in gutenberg.fileids(): num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set(w.lower() for w in gutenberg.words(fileid))) avg_word_len = round(num_chars / num_words) avg_sent_len = round(num_words / num_sents) lexical_diversity = round(num_words / num_vocab) print(fileid, " | ", num_chars, " | ", num_words, " | ", num_sents, " | ", num_vocab, " | ", avg_word_len, " | ", avg_sent_len, " | ", lexical_diversity) for fileid in webtext.fileids(): print(fileid) brown.categories() brown.raw("cr09") #stylistics - systematic differences between genres # by use of modal verbs - [can could may might must will] news_text = brown.words(categories='news') hobbies_text = brown.words(categories='hobbies') news_text_fdist = nltk.FreqDist(w.lower() for w in news_text) hobbies_text_fdist = nltk.FreqDist(w.lower() for w in hobbies_text) modals = ['can', 'could', 'may', 'might', 'must', 'will'] for m in modals: print(m, ":", news_text_fdist[m], " | ", hobbies_text_fdist[m]) event_words = ["who", "what", "when", "where", "why"] for m in event_words: print(m, ":", news_text_fdist[m], " | ", hobbies_text_fdist[m])
def fun06():
    """fun06"""
    print brown.categories()
    print brown.words(categories='news')[:60]
    print brown.words(fileids=['cg22'])[:60]
    print brown.sents(categories=['news', 'editorial', 'reviews'])
    print(field, webtext.raw(field)[:65], '...')

# chat text
from nltk.corpus import nps_chat
for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: for studying systematic differences between genres (stylistics)
from nltk.corpus import brown
show_subtitle("distinguishing texts by categories")
print("brown.categories() =", brown.categories())
print("brown.words(categories='news')= ", brown.words(categories='news'))
print("brown.words(categories=['news', 'editorial', 'reviews'])= ",
      brown.words(categories=['news', 'editorial', 'reviews']))
print("brown.sents(categories=['news', 'editorial', 'reviews'])= ",
      brown.sents(categories=['news', 'editorial', 'reviews']))
show_subtitle("distinguishing texts by fileids")
print("brown.words(fileids='cg22')= ", brown.words(fileids='cg22'))

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=', ')
def exercise29():
    # Readability measures are used to score the reading difficulty of a text, for the purposes of
    # selecting texts of appropriate difficulty for language learners. Let us define μw to be the
    # average number of letters per word, and μs to be the average number of words per sentence, in
    # a given text. The Automated Readability Index (ARI) of the text is defined to be:
    # 4.71 μw + 0.5 μs - 21.43. Compute the ARI score for various sections of the Brown Corpus,
    # including section f (lore) and j (learned). Make use of the fact that nltk.corpus.brown.words()
    # produces a sequence of words, while nltk.corpus.brown.sents() produces a sequence of sentences.
    for category in brown.categories():
        print(category + ':' + str(ari(brown, category)))
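# A minimal sketch of the ari() helper the loop above relies on (an assumption: it is
# not defined in this snippet). It follows the formula quoted in the exercise text:
# ARI = 4.71 * letters_per_word + 0.5 * words_per_sentence - 21.43.
def ari(corpus, category):
    words = corpus.words(categories=category)
    sents = corpus.sents(categories=category)
    letters_per_word = sum(len(w) for w in words) / len(words)
    words_per_sentence = len(words) / len(sents)
    return 4.71 * letters_per_word + 0.5 * words_per_sentence - 21.43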
import nltk nltk.download() # 1. brown corpus # 2. Inaugural speech # 3. book corpus - frequency distribution can be done(most common words in textbook) # In[2]: from nltk.corpus import brown # In[4]: brown.categories() # In[5]: print(type(brown)) # In[10]: brown.words(categories="adventure")[:100] # In[11]: len(brown.words(categories="adventure")) # In[13]:
#!/usr/bin/python3 # coding: utf-8 # Brown Corpus (布朗语料库): Brown Corpus of Standard American English 被认为是第一个可以在计算语言学处理中使用的通用英语语料库 # 它包含了一百万字 1961 年出版的美语文本; 它代表了通用英语的样本, 采样自小说, 新闻和宗教文本; 随后, 在大量的人工标注后, 诞生了词性标注过的版本 from nltk.corpus import brown print(len(brown.fileids())) # 500; 个 文档 print(brown.fileids()[:5]) # ['ca01', 'ca02', 'ca03', 'ca04', 'ca05'] print(len(brown.words())) # 1161192; 总共 1161192 个单词 print(brown.words()[:5]) # ['The', 'Fulton', 'County', 'Grand', 'Jury']; 打印前 5 个单词 print(len(brown.words('ca01'))) # 2242; 一片文档还是比较少的 ################################################################## ## 标记数据 print(brown.tagged_words()[:3]) # [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]; 打印前 3 个单词的标注 ################################################################## ## categories print(len(brown.categories())) # 15; 个分类 print(brown.categories()) # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] print(len(brown.words(categories='news'))) # 100554; 统一类数据的单词 print(len(brown.sents(categories=['news', 'editorial', 'reviews']))) # 9371 # brown 包括标记数据 和 非标记数据 print(len(brown.words())) # 1161192 print(len(brown.words(categories=brown.categories()))) # 1161192; 所有数据都在 categories 里面 ################################################################## ## 路径 print(brown.abspath('ca01')) # /home/coder352/nltk_data/corpora/brown/ca01 print(brown.abspaths()) # 所有文档路径 ################################################################## ## 类型 print(type(brown)) # <class 'nltk.corpus.reader.tagged.CategorizedTaggedCorpusReader'> print(type(brown.words())) # <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'> print(type(brown.words('ca01'))) # <class 'nltk.corpus.reader.tagged.TaggedCorpusView'>
from nltk.corpus import brown

# Introduction to the Brown Corpus
print(brown.categories())
# Accessing words of the Brown Corpus
print(brown.words(categories='lore'))

# Introduction to Conditional Frequency Distribution
from nltk import ConditionalFreqDist  # import statement

# pair_list [ (condition, word) ]
pair_list = [(category, word)
             for category in brown.categories()
             for word in brown.words(categories=category)]
print(pair_list[:10])
freqdist = ConditionalFreqDist(pair_list)
print(freqdist['lore']['the'])  # conditional lookup

# tabulate function
category = ['adventure', 'lore', 'news']
samples = ['the', 'and', 'man']
freqdist.tabulate(conditions=category, samples=samples)
num_chars = len(gutenberg.raw(fileid)) num_words = len(gutenberg.words(fileid)) num_sents = len(gutenberg.sents(fileid)) num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)])) print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid #句子划分 macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') longest_len = max([len(s) for s in macbeth_sentences]) #网络聊天语料库 from nltk.corpus import webtext from nltk.corpus import nps_chat chatroom = nps_chat.posts('10-19-20s_706posts.xml') chatroom[123] from nltk.corpus import brown brown.categories() brown.sents(categories=['news', 'editorial', 'reviews']) news_text = brown.words(categories='news') fdist = nltk.FreqDist([w.lower() for w in news_text]) modals = ['can', 'could', 'may', 'might', 'must', 'will'] for m in modals: print m + ':', fdist[m] cfd = nltk.ConditionalFreqDist( (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)) genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor'] modals = ['can', 'could', 'may', 'might', 'must', 'will'] cfd.tabulate(conditions=genres, samples=modals)
def main(): tfidf = None word2vec = None similarityMatrix = None browndict = {} corporadict = None word2vec = None choice = "" while choice != "exit": choice = "" while choice not in ["tfidf", "word2vec", "exit"]: choice = input( "TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower() if choice == "exit": break catType = "" while catType not in ["within", "between", "return"]: catType = input( "Within or between clusters? [Within, Between, Return]\n>" ).lower() if catType == "return": break # get all of the words for each document per category texts = [] if catType == "within": for c in brown.categories(): words = NormalizeWords(brown.words(categories=c)) texts.append(words) # build a dictionary for me to use later browndict[c] = words elif catType == "between": for c in brown.categories(): words = NormalizeWords(brown.words(categories=c)) texts.append(words[:len(words) // 2]) texts.append(words[len(words) // 2:]) # build a dictionary for me to use later browndict[c + "1/2"] = words[:len(words) // 2] browndict[c + "2/2"] = words[len(words) // 2:] # create the corpora dictionary built from gensim corporadict = corpora.Dictionary(texts) # create a corpus for the training corpus = [] for line in texts: corpus.append(corporadict.doc2bow(line)) if choice == "tfidf": # create the tfidf model from our built corpus tfidf = TfidfModel(corpus=corpus) # build the similarity matrix similarityMatrix = MatrixSimilarity(corpus, num_features=len(corporadict)) elif choice == "word2vec": word2vec = Word2Vec(brown.sents()) # build term similiarity matrix from our models word-vector termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv) # build sparse similarity matrix sparseSimiliarityMatrix = SparseTermSimilarityMatrix( termSimilarityIndex, corporadict) # build similarity word-vector WV_SimilarityMatrix = SoftCosineSimilarity( corpus, sparseSimiliarityMatrix) maxes = {} if choice == "tfidf": # Print out the code keys = list(browndict.keys()) for i in range(len(keys) - 1): # Convert to a bag of words and to a tfidf vector, then query it. query_bow = corporadict.doc2bow(browndict[keys[i]]) query_tfidf = tfidf[query_bow] # Get the similarity of every cluster query_similarity = similarityMatrix[query_tfidf] for j in range(i + 1, len(query_similarity)): sim = query_similarity[j] print(keys[i], "and", keys[j], "have a similarity of:", sim) print("") elif choice == "word2vec": keys = list(browndict.keys()) for i in range(len(keys) - 1): # Convert to a bag of words and query it query_bow = corporadict.doc2bow(browndict[keys[i]]) # Get the similarity of every cluster query_similarity = WV_SimilarityMatrix[query_bow] for j in range(i + 1, len(query_similarity)): sim = query_similarity[j] print(keys[i], "and", keys[j], "have a similarity of:", sim) print("")
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 23 15:19:53 2018

@author: kkonakan
"""
import nltk.corpus as nc
from nltk.corpus import brown

fname = 'milton-paradise.txt'
guten = nc.gutenberg
print(guten.fileids())
print(len(guten.raw(fname)))
print(guten.sents(fname)[0:5])
print(guten.words(fname)[0:10])
print(brown.categories())
import nltk
import random
from nltk.corpus import brown

# All of the different categories that are tagged in the corpus
print(brown.categories())

# A random sentence from the "humor" category
total = []
sentences = brown.sents(categories='humor')
for sentence in sentences:
    sentence = ' '.join(sentence)
    total.append(sentence)
print('\n', random.choice(total))

# The category with the most words
categ_count = {}
count = []
for category in brown.categories():
    words = brown.words(categories=category)
    count.append(len(words))
    categ_count[category] = len(words)
count = sorted(count)
for category in categ_count:
    if categ_count[category] == count[-1]:
        print('\n', category)
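# The same "category with the most words" lookup can also be written directly with
# max() over the categories; shown here as an equivalent alternative to the counting
# loop above.
largest = max(brown.categories(), key=lambda c: len(brown.words(categories=c)))
print('\n', largest)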
from nltk.corpus import stopwords from cPickle import dump from pprint import pprint train_dict = nltk.defaultdict(list) test_dict = nltk.defaultdict(list) def FDtoDIC(fd): out_dict = nltk.defaultdict(float) for key in fd.keys(): out_dict[key] = fd[key] out_dict['N'] = fd.N() return out_dict for category in set(brown.categories()).\ difference(set(['humor', 'science_fiction'])): cat_files = brown.fileids(categories=category) random.shuffle(cat_files) size = int(len(cat_files) * 0.85) train, test = cat_files[:size], cat_files[size:] key_list = [] for f in train: temp = brown.open(f).read().split() temp = [entry.split('/')[0] for entry in temp] temp = [entry for entry in temp if entry \ not in stopwords.words('english')] train_dict[category].append(FDtoDIC(nltk.FreqDist(temp))) key_list.extend(train_dict[category][-1].keys()) # compute the averge sample for the given category key_list = set(key_list)
# Module 3: Corpus
# Corpus structure challenge
from nltk.corpus import brown

# print(brown.fileids())
fileid = 'cl08'
# text = brown.words(fileid)
# print(text)
print(" Num of chars :", len(brown.raw(fileid)))
print(" Num of words :", len(brown.words(fileid)))
print(" Num of sentences :", len(brown.sents(fileid)))
print(" Categories:", brown.categories(fileid))