def get_char_counts(openfile):
    # These are character counts, not word counts: Counter over every char
    char_counts = sorted(
        collections.Counter(c for line in openfile for c in line).items())
    counts = {}
    for char, count in char_counts:
        if char.isalpha() or char.isspace():
            counts[char] = float(count)
    counts[' '] = brown.raw().count(' ')
    corpus_size = float(len(brown.raw()))
    return corpus_size, counts
def process_brown():
    tokenizer = RegexpTokenizer(r'\w+')
    brown_toks = tokenizer.tokenize(brown.raw()[:50000])
    brown_toks = list(set(brown_toks))
    # list comprehension instead of map(), which is lazy in Python 3
    brown_toks = [tok.lower() for tok in brown_toks]
    return brown_toks
def brownFreqListNoStop():
    # Obtain the list of words, dropping English stop words
    brown_words = brown.raw().split(' ')
    englishstop = stopwords.words('english')
    filtered_words = [w for w in brown_words if w not in englishstop]
    num_filtered_words = len(filtered_words)
    print("We have " + str(num_filtered_words) + " brown filtered words")
    counter = 0
    brown_frequ = defaultdict(int)
    sleep(2)
    for word in filtered_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print("Progress : " + str((counter / float(num_filtered_words)) * 100) + " %")
    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(range(1, len(brown_frequ) + 1))
    c, alpha = powerLaw(brown_frequ, brown_rank)
    print("According to Zipf's law %.2f should be close to 1." % alpha)
    plotPowerLaws(
        brown_rank, brown_frequ, [c, c], [-1, -alpha],
        title="Relation between word rank and frequency for brown, no stop words",
        xlabel="Word Rank", ylabel="Word Frequency")
    return 0
def brownFreq():
    # Obtain the list of words
    brown_words = brown.raw().split(' ')
    num_brown_words = len(brown_words)
    print("We have " + str(num_brown_words) + " brown words")
    counter = 0
    brown_frequ = defaultdict(int)
    sleep(2)
    for word in brown_words:
        counter += 1
        brown_frequ[word] += 1
        if counter % 1000 == 0:
            print("Progress : " + str((counter / float(num_brown_words)) * 100) + " %")
    brown_frequ = sorted(brown_frequ.values(), reverse=True)
    brown_rank = np.array(range(1, len(brown_frequ) + 1))
    c, alpha = powerLaw(brown_frequ, brown_rank)
    plotPowerLaws(brown_rank, brown_frequ, [c, c], [-1, -alpha],
                  title="Relation between word rank and frequency for brown",
                  xlabel="Word Rank", ylabel="Word Frequency")
    return 0
def Main():
    db = Database()
    index = InvertedIndex(db)
    brown_list = brown.fileids()
    gutenberg_list = gutenberg.fileids()
    # Example document shape:
    #   {'id': '1', 'text': 'The big sharks of Belgium drink beer.'}
    i = 0
    for item in brown_list:
        documentTemp = {'id': str(i), 'text': brown.raw(item)}
        index.index_document(documentTemp)
        i += 1  # advance the counter so each document gets a unique id
    for item in gutenberg_list:
        documentTemp = {'id': str(i), 'text': gutenberg.raw(item)}
        index.index_document(documentTemp)
        i += 1
    while True:
        search_term = input("Enter term(s) to search: ")
        result = index.lookup_query(search_term.lower())
        for term in result.keys():
            for appearance in result[term]:
                # e.g. Belgium: {docId: 1, frequency: 1}
                document = db.get(appearance.docId)
                print(highlight_term(appearance.docId, term, document['text']))
                print("-----------------------------")
def select_genres(n):
    '''
    Selects genres with more than n files. Returns the raw data and the genre
    of each file in the selected genres as two parallel lists.

    Parameters
    ----------
    n: An integer.

    Returns
    -------
    A tuple of (raw, genres)
    raw: A list of raw file contents.
    genres: A list of genre labels.
    '''
    genres = []
    raw = []
    # Collect the genre label and raw data for every file whose genre has
    # more than n files
    for fileid in brown.fileids():
        for genre in brown.categories(fileid):
            if len(brown.fileids(genre)) > n:
                genres.append(genre)
                raw.append(brown.raw(fileid))
    return raw, genres
def _init():
    global total
    global char_counts
    if total == 256:
        from nltk.corpus import brown
        for char in brown.raw():
            char_counts[ord(char)] += 1
        total = float(sum(char_counts))
def __init__(self):
    self.words = list(word.lower() for word in brown.words())
    #self.words = brown.words()
    #self.text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
    self.tagged_words = brown.tagged_words()
    self.tagfreq = {}
    self.raw = brown.raw()
def ari(cat):
    """Compute the Automated Readability Index for a Brown category."""
    num_chars = len(brown.raw(categories=cat))
    num_words = len(brown.words(categories=cat))
    num_sents = len(brown.sents(categories=cat))
    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents
    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
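# A minimal usage sketch for ari() above (assumes nltk and the Brown corpus
# are available); 'lore' (popular lore) and 'learned' are Brown section names.
for cat in ('lore', 'learned'):
    print(cat, round(ari(cat), 2))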
def learn(self, listofsentences=[], n=126733):
    self.bf = BloomFilter(1090177, 4)
    i = 0
    # brown.raw() is a single string, so iterating over it yields characters;
    # brown.sents() yields the word lists this loop expects.
    for sent in brown.sents():
        if i >= n:
            break
        for word in sent:
            self.bf.Insert(word.lower())
        i += 1
    self.bf.PrintStats()
def ch03_29_reading_difficulty():
    sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
    from nltk.corpus import brown
    for category in brown.categories():
        raw = brown.raw(categories=category)
        words = len(brown.words(categories=category))
        sentences = len(sent_tokenizer.tokenize(raw))
        letters_per_word = (len(raw) - words) / words  # raw chars minus one space per word
        words_per_sentence = words / sentences
        # ARI = 4.71 * letters_per_word + 0.5 * words_per_sentence - 21.43
        reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43
        print(category, reading_level)
def readBrownDataset():
    nltk.download("brown")
    documents = brown.fileids()
    docs = []
    for doc in documents:
        if len(brown.categories(doc)) == 1:
            d = brown.raw(doc).replace("\n", " ")
            # Strip the POS suffix from each token, e.g.
            # "The/at Fulton/np-tl County/nn-tl ..." -> "The Fulton County ..."
            d = re.sub(r"/[A-Za-z0-9_-]+ ", " ", d)
            docs.append(d)
    return docs
def readability(input):
    letters = brown.raw(categories=input)
    words = brown.words(categories=input)
    sentences = brown.sents(categories=input)
    letters_per_word = len(letters) / len(words)
    words_per_sentence = len(words) / len(sentences)
    ari_score = (4.71 * float(letters_per_word)) + (0.5 * float(words_per_sentence)) - 21.43
    print("Letters per word: %s" % letters_per_word)
    print("Words per sentence: %s" % words_per_sentence)
    print("ARI Score: %s" % ari_score)
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def get_brown_data(useN=100):
    try:
        fileids = brown.fileids()
    except LookupError:
        import nltk
        nltk.download('brown')
        fileids = brown.fileids()
    fileids = fileids[:useN]
    texts = [brown.raw(fid) for fid in fileids]
    fileids = [os.path.splitext(fid)[0] for fid in fileids]
    return texts, fileids
def Automated_Readability_Index29(section):
    char_count = 0
    sent = len(brown.sents(categories=section))
    words = len(brown.words(categories=section))
    raw_text = brown.raw(categories=section)
    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1
    uw = char_count / float(words)  # average letters per word
    us = words / float(sent)        # average words per sentence
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def exercise29():
    '''
    Readability measures are used to score the reading difficulty of a text,
    for the purposes of selecting texts of appropriate difficulty for
    language learners. Let us define avgW to be the average number of letters
    per word, and avgS to be the average number of words per sentence, in a
    given text. The Automated Readability Index (ARI) of the text is defined
    to be: 4.71 * avgW + 0.5 * avgS - 21.43. Compute the ARI score for
    various sections of the Brown Corpus, including section f (popular lore)
    and j (learned). Make use of the fact that nltk.corpus.brown.words()
    produces a sequence of words, while nltk.corpus.brown.sents() produces a
    sequence of sentences.
    '''
    for category in brown.categories():
        chars = brown.raw(categories=category)
        words = brown.words(categories=category)
        sentences = brown.sents(categories=category)
        avgW = len(chars) / len(words)      # average number of letters per word
        avgS = len(words) / len(sentences)  # average number of words per sentence
        print(category, "Avg Words", avgW, "Avg Sentences", avgS)
        print("ARI", (4.71 * avgW) + (0.5 * avgS) - 21.43)
def word_frequencies(contents):
    toktok = ToktokTokenizer()
    string_corpus = brown.raw()
    # Frequencies for each file
    freq_dists = []  # renamed from `list`, which shadowed the builtin
    for file in contents.keys():
        print("Tokenising", file)
        tokenised = [toktok.tokenize(sent) for sent in sent_tokenize(string_corpus)]
        fdist = Counter(chain(*tokenised))
        freq_dists.append(fdist)
    # Combine keys into one set, eliminating duplicates
    print("Making frequency distribution of all words that we care about.")
    keys = []
    for sublist in freq_dists:
        keys += sublist
    keys = set(keys)
    # Build combined frequency dict.
    # Tuple of Brown POS identifiers for connectives and other common words.
    # This is far from the best way to do this, but I couldn't find the
    # documentation for these identifiers.
    unwanted = ('at', 'to', 'in', 'ma', 'bez', 'ppss', 'pp$', 'dt', 'bedz',
                'hv', 'cc', 'cs', 'hvd', 'wdt', '*', 'bed', 'ber', 'be',
                'np$', 'ppo', 'pps', 'abn', 'cd', 'md', 'ben', 'wps', 'vbd',
                'jj', 'rb', 'do', 'ql', 'dts', 'rp', 'in-tl', 'ex', 'i',
                'dti', 'dod', 'wrb', 'hvz', 'nn$')
    frequencies = {}
    for key in keys:
        total = 0
        if (key[0] not in string.punctuation) and (
                key.split('/')[-1] not in unwanted):  # drop unwanted tokens
            for sublist in freq_dists:
                if key in sublist.keys():
                    total += sublist[key]
            frequencies[key.split('/')[0].lower()] = total
    print("Total words (that we care about): " + str(len(frequencies.keys())))
    return frequencies
def get_features(liste):
    features_file = {}
    for fileid in liste:
        features_file[fileid] = {}  # initialise the file's features
        # Use our my_tools library to add word-length stats
        words = brown.words(fileid)
        stats_mots = mt.get_stats_longueur(words)
        for feature, valeur in stats_mots.items():
            features_file[fileid][feature] = valeur
        # Then sentence-level stats
        stats_phrases = mt.get_types_phrases(brown.raw(fileid))
        for feature, valeur in stats_phrases.items():
            features_file[fileid][feature] = valeur
        adverbes = mt.get_effectif_adverbes(words)
        for feature, valeur in adverbes.items():
            features_file[fileid][feature] = valeur
        # ... the input varies, but the output is always a dict
        # {"feature_name": value, ...}
        print("-> Extracted features:", list(features_file[fileid].keys())[:20], "...")
    return features_file
def demo(text=None):
    '''
    Use the boundaries together with the pseudo-sentences to evaluate the
    quality of the segmentation.
    :param text:
    :return:
    '''
    from nltk.corpus import brown
    from matplotlib import pylab
    tt = TextTilingTokenizer(w=40, k=20, demo_mode=True)
    with open('flypaper_short.txt', 'r') as file:
        text = file.read()
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    print(b)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def demo(text=None):
    from nltk.corpus import brown
    import pylab
    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
    """s = tt.tokenize(text)
    FILE = open("tiled", "w")
    FILE.writelines(s)
    FILE.close()"""

# if __name__ == '__main__':
#     content = open('toTile', 'r').read()
#     demo(content)
from nltk.corpus import gutenberg, webtext, nps_chat, brown, reuters, inaugural
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd

# Pick out the first of these texts — Emma by Jane Austen — and give it a short name, gutenberg_raw
gutenberg_raw = gutenberg.raw("austen-emma.txt")

# Pick out the words from the webtext corpus and give them a short name, webtext_words
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from the nps_chat corpus and name it nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from the brown corpus and name it brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the words from the reuters corpus and name them reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the words from the inaugural corpus and name them inaugural_words
inaugural_words = inaugural.words()
print(inaugural_words)

# Create a word tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize the gutenberg text and assign the result to a variable named tokens
tokens = tokenizer.tokenize(gutenberg_raw)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "o:v:c:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(' [-o <datafile>] [-v <vocabfile>]')
        sys.exit(2)
    outputfile = "training.dat"
    vocabfile = "vocab.txt"
    corpus = "20newsgroups"
    for opt, arg in opts:
        if opt == '-o':
            outputfile = arg
        elif opt == '-v':
            vocabfile = arg
        elif opt == '-c':
            corpus = arg
    tokenizer = RegexpTokenizer(r'[a-z]+')
    id_dict = {}
    nterms = 0
    wordlist = []
    data = []
    if corpus == "20newsgroups":
        #categories = []
        #categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
        categories = ['talk.politics.guns', 'soc.religion.christian',
                      'sci.electronics', 'rec.sport.baseball', 'comp.graphics']
        if len(categories) == 0:
            data = fetch_20newsgroups(subset='train', shuffle=True,
                                      random_state=42).data
        else:
            data = fetch_20newsgroups(subset='train', categories=categories,
                                      shuffle=True, random_state=42).data
    elif corpus == "brown":
        for fileid in brown.fileids():
            data.append(brown.raw(fileid))
    elif corpus == "reuters":
        for fileid in reuters.fileids():
            data.append(reuters.raw(fileid))
    else:
        for dirname, dirnames, filenames in os.walk(corpus):
            for filename in filenames:
                inpfile = os.path.join(dirname, filename)
                with io.open(inpfile, "r", errors='ignore') as fp:
                    data.append(fp.read())
    stemmer = SnowballStemmer("english")
    # Newsgroup header prefixes, matched case-insensitively below
    headers = ("from:", "subject:", "reply-to:", "organization:", "lines:",
               "nntp-posting-host:", "x-newsreader:", "distribution:",
               "keywords:", "article-i.d.:", "supersedes:", "expires:",
               "summary:", "originator:")
    with io.open(outputfile, "w") as output:
        for i in range(len(data)):
            lines = data[i].split('\n')
            fd = {}
            termsdoc = 0
            for line in lines:
                line = line.lower()
                if line.startswith(headers):
                    continue
                splits = tokenizer.tokenize(line)
                filtered_words = [word for word in splits
                                  if word not in stopwords.words('english')]
                filtered_words = [word for word in filtered_words if len(word) > 2]
                filtered_words = [word for word in filtered_words
                                  if word not in ["edu", "com", "subject", "writes", "mil"]]
                for word in filtered_words:
                    try:
                        id = id_dict[word]
                    except KeyError:
                        id_dict[word] = nterms
                        id = nterms
                        nterms = nterms + 1
                        wordlist.append(word)
                    try:
                        fd[id] = fd[id] + 1
                    except KeyError:
                        fd[id] = 1
                    termsdoc = termsdoc + 1
            outline = str(termsdoc)
            for idterm in fd:
                outline = outline + " " + str(idterm) + ":" + str(fd[idterm])
            output.write(outline + "\n")
    with open(vocabfile, "w") as output:
        for val in wordlist:
            output.write(str(val) + "\n")
"Word rank inversely proportional to word frequency (Gutenberg)", "Word rank", "Word frequency") for w in gutenWordsFiltered: gutenFilteredFreq[w] += 1 gutenFilteredFreq = sorted(gutenFilteredFreq.values(), reverse=True) # Filtered word ranks gutenFilteredRank = numpy.array(xrange(1, len(gutenFilteredFreq) + 1)) c, a = powerLaw(gutenFilteredFreq, gutenFilteredRank) plotPowerLaws( gutenFilteredRank, gutenFilteredFreq, [c, c], [-1, -a], "Word rank inversely proportional to word frequency (Gutenberg without stopwords)", "Word rank", "Word frequency") # Brown corpus brownWords = brown.raw().split(" ") # Without stopwords brownWordsFiltered = [w for w in brownWords if not w in stopWords] # Frequencies brownFreq = defaultdict(int) brownFilteredFreq = defaultdict(int) for w in brownWords: brownFreq[w] += 1 brownFreq = sorted(brownFreq.values(), reverse=True) # Word ranks brownRank = numpy.array(xrange(1, len(brownFreq) + 1)) c, a = powerLaw(brownFreq, brownRank) plotPowerLaws(brownRank, brownFreq, [c, c], [-1, -a], "Word rank inversely proportional to word frequency (Brown)", "Word rank", "Word frequency")
""" #! python import nltk from nltk.corpus import treebank from nltk.corpus import brown from nltk.corpus import nps_chat from nltk.corpus import conll2000 import string from sklearn.feature_extraction.text import TfidfVectorizer #corpora brown = brown.raw() nps_chat = nps_chat.raw() conll2000 = conll2000.raw() treebank = treebank.raw() default=treebank; operational= brown; stemmer = nltk.stem.porter.PorterStemmer() remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) def stem_tokens(tokens): return [stemmer.stem(item) for item in tokens]
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import brown
import jieba

with open('flypaper_short.txt', 'r') as file:
    comments = file.read()

# Segment the text into words
seg_list = jieba.cut(comments)
tokenized_comments = " ".join(seg_list)
print(tokenized_comments)

#ttt = nltk.tokenize.TextTilingTokenizer(demo_mode=True)
ttt = nltk.tokenize.TextTilingTokenizer()
text = brown.raw()[:10000]
#print(text)

#gap_scores, smooth_scores, depth_scores, segment_boundaries = ttt.tokenize(tokenized_comments)
result = ttt.tokenize(tokenized_comments)
print(result)
'''
print(gap_scores)
print(smooth_scores)
print(depth_scores)
print(segment_boundaries)
'''
import re
import string

from nltk.corpus import brown
from nltk.util import ngrams

from lib import constants
from lib import huffman_tree

# HuffmanTree object.
ht = huffman_tree.HuffmanTree()
# A dict with a bag of tags for each word.
words = {}
# A dict with a bag of tags for each Huffman-encoded word.
words_huffman_encoded = {}
# All raw text of the Brown corpus.
txt = brown.raw()
# Raw sentences from the Brown corpus.
tagged_sentences = txt.split('./.')

# Get all tags for all words
for tagged_sentence in tagged_sentences:
    tagged_sentence = tagged_sentence.strip()
    if re.match(r'[0-9 ]+$', tagged_sentence):
        continue
    for word in tagged_sentence.split():
        word = word.strip()
        word = word.split('/')
        if len(word) < 2:
            continue
        word[0] = str(word[0]).translate(string.maketrans("", ""),
# Module 3: Corpus
# Corpus structure challenge
from nltk.corpus import brown

# print(brown.fileids())
fileid = 'cl08'
# text = brown.words(fileid)
# print(text)
print(" Num of chars :", len(brown.raw(fileid)))
print(" Num of words :", len(brown.words(fileid)))
print(" Num of sentences :", len(brown.sents(fileid)))
print(" Categories:", brown.categories(fileid))
# Jonathan Monreal

import re
import nltk
from nltk.corpus import brown

raw = brown.raw(categories='humor')
tokens = re.findall(r'\s(wh[\w]+)', raw)

for word in tokens:
    print(word)
len(brown.fileids())  # 500 sources, each file is a source.

# In[ ]:

print(brown.fileids()[:100])  # First 100 sources.

# You can access the raw files with:

# In[ ]:

print(brown.raw('cb01').strip()[:1000])  # First 1000 characters.

# <br>
# You will see that **each word comes with a slash and a label** and, unlike normal text, **punctuation is separated from the word that comes before it**, e.g.
#
# > The/at General/jj-tl Assembly/nn-tl ,/, which/wdt adjourns/vbz today/nr ,/, has/hvz performed/vbn in/in an/at atmosphere/nn of/in crisis/nn and/cc struggle/nn from/in the/at day/nn it/pps convened/vbd ./.
#
# <br>
# We also see that **each sentence is separated by a newline**:
#
# > There/ex followed/vbd the/at historic/jj appropriations/nns and/cc budget/nn fight/nn ,/, in/in which/wdt the/at General/jj-tl Assembly/nn-tl decided/vbd to/to tackle/vb executive/nn powers/nns ./.
# >
# > The/at final/jj decision/nn went/vbd to/in the/at executive/nn but/cc a/at way/nn has/hvz been/ben opened/vbn for/in strengthening/vbg budgeting/vbg procedures/nns and/cc to/to provide/vb legislators/nns information/nn they/ppss need/vb ./.
#
# <br>
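# A small sketch of how the word/tag format can be taken apart: split on
# whitespace for tokens, then on the last "/" for (word, tag) pairs. (The
# tagged_words() accessor does this for you; this is just for illustration.)

# In[ ]:

pairs = [tok.rsplit('/', 1) for tok in brown.raw('cb01').split()[:8]]
print(pairs)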
gutenberg.raw(fileid)
# Words:
gutenberg.words(fileid)
# Sentences:
gutenberg.sents(fileid)

from nltk.tokenize import sent_tokenize
tok = sent_tokenize(text)
for x in range(5):
    print(tok[x])

from nltk.corpus import brown
brown.categories()
text = brown.raw(categories='news')

import nltk
nltk.download('reuters')
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()
fileid = 'test/16399'
text = reuters.raw(fileid)
text1 = reuters.raw(categories='zinc')
reuters.categories(fileid)

import nltk
nltk.download('movie_reviews')
# ngrams_stats_bi_rev = pickle.load('ngrams_stats_bi_rev.pkl')
else:
    # initialise
    ngrams_stats_tri = {}
    ngrams_stats_bi = {}
    ngrams_stats_bi_rev = {}
    ngrams_stats_tri_rev = {}

'''
#class
ngrams_stats_tri = {}
ngrams_stats_bi = {}
ngrams_stats_bi_rev = {}
ngrams_stats_tri_rev = {}
vocab = Counter()

# choose sample
sample1 = brown.raw()
sample2 = gutenberg.raw()
sample3 = inaugural.raw()
sample5 = nltk.corpus.state_union.raw()
sample4 = genesis.raw('english-web.txt')
sample = sample1 + sample2 + sample3 + sample4 + sample5

vocab, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev, ngrams_stats_bi_rev = mainTrain(
    vocab, sample, ngrams_stats_tri, ngrams_stats_bi, ngrams_stats_tri_rev,
    ngrams_stats_bi_rev)
'''

# pickle requires binary mode in Python 3
with open('ngrams_stats_tri.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri, hfile)
with open('ngrams_stats_bi.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_bi, hfile)
with open('ngrams_stats_tri_rev.pkl', 'wb') as hfile:
    pickle.dump(ngrams_stats_tri_rev, hfile)
#%%
from nltk.corpus import reuters

# %%
articles = [" ".join(reuters.words(f)) for f in reuters.fileids()]
with open('reuters.txt', 'w') as f:
    for article in articles:
        f.write(article)
        f.write('\n\n')

# %%
fileids = reuters.fileids()
with open('reuters.txt', 'w') as f:
    for file_id in reuters.fileids():
        f.write(reuters.raw(file_id))
        f.write('\n\n')

# %%
from nltk.corpus import brown
print(brown.raw(categories='learned'))

# %%
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import brown

text = gutenberg.raw()
text_tokens = nltk.word_tokenize(text)
frecList_gutenberg = FreqDist(text_tokens)

text_brown = brown.raw()
text_brown_tokens = nltk.word_tokenize(text_brown)
frecList_brown = FreqDist(text_brown_tokens)
import nltk
from nltk import *

with open("dracula.txt") as f:
    tokens = nltk.word_tokenize(f.read())

text = nltk.Text(tokens)
alpha_text = [word for word in text
              if word.isalpha() and len(word) > 5
              and word[0].isupper() and word[1:].islower()]
print(FreqDist(alpha_text).most_common(5))

from nltk.corpus import reuters, brown

print(brown.categories())
fileid = brown.fileids(brown.categories()[-1])
raw = brown.raw(fileid)
print(raw[:50])

cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
num_chars = len(gutenberg.raw(fileid))
num_words = len(gutenberg.words(fileid))
num_sents = len(gutenberg.sents(fileid))
num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
avg_word_len = round(num_chars / num_words)
avg_sent_len = round(num_words / num_sents)
lexical_diversity = round(num_words / num_vocab)
print(fileid, " | ", num_chars, " | ", num_words, " | ", num_sents, " | ",
      num_vocab, " | ", avg_word_len, " | ", avg_sent_len, " | ",
      lexical_diversity)

for fileid in webtext.fileids():
    print(fileid)

brown.categories()
brown.raw("cr09")

# Stylistics - systematic differences between genres,
# e.g. in the use of modal verbs [can could may might must will]
news_text = brown.words(categories='news')
hobbies_text = brown.words(categories='hobbies')
news_text_fdist = nltk.FreqDist(w.lower() for w in news_text)
hobbies_text_fdist = nltk.FreqDist(w.lower() for w in hobbies_text)

modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", news_text_fdist[m], " | ", hobbies_text_fdist[m])

event_words = ["who", "what", "when", "where", "why"]
for m in event_words:
    print(m, ":", news_text_fdist[m], " | ", hobbies_text_fdist[m])
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract

extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
    with open('./data/terms.txt', 'w') as o:
        o.write("Term\tOccurrences\tStrength\n")
        for term in extractor(f.read() + gutenberg.raw() + abc.raw() +
                              reuters.raw() + brown.raw() + movie_reviews.raw()):
            o.write("\t".join(map(str, term)) + "\n")
import re
import collections

from nltk.corpus import inaugural, reuters, brown, gutenberg
from itertools import product as iter_product

def words(text):
    return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(inaugural.raw() + reuters.raw() + brown.raw() + gutenberg.raw()))

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
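# A common way to finish this Norvig-style corrector (a sketch, not part of
# the original snippet): prefer known candidates at edit distance 0, then 1,
# then 2, falling back to the word itself, and pick the most frequent one.
def known(words):
    return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = (known([word]) or known(edits1(word)) or
                  known_edits2(word) or [word])
    return max(candidates, key=NWORDS.get)

# e.g. correct('speling') should yield 'spelling', depending on the trained counts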