def main():
    start = timer()
    tokensDF = spark.read.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Attack_Westminster_big_tokenized.json")
    freqTokensDF = spark.read.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit1-words.json")
    custom_stopwords = ["``", "''", "'s", "said", "could", "also", "news", "--", "..."]
    stop_words = set(stopwords.words('english') + list(string.punctuation) + custom_stopwords)

    # top_wordnet = set(freqTokensDF.rdd.flatMap(lambda x: get_synsets(x.word)).collect())
    # counter = Counter(tokensDF.rdd.flatMap(lambda x: x.tokens).flatMap(get_synsets).filter(lambda x: x in top_wordnet).collect())
    filtered_words = tokensDF.rdd.flatMap(lambda x: x.tokens_lower).collect()
    # print("length of filtered_words: ")
    # print(len(filtered_words))
    # print(filtered_words[0:10])

    # freq_synset_count = counter.most_common(100)
    # freq_synset = [word for word, _ in freq_synset_count]
    freq_words = freqTokensDF.rdd.map(lambda x: x.word).collect()
    # print("top 10 words from unit 1:")
    # print(freq_words[0:10])

    baseline_lower = [x.lower() for x in nltk.Text(brown.words() + words.words() + state_union.words())
                      if x.lower() not in stop_words]
    word_usage = get_percent_usage(freq_words, filtered_words, baseline_lower, key="event")
    # print("most_common type: " + str(type(freq_synset_count)))  # list
    # print("word_usage type: " + str(type(word_usage)))  # list

    # countDF = spark.createDataFrame(freq_synset_count, ['synset', 'count'])
    # countDF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_count_big.json", mode="overwrite")
    # count_usage_DF = spark.createDataFrame(word_usage, ['word', 'event', 'baseline', 'diff'])
    count_usage_DF = spark.createDataFrame(word_usage)
    count_usage_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_freq_big.json", mode="overwrite")
    # count_usage_DF.write.csv("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_freq_big.csv", mode="overwrite")

    # lemma_wordnet = set(freqTokensDF.rdd.flatMap(lambda x: get_lemmas(x.word)).collect())
    # lemma_counter = Counter(tokensDF.rdd.flatMap(lambda x: x.tokens).flatMap(get_lemmas).filter(lambda x: x in lemma_wordnet).collect())
    lemmas = get_lemma_set(freq_words)
    # print('lemmas: ' + str(type(lemmas)))  # list
    # print('lemmas[0]: ' + str(type(lemmas[0])))  # tuple
    lemma_usage = get_lemma_percent_usage(lemmas, filtered_words, baseline_lower, key='event')
    # print('lemma_usage: ' + str(type(lemma_usage)))  # list

    # lemma_countDF = spark.createDataFrame(lemma_counter.most_common(100), ['lemma', 'count'])
    # TODO: Error: TypeError: not supported type: <type 'set'>
    lemma_usage_DF = spark.createDataFrame(lemma_usage)
    lemma_usage_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_lemma_freq_big.json", mode="overwrite")

    lemma_diff = get_lemma_diff(lemma_usage, key='diff')
    lemma_diff_DF = spark.createDataFrame(lemma_diff)
    lemma_diff_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_final_result.json", mode="overwrite")

    end = timer()
    print('time elapsed: ' + str(end - start))
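# A hedged sketch of a workaround for the TypeError noted in the TODO above:
# Spark's createDataFrame cannot infer a schema from Python sets, so any
# set-valued field should be converted to a sorted list first. `set_rows`
# below is a hypothetical list of (word, set_of_lemmas) pairs, not a variable
# from the original code.
# list_rows = [(word, sorted(lemma_set)) for word, lemma_set in set_rows]
# lemma_countDF = spark.createDataFrame(list_rows, ['word', 'lemmas'])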
def chapter2_exercise4():
    # Read in the texts of the State of the Union addresses, using the
    # state_union corpus reader. Count occurrences of men, women, and people
    # in each document. What has happened to the usage of these words over time?
    files = state_union.fileids()
    men = dict()
    women = dict()
    people = dict()
    for index, file in enumerate(files):
        words = sorted(state_union.words(fileids=[file]))
        men[file] = words.count("men")
        women[file] = words.count("women")
        people[file] = words.count("people")
        print(file[:4], men[file], women[file], people[file], end=" ")
        if index % 6 == 5:
            print()
    print("\nMEN")
    for file, men_c in men.items():
        print(file[:4], men_c)
    print("\nWOMEN")
    for file, women_c in women.items():
        print(file[:4], women_c)
    print("\nPEOPLE")
    for file, person_c in people.items():
        print(file[:4], person_c)
    print("men:", sum(men.values()))
    print("women:", sum(women.values()))
    print("people:", sum(people.values()))
def main(): print("Start: Initialize Variables") start = timeit.default_timer() sentences_str = fetch_sentences(INPUT_FILE) # TODO: After converting to lowercase, we may not know whether a word is a proper noun. word_tokens = word_tokenize(sentences_str.lower()) stop_words = set( stopwords.words('english') + list(string.punctuation) + custom_stopwords) filtered_words = [w for w in word_tokens if w not in stop_words] freq_words_with_counts = get_most_frequent_words(filtered_words, 500) freq_words = [word for word, _ in freq_words_with_counts] baseline_lower = [ x.lower() for x in nltk.Text(brown.words() + words.words() + state_union.words()) if x.lower() not in stop_words ] end = timeit.default_timer() print("End: Initialize Variables (took: %0.2fs)" % (end - start)) ### Used to determine the typical length of words for both baseline and our words. # print("Start: Plot Conditional Frequency Distributions") # start = timeit.default_timer() # plot_cond_freq_dist(word_tokens, baseline_lower) # end = timeit.default_timer() # print("End: Plot Conditional Frequency Distributions (took: %0.2fs)" % (end - start)) ### Used to find the most common words in our dataset that help identify our dataset. print("Start: Word Frequency Usage Calculation") start = timeit.default_timer() word_usage = get_percent_usage(freq_words, filtered_words, baseline_lower, key="event") with open('result/unit2_word_freq.csv', 'w') as f: f.write("word, event, baseline, diff\n") for word in word_usage: f.write( '%s,%.4f,%.4f,%.4f\n' % (word['word'], word['event'], word['baseline'], word['diff'])) end = timeit.default_timer() print("End: Word Frequency Usage Calculation (took: %0.2fs)" % (end - start)) ### Used to find the most frequent words when counting their synonyms as well. print("Start: Lemma Frequency Usage Calculation") start = timeit.default_timer() lemmas = get_lemma_set(freq_words) lemma_usage = get_lemma_percent_usage(lemmas, filtered_words, baseline_lower, key="event") with open('result/unit2_lemma_freq.csv', 'w') as f: f.write("word, event, baseline, diff\n") for lemma in lemma_usage: f.write('%s,%.4f,%.4f,%.4f\n' % (lemma['word'], lemma['event'], lemma['baseline'], lemma['diff'])) end = timeit.default_timer() print("End: Lemma Frequency Usage Calculation (took: %0.2fs)" % (end - start))
def state_union_men_stat():
    # Note: startswith() also counts prefix matches, e.g. 'men' matches
    # 'mention' and 'mental', so these tallies overestimate the bare words.
    cfd = nltk.ConditionalFreqDist(
        (target, year[:4])
        for year in state_union.fileids()
        for w in state_union.words(year)
        for target in ['men', 'women', 'people']
        if w.lower().startswith(target))
    cfd.plot()
def all_documents():
    documents = []
    for document in state_union.fileids():
        text = ""
        for word in state_union.words(document):
            text = text + " " + word
        documents.append((text, extract_president(document)))
    return documents
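# Hypothetical sketch of the extract_president() helper assumed above (not in
# the original source): state_union fileids look like '1945-Truman.txt', so
# the president's name can be sliced out between the year and the extension.
def extract_president(fileid):
    # '1945-Truman.txt' -> 'Truman'
    return fileid[5:-4]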
def get_tf(filename):
    words = state_union.words(filename)
    freq = {}
    for word in words:
        word = word.lower()
        if word not in stop and word_regex.match(word):
            if word in freq:
                freq[word] += 1
            else:
                freq[word] = 1
    return freq
def get_all_words(filenames):
    all_words = set()
    for filename in filenames:
        file_word_list = state_union.words(filename)
        file_word_set = set()
        for word in file_word_list:
            word = word.lower()
            if word not in stop and word_regex.match(word):
                file_word_set.add(word)
        all_words |= file_word_set
    return all_words
def get_pos_tf(filename):
    words = state_union.words(filename)
    pos_words = pos_tag(words)
    freq = {}
    for (word, pos) in pos_words:
        word = word.lower()
        if word not in stop and word_regex.match(word):
            word_pos = word + '/' + pos
            if word_pos in freq:
                freq[word_pos] += 1
            else:
                freq[word_pos] = 1
    return freq
def get_idf(filenames, feature_words):
    idf = {}
    file_words = {}
    for filename in filenames:
        # Lowercase here so membership tests match the lowercased feature
        # words produced by get_all_words() above.
        file_words[filename] = set(w.lower() for w in state_union.words(filename))
    for feature_word in feature_words:
        idf[feature_word] = 0
        for filename in filenames:
            if feature_word in file_words[filename]:
                idf[feature_word] += 1
        if idf[feature_word] == 0:  # smoothing
            idf[feature_word] = 1
        idf[feature_word] = math.log(len(filenames) / idf[feature_word])
    return idf
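# A minimal sketch (assuming the get_tf, get_all_words, and get_idf helpers
# above, plus the module-level `stop` set and `word_regex` they rely on)
# showing how the pieces might compose into per-document tf-idf scores.
def get_tfidf(filenames):
    feature_words = get_all_words(filenames)
    idf = get_idf(filenames, feature_words)
    tfidf = {}
    for filename in filenames:
        tf = get_tf(filename)
        tfidf[filename] = {w: count * idf[w] for w, count in tf.items() if w in idf}
    return tfidf

# Example: the five highest-scoring words of one address.
# scores = get_tfidf(state_union.fileids())
# print(sorted(scores['1946-Truman.txt'].items(), key=lambda kv: kv[1], reverse=True)[:5])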
def ex4():
    from nltk.corpus import state_union
    tags = ["men", "women", "people"]
    # for fileid in state_union.fileids():
    #     words = state_union.words(fileid)
    #     fdist = nltk.FreqDist([w.lower() for w in words])
    #     print fileid + ": ",
    #     for tag in tags:
    #         print tag + "=" + str(fdist[tag]) + " ",
    #     print
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[0:4])
        for fileid in state_union.fileids()
        for w in state_union.words(fileid)
        for target in tags
        if w.lower() == target)
    cfd.plot()
def __init__(self, corpus, n, maxword):
    if n < 1 or maxword < 1:
        raise Exception("Silly noodle, negatives aren't fun")
    # Special word
    self.SPECIALWORD = "??"
    # Corpus
    self.corpus = corpus
    # Number of grams
    self.numberGrams = n
    # Number of words to consider
    self.maxNumberGrams = maxword
    # Lists of words in each corpus
    self.listOfTotalWords = self.corpus.words()
    self.listOfSecondCorpusWords = reuters.words()
    self.listofThirdCorpusWords = state_union.words()
    # NOTE: this next assignment overwrites the state_union words above;
    # a fourth variable was likely intended in the source.
    self.listofThirdCorpusWords = gutenberg.words()
    # Dictionaries of word counts
    self.dictionaryOfWords = Counter(self.listOfTotalWords)
    self.dictionaryOfWordsSecondCorpus = Counter(
        self.listOfSecondCorpusWords)
    # The maxword most common words
    self.listOfCommonWords = self.dictionaryOfWords.most_common(maxword)
    # Make sure our special word is special (i.e. unused in the corpus)
    while self.SPECIALWORD in self.dictionaryOfWords:
        self.SPECIALWORD = self.SPECIALWORD + "?"
    # We have to use a dictionary here because a set cannot
    # house elements of length one, which, for example, a period can
    # be one of our most common words. Therefore we made a dictionary
    # with literally a dummy value.
    self.dictOfCommonWords = dict()
    for word, _count in self.listOfCommonWords:
        self.dictOfCommonWords[word] = "dummy"
    # The new listing of words after we replace all the undesired words
    self.newListingOfWords = (['.'] + self.listOfTotalWords +
                              self.listOfSecondCorpusWords +
                              self.listofThirdCorpusWords)
    """
    for word in self.listOfTotalWords:
        if word in self.dictOfCommonWords:
            self.newListingOfWords.append(word)
        else:
            self.newListingOfWords.append(word)
            # self.newListingOfWords.append(self.SPECIALWORD)
    """
    # Length of newListingOfWords
    self.lengthOfNewListingOfWords = len(self.newListingOfWords)
    # The n-grams of the new listing of words
    self.ngramsOfNewList = ngrams(self.newListingOfWords, self.numberGrams)
    # Number of occurrences of each particular gram
    self.numberOccurancesOfGrams = Counter(self.ngramsOfNewList)
    # All the grams of smaller size, and their counts
    self.gramsOfSmaller = ngrams(self.newListingOfWords, self.numberGrams - 1)
    self.gramsOfSmallerCounter = Counter(self.gramsOfSmaller)
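# A hedged sketch (hypothetical method, not part of the original class) of
# how the two Counters built above could back a maximum-likelihood n-gram
# probability estimate: count(gram) / count(context).
def mle_prob(self, gram):
    context_count = self.gramsOfSmallerCounter[gram[:-1]]
    if context_count == 0:
        return 0.0
    return self.numberOccurancesOfGrams[gram] / float(context_count)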
# b print("Words longer than 4 characters:") for word in words: if len(word) > 4: print(word, end=" ") print("\n") # Exercise 2 # a files = list(state_union.fileids()) terms = ["men", "women", "people"] statistics = nltk.ConditionalFreqDist((file, word) for file in state_union.fileids() for word in state_union.words(file) for term in terms if word.lower() == term) statistics.tabulate(conditions=files, samples=terms) # b years_raw = sorted(list(set([int(year[:4]) for year in state_union.fileids()]))) years = [str(year) for year in years_raw] year_statistics = nltk.ConditionalFreqDist( (word.lower(), fileid[:4]) for fileid in state_union.fileids() for word in state_union.words(fileid) for term in terms if word.lower() == term) year_statistics.plot() # More women over time, a lot of people in 1995 and 1946, more or less stable amount of men.
# (fragment: these first lines continue find_language(wordTested) from earlier)
        latinLanguages.append(language)
    languageContains = list()
    for latinlanguage in latinLanguages:
        if wordTested in udhr.words(latinlanguage):
            languageContains.append(latinlanguage)
    return languageContains

print("According to corpus.udhr the word 'war' is used in languages:")
print(find_language("war"))
print()

# EXERCISE 4
numMen = 0
numWomen = 0
numPeople = 0
for word in state_union.words():
    # print(word)
    # Compare case-insensitively so capitalized occurrences are counted too.
    if word.lower() == "people":
        numPeople += 1
    if word.lower() == "men":
        numMen += 1
    if word.lower() == "women":
        numWomen += 1
print("CHAPTER 2 EXERCISE 4")
print("NUM PEOPLE " + str(numPeople) + " NUM MEN " + str(numMen) +
      " NUM WOMEN " + str(numWomen))
print()

# EXERCISE 5
# Holonyms are words that are the larger group under which a word falls.
# Meronyms are the smaller words included.
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> # problem 1
>>> from nltk.corpus import state_union
>>> cfd = nltk.ConditionalFreqDist((text, word)
...                                for text in state_union.fileids()
...                                for word in state_union.words(fileids=text))
>>> text = state_union.fileids()
>>> contexts = ['men', 'women', 'people']
>>> cfd.tabulate(conditions=text, samples=contexts)
                      men  women  people
    1945-Truman.txt     2      2      10
    1946-Truman.txt    12      7      49
    1947-Truman.txt     7      2      12
    1948-Truman.txt     4      1      22
    1949-Truman.txt     2      1      15
    1950-Truman.txt     6      2      15
    1951-Truman.txt     8      2       9
1953-Eisenhower.txt     3      0      17
1954-Eisenhower.txt     2      0      15
1955-Eisenhower.txt     4      0      26
# word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])
# words
len(set(w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()))

# 3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')

# 4
from nltk.corpus import state_union
state_union.fileids()
words = ['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist((word, fileid)
                          for fileid in state_union.fileids()
                          for word in state_union.words(fileid))
cfd.plot(conditions=words)

# 5
word = 'life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word):
    for mer in syn.part_meronyms():
        print("Synset '{2}':\n\t{0}\n\npart meronym '{1}':\n\t{3} ".format(
            syn.definition(), mer.lemma_names()[0], syn.lemma_names()[0], mer.definition()))
    for mer in syn.member_meronyms():
        print("Synset '{2}':\n\t{0}\n\nmember meronym '{1}':\n\t{3} ".format(
            syn.definition(), mer.lemma_names()[0], syn.lemma_names()[0], mer.definition()))
    for mer in syn.substance_meronyms():
        print("Synset '{2}':\n\t{0}\n\nsubstance meronym '{1}':\n\t{3} ".format(
            syn.definition(),
import nltk
from nltk.corpus import state_union

# Word frequency count
cfd = nltk.FreqDist(state_union.words())
cfd.plot(50)

# Print the 200 most common words as LaTeX table rows
j = 1
for word, count in cfd.most_common(200):
    print(str(j) + " & " + str(word) + " & " + str(count) + "\\\\")
    j += 1
def get_vocab_size(filename):
    words = state_union.words(filename)
    vocab = set(words)
    return len(vocab)
def get_speech_length(filename):
    words = state_union.words(filename)
    return len(words)
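# A possible use of the two helpers above (a sketch, assuming both are in
# scope alongside state_union): lexical diversity, i.e. the type-token ratio,
# for each State of the Union address.
for filename in state_union.fileids():
    ttr = get_vocab_size(filename) / get_speech_length(filename)
    print('%s: %.3f' % (filename, ttr))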
reviews_text = brown.words(categories='reviews')
# OR brown.words(categories=['news', 'reviews'])

from nltk.corpus import webtext
# webtext.fileids()
webtext.words('singles.txt')
webtext.words('overheard.txt')

# 4. ☼ Read in the texts of the State of the Union addresses, using the
# state_union corpus reader. Count occurrences of men, women, and people in
# each document. What has happened to the usage of these words over time?
from nltk.corpus import state_union
search_terms = ['men', 'women', 'people']
for fileid in state_union.fileids():
    # fdist = nltk.FreqDist(word for target in search_terms for word in state_union.words(fileid) if word.lower().startswith(target))
    fdist = nltk.FreqDist(state_union.words(fileid))
    for term in search_terms:
        print(fileid, term, fdist[term])

# over time
# years = [fileid[:4] for fileid in state_union.fileids()]
'''
[
    (genre, fileid[:4])
    for genre in ['men', 'women', 'people']
    for fileid in state_union.fileids()
    for word in state_union.words(fileid)
    if word == genre
]
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
def question1():
    a = nltk.ConditionalFreqDist(
        (x, id[:4])
        for id in state_union.fileids()
        for w in state_union.words(id)
        for x in ['men', 'women', 'people']
        if w.lower().startswith(x))
    a.plot()
def state_union_ts(word_list):
    cfd = nltk.ConditionalFreqDist(
        (word.lower(), fileid[:4])
        for fileid in state_union.fileids()
        for word in state_union.words(fileid)
        if word.lower() in word_list)
    return cfd
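# Example usage (a sketch): build the time series for the three exercise
# words and plot one line per word, with years on the x-axis.
cfd = state_union_ts(['men', 'women', 'people'])
cfd.plot()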
import nltk
from nltk.corpus import state_union

test = [fid for fid in state_union.fileids() if 'Johnson' in fid]
train = [fid for fid in state_union.fileids() if fid not in test]

print('TEST:', ', '.join(test))
f = open('sou.test.txt', 'w')
for w in state_union.words(test):
    print(w, file=f)
f.close()
f = open('sou.norm.test.txt', 'w')
for s in state_union.sents(test):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
    print(s, file=f)
f.close()

print('TRAIN:', ', '.join(train))
f = open('sou.train.txt', 'w')
for w in state_union.words(train):
    print(w, file=f)
f.close()
f = open('sou.norm.train.txt', 'w')
for s in state_union.sents(train):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
print('len(set([w.lower() for w in gutenberg.words(\'austen-persuasion.txt\')])) = ' +
      str(len(set([w.lower() for w in gutenberg.words('austen-persuasion.txt')]))) + '\n\n')

print('3. brown.words(categories=\'news\') = ' + str(brown.words(categories='news')) + '\n')
print('brown.words(categories=\'reviews\') = ' + str(brown.words(categories='reviews')) + '\n\n')

print('4.\nfor fileid in state_union.fileids():\n    ' +
      'print str(state_union.words(fileid).count(\'men\')) + \' \' + ' +
      'str(state_union.words(fileid).count(\'women\')) + \' \' + ' +
      'str(state_union.words(fileid).count(\'people\'))')
print('FileID\t\t\tMen\tWomen\tPeople')
print('===============================================')
for fileid in state_union.fileids():
    print(str(fileid) + ('\t' * (1 + int(2 - (1 / 15) * len(fileid)))) +
          str(state_union.words(fileid).count('men')) + '\t' +
          str(state_union.words(fileid).count('women')) + '\t' +
          str(state_union.words(fileid).count('people')))
print('\n\n')

print('5.')
for word in [wn.synset('aircraft.n.01'), wn.synset('zebra.n.01')]:
    print(word)
    print('Member meronyms: ' + str(word.member_meronyms()))
    print('Part meronyms: ' + str(word.part_meronyms()))
    print('Substance meronyms: ' + str(word.substance_meronyms()))
    print('Member holonyms: ' + str(word.member_holonyms()))
    print('Part holonyms: ' + str(word.part_holonyms()))
    print('Substance holonyms: ' + str(word.substance_holonyms()))
    print('\n')
print('\n\n')
linux_set = set(linux_words)

for cat in brown_cats:
    words = brown.words(categories=cat)
    tokens = [w.lower() for w in words]
    all_toks_brown = all_toks_brown + tokens
    complete_toks = complete_toks + tokens
for cat in reuters_cats:
    words = reuters.words(categories=cat)
    tokens = [w.lower() for w in words]
    all_toks_reuters = all_toks_reuters + tokens
    complete_toks = complete_toks + tokens
for cat in state_union_cats:
    words = state_union.words(cat)
    tokens = [w.lower() for w in words]
    all_toks_state_union = all_toks_state_union + tokens
    complete_toks = complete_toks + tokens
for word in linux_words:
    complete_toks.append(word)

# list_brown = list()
# for word in all_toks_brown:
#     word_length = len(word)
#     list_brown = list_brown + word_length

cnt_brown = Counter()
cnt_reuters = Counter()
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk

'''
☼ Read in the texts of the State of the Union addresses, using the
state_union corpus reader. Count occurrences of men, women, and people in
each document. What has happened to the usage of these words over time?
'''
from nltk.corpus import state_union

# print(state_union.fileids())
targets = ['men', 'women', 'people']
pair = [(target, fileid[:4])
        for fileid in state_union.fileids()
        for word in state_union.words(fileid)
        for target in targets
        if word.lower() == target]
print(pair)
cfd = nltk.ConditionalFreqDist(pair)
cfd.plot()
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)
inverse_vocabulary = [(value, key) for key, value in president_vocabulary.items()]
print(max(inverse_vocabulary)[1], max(inverse_vocabulary)[0])  # richest vocabulary for Harrison in 1841
print(min(inverse_vocabulary)[1], min(inverse_vocabulary)[0])  # poorest vocabulary for Washington in 1793

president_vocabulary_state_union = {}
for president in state_union.fileids():
    vocab = Vocabulary(state_union.words(president), unk_cutoff=2)
    president_vocabulary_state_union[president] = len(vocab)
inverse_vocabulary_state_union = [
    (value, key) for key, value in president_vocabulary_state_union.items()
]
print(max(inverse_vocabulary_state_union)[1],
      max(inverse_vocabulary_state_union)[0])  # richest vocabulary for Truman in 1946
print(min(inverse_vocabulary_state_union)[1],
      min(inverse_vocabulary_state_union)[0])  # poorest vocabulary for Johnson in 1963

# Exercise 2
# Number 1 (2.4) in HW3
print('################ Number 1 ################')

# Generating a list for each of the words through time
from nltk.corpus import state_union as su

total = []
men = []
women = []
people = []
for s in su.fileids():
    length_women = 0
    length_men = 0
    length_people = 0
    length = 0
    for w in su.words(s):
        if w.lower() == 'women':
            length_women += 1
            length += 1
        elif w.lower() == 'men':
            length_men += 1
            length += 1
        elif w.lower() == 'people':
            length_people += 1
            length += 1
    total.append(length)
    women.append(length_women)
    men.append(length_men)
    people.append(length_people)
length_women = 0
length_men = 0
print(i,": ", len([w for w in words if len(w) == i]) / len(words)) alphalowerfreq = FreqDist(wordsloweralpha) ourwords = ["shooting", "elementary", "school", "dead", "victim", "gunman", "connecticut", "sandy", "injured", "lanza", "tragedy", "grade", "children", "firearm", "weapon", "morning", "december", "teacher", "police", "motive"]; for i in range(0, len(ourwords)): print(str(ourwords[i])+ ": "+ str(alphalowerfreq[ourwords[i]] / len(wordsloweralpha))) from nltk.corpus import brown from nltk.corpus import reuters from nltk.corpus import state_union from nltk.corpus import words setnames = ["baseline", "Class Event", "Connecticut School Shooting"] baselineNOSET = brown.words() + reuters.words() + state_union.words() + words.words(); baseline = set(baselineNOSET) #nltk.Text(corpuses) sets = {"baseline":baseline , "Class Event":set(classwordsloweralpha), "Connecticut School Shooting":set(wordsloweralpha)} setlens = {"baseline":sum(map(lambda x: len(x), baseline)) , "Class Event":sum(map(lambda x: len(x), set(classwordsloweralpha))), "Connecticut School Shooting":sum(map(lambda x: len(x), set(wordsloweralpha)))} cfd = nltk.ConditionalFreqDist((s, len(word)) for s in setnames for word in sets[s]) cpd = nltk.ConditionalProbDist( (s, len(word)) for s in setnames for word in sets[s]) cfd.plot(cumulative=True) baselinefreq = FreqDist([w.lower() for w in baselineNOSET if w.isalpha()])
# Read texts from the State of the Union addresses using the state_union
# module and determine the frequency of use of the words "men", "women",
# "people" in each document.
import nltk
from nltk.corpus import state_union

state_files = state_union.fileids()
words = ['men', 'women', 'people']
cfd = nltk.ConditionalFreqDist(
    (text, word)
    for text in state_files
    for word in state_union.words(text))
cfd.tabulate(conditions=state_files, samples=words)

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_files
    for word in state_union.words(fileid)
    for target in words
    if word.lower().startswith(target))
cfd.plot()

# Analyze the frequency chart of modal verbs for different genres and find
# other word classes whose usage also differs across genres.
import nltk
import nltk.corpus

corpus_name = nltk.corpus.brown
files = corpus_name.fileids()
modals = ['can', 'could', 'may', 'might', 'must', 'will']
commons = ['the', 'be', 'to', 'of', 'and', 'in', 'that']
adjectives = ['good', 'new', 'first', 'last', 'long']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
cfd = nltk.ConditionalFreqDist((genre, word)
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()

cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))

# In[47]:

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())

# In[55]:

# 5. Investigate the holonym-meronym relations for some nouns. Remember that
# there are three kinds of holonym-meronym relation, so you need to use:
# member_meronyms(), part_meronyms(), substance_meronyms(), member_holonyms(),
# part_holonyms(), and substance_holonyms().
wordnet.synset('book.n.01').part_holonyms()
wordnet.synset('book.n.01').substance_holonyms()
wordnet.synset('book.n.01').member_holonyms()
import pandas as pd
import sys
print(sys.version)

import nltk
help(nltk.download)
nltk.download('all-corpora')

from nltk.corpus import state_union
state_union.words()
len(state_union.words())
sentences = state_union.sents()
print(sentences)

state_union_text = nltk.Text(state_union.words())
print(state_union_text.count("war"))
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist
fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result
# Read in the texts of the State of the Union addresses, using the state_union
# corpus reader. Count occurrences of men, women and people in each document.
# What has happened to the usages of these words over time?
import nltk
from nltk.corpus import state_union

fileids = state_union.fileids()
print(fileids)
print(fileids[1][:4])
print(state_union.words(fileids[1])[:100])

cfd = nltk.ConditionalFreqDist(
    (gender, fileid[:4])
    for fileid in state_union.fileids()
    for word in state_union.words(fileid)
    for gender in ['men', 'women', 'people']
    if word.lower().startswith(gender))
cfd.plot()
# 2.
persuasion = nltk.corpus.gutenberg.words('austen-persuasion.txt')
len(persuasion)
len(set(persuasion))

# 3.
from nltk.corpus import brown
brown.fileids()
brown.categories()
brown.words(categories='adventure')

# 4.
from nltk.corpus import state_union
text = state_union.words()
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women']
    if w.lower().startswith(target))
cfd.plot()

# 5.
wn.synset('fish.n.01').part_meronyms()
wn.synset('fish.n.01').member_meronyms()
wn.synset('leaf.n.01').substance_meronyms()
wn.synset('fish.n.01').member_holonyms()
wn.synset('leaf.n.01').substance_holonyms()
from nltk.corpus import state_union
import nltk

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ["people", "men", "women"]
    if w.lower().startswith(target))
cfd.plot()
# Can't really depict a noticeable evolution.
from nltk.corpus import state_union
import nltk

# Remove punctuation and stopwords (the stopword set is precomputed so the
# membership test is O(1) per word)
stop_words = set(nltk.corpus.stopwords.words('english'))
all_words = (w.lower() for w in state_union.words()
             if w.isalpha() and w.lower() not in stop_words)

# Word frequency count
cfd = nltk.FreqDist(all_words)
# Calculate the 50 most common words
mostcommon = cfd.most_common(50)
# Plot the 50 most common words and print them
cfd.plot(50)
print(mostcommon)
# 4
import nltk
from nltk.corpus import state_union

for speech in state_union.fileids():
    words = state_union.words(fileids=[speech])
    fdist = nltk.FreqDist(w.lower() for w in words)
    print(speech)
    print("she: ", fdist["she"])
    print("he: ", fdist["he"])
    print("people: ", fdist["people"])
        'u': 6,
        'v': 6,
        'w': 800,
        'x': 60,
        'y': 10,
        'z': 7
    }
    return sum(letter_vals[l.lower()] for l in word
               if len(re.findall(r'[a-z]', l.lower())) > 0)

print('(a)\ngematria for \'forbidden\':', gematria('forbidden'), '\n\n')

# (b)
words = state_union.words()
# print('\n(b)\nNumber of words with gematria score 666: ' + str(len([w for w in words if (gematria(w)==666)])))
# w666 = [(fileid,w) for fileid in state_union.fileids() for w in state_union.words(fileid) if gematria(w)==666]
print('(b)')
for fileid in state_union.fileids():
    w666 = [w.lower() for w in state_union.words(fileid) if gematria(w) == 666]
    print(fileid, ': ', len(w666), w666)
print()

# (c)
def decode(text):
    tokens = [t.lower() for t in text.split()]
def findNbest(finder, n):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    resultList = []
    for col in finder.nbest(trigram_measures.pmi, n * 3):
        if len(resultList) >= n:
            break
        if maxOneName(col):
            resultList.append(col)
    return resultList


def performTrigram(wordSet, windowSize, printTime=False):
    start = time.time()
    finder = TrigramCollocationFinder.from_words(wordSet, window_size=windowSize)
    end = time.time()
    if printTime:
        print(end - start)
    return finder


def printResults(results):
    for (a, b, c) in results:
        print("\\item " + a + " " + b + " " + c)


# ADD 1.3 code here!!!!
printResults(findNbest(performTrigram(state_union.words(), 3, True), 10))
printResults(findNbest(performTrigram(state_union.words(), 5, True), 10))
printResults(findNbest(performTrigram(state_union.words(), 10, True), 10))
classevent_words = classevent_wordlists.words()
classevent_sents = classevent_wordlists.sents()
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]

small_words = wordlists.words()
small_words = [w.lower() for w in small_words if w.isalnum()]
small_words = nltk.Text(small_words)
small_words_lem = [lemmer.lemmatize(w) for w in small_words]

yourwords = ['earthquake', 'seismic', 'aftershocks', 'quake', 'damage',
             'magnitude', 'tremor', 'richter', 'epicenter', 'depth', 'fault',
             'hypocenter', 'focus', 'dead', 'casualties', 'structural',
             'seismometer', 'temblor', 'hazard', 'impact']
yourwords_lem = [lemmer.lemmatize(w.lower()) for w in yourwords]

baseline_words = brown.words() + state_union.words() + reuters.words() + words.words()
baseline_words_lem = [lemmer.lemmatize(w) for w in baseline_words]


def non_cumulative_word_length_distribution(words):
    freq_dist_pairs = FreqDist([len(w) for w in words]).items()
    return_dist = {}
    for pair in freq_dist_pairs:
        return_dist[pair[0]] = pair[1]
    return return_dist


def cumulative_word_length_distribution(words):
    non_cumulative_dist = non_cumulative_word_length_distribution(words)
    print(non_cumulative_dist)
    cumulative_dist = {}
    for i in non_cumulative_dist:
        sum = 0
        for j in range(i):
import nltk
from nltk.corpus import state_union
from nltk.stem.porter import PorterStemmer

# Stemmer declaration
stemmer = PorterStemmer()

# Retrieve all non-stopwords and stem them (stopword set precomputed so the
# membership test is O(1) per word)
stop_words = set(nltk.corpus.stopwords.words('english'))
all_stems = (stemmer.stem(w.lower()) for w in state_union.words()
             if w.isalpha() and w.lower() not in stop_words)

# Set of all unique stems in the collection
dist_stems = set(all_stems)
import nltk
from nltk.corpus import state_union
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Declare lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Get all non-stopwords and lemmatize them
stop_words = set(nltk.corpus.stopwords.words('english'))
all_lemmas = (lemmatizer.lemmatize(w.lower()) for w in state_union.words()
              if w.isalpha() and w.lower() not in stop_words)

# Create empty list for stemmed lemmas
stemmed_lemmas = []
# Populate list
for word in all_lemmas:
    stemmed_lemmas.append(stemmer.stem(word))
import nltk
from nltk.corpus import state_union

# Plot usage of words over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target))
cfd.plot()
# def __new__(cls, *args, **kwargs):
#     # http://stackoverflow.com/questions/42558/python-and-the-singleton-pattern
#     if not cls.__instance:
#         cls.__instance = super(Decoder, cls).__new__(cls, *args, **kwargs)
#     return cls.__instance

def __init__(self):
    if not Decoder.__index:
        Decoder.__index = nltk.Index([])
    Decoder.__index


if __name__ == "__main__":
    from nltk.corpus import state_union
    print("Occurrences of the sign of the Devil in SotU addresses past:")
    for file in state_union.fileids():
        print(file[:4], sum(1 for word in state_union.words(file)
                            if gematria(word) == 666))
    decoded = Decoder().decode(state_union.words())
    print(decoded[:200])