from nltk.corpus import state_union


def chapter2_exercise4():
    # Read in the texts of the State of the Union addresses, using the
    # state_union corpus reader. Count occurrences of men, women, and people
    # in each document. What has happened to the usage of these words over time?
    files = state_union.fileids()
    men = dict()
    women = dict()
    people = dict()
    for index, file in enumerate(files):
        words = state_union.words(fileids=[file])
        men[file] = words.count("men")
        women[file] = words.count("women")
        people[file] = words.count("people")
        print(file[:4], men[file], women[file], people[file], end=" ")
        if index % 6 == 5:
            print()
    print("\nMEN")
    for file, men_c in men.items():
        print(file[:4], men_c)
    print("\nWOMEN")
    for file, women_c in women.items():
        print(file[:4], women_c)
    print("\nPEOPLE")
    for file, people_c in people.items():
        print(file[:4], people_c)
    print("men:", sum(men.values()))
    print("women:", sum(women.values()))
    print("people:", sum(people.values()))
import nltk
from nltk.corpus import state_union


def state_union_men_stat():
    cfd = nltk.ConditionalFreqDist(
        (target, year[:4])
        for year in state_union.fileids()
        for w in state_union.words(year)
        for target in ['men', 'women', 'people']
        if w.lower().startswith(target))
    cfd.plot()
from nltk.corpus import state_union


def all_documents():
    documents = []
    for document in state_union.fileids():
        text = ""
        for word in state_union.words(document):
            text = text + " " + word
        documents.append((text, extract_president(document)))
    return documents
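# extract_president() is not defined in this snippet; a minimal sketch,
# assuming fileids shaped like "1945-Truman.txt" or "2001-GWBush-2.txt",
# where the president is the first purely alphabetic dash-separated part:
def extract_president(fileid):
    for part in fileid.split('.')[0].split('-'):
        if part.isalpha():
            return part
    return ""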
from nltk.corpus import state_union


def init():
    train = []
    test = []
    filenames = state_union.fileids()
    for i in range(len(filenames)):
        if i % 2 == 0:
            train.append(filenames[i])
        else:
            test.append(filenames[i])
    return (train, test)
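# The same alternating train/test split, written with slice steps; an
# equivalent sketch of init() above, not part of the original:
def init_sliced():
    filenames = state_union.fileids()
    return (filenames[0::2], filenames[1::2])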
import spacy
from nltk.corpus import state_union


def main():
    # the 'en' shortcut works on older spaCy releases; newer ones expect a
    # full model name such as 'en_core_web_sm'
    nlp = spacy.load('en')
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    result_dictionary = bigram_text(text, nlp)
    i = 0
    for occurrences, bigram in result_dictionary.items():
        print(bigram, occurrences)
        i = i + 1
        if i > 100:
            break
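# bigram_text() is defined elsewhere in that project; a minimal sketch of a
# comparable helper, assuming it maps each adjacent token pair to its count
# (the name and the bigram -> count orientation are assumptions; the loop
# above unpacks items() the other way around):
from collections import Counter


def bigram_text_sketch(text, nlp):
    # nlp.max_length may need raising for the full concatenated corpus
    tokens = [t.text.lower() for t in nlp(text) if not t.is_space]
    return Counter(zip(tokens, tokens[1:]))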
import nltk


def ex4():
    from nltk.corpus import state_union
    tags = ["men", "women", "people"]
    # for fileid in state_union.fileids():
    #     words = state_union.words(fileid)
    #     fdist = nltk.FreqDist([w.lower() for w in words])
    #     print fileid + ": ",
    #     for tag in tags:
    #         print tag + "=" + str(fdist[tag]) + " ",
    #     print
    cfd = nltk.ConditionalFreqDist((target, fileid[0:4])
                                   for fileid in state_union.fileids()
                                   for w in state_union.words(fileid)
                                   for target in tags
                                   if w.lower() == target)
    cfd.plot()
import glob
from nltk.corpus import brown, reuters, state_union
from nltk.tokenize import word_tokenize

path = dir_path + "/*.txt"
list_txt = glob.glob(path)
all_toks_china = list()
for txt in list_txt:
    file_y = open(txt).read()
    tokens = word_tokenize(file_y)
    all_toks_china = all_toks_china + tokens

brown_cats = brown.categories()
all_toks_brown = list()
reuters_cats = reuters.categories()
all_toks_reuters = list()
state_union_cats = state_union.fileids()
all_toks_state_union = list()
complete_toks = list()

linux_words = open("../ref/words").read().split('\n')
linux_set = set(linux_words)

for cat in brown_cats:
    words = brown.words(categories=cat)
    tokens = [w.lower() for w in words]
    all_toks_brown = all_toks_brown + tokens
    complete_toks = complete_toks + tokens
for cat in reuters_cats:
    words = reuters.words(categories=cat)
    return entity_names  # end of extract_entity_names(), whose body is truncated above


def extract_entities(taggedText):
    '''
    Create map with entity and their counts
    :param taggedText: Parsed text (output of ne chunker) in tree form
    :return: dict of entities and their freq counts
    '''
    entity_names = []
    for tree in taggedText:
        entity_names.extend(extract_entity_names(tree))
    return entity_names


# get year and words for each file
extracted = [(state_union.raw(fileid), int(fileid[:4]))
             for fileid in state_union.fileids()]
docs, years = zip(*extracted)

# break text down into sentences, tokens
tokens = [nltk.word_tokenize(text) for text in docs]
sents = [nltk.sent_tokenize(text.replace("\n", " ")) for text in docs]
senttokens = [[nltk.word_tokenize(sent) for sent in entry] for entry in sents]

# get counts of unique words and plot over time
unique = [len(set(words)) for words in tokens]
plt.scatter(years, unique)
plt.show()

# get unique/total ratio
ratios = [float(len(set(words))) / float(len(words)) for words in tokens]
plt.scatter(years, ratios)
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> # problem 1
>>> from nltk.corpus import state_union
>>> cfd = nltk.ConditionalFreqDist((text, word)
...                                for text in state_union.fileids()
...                                for word in state_union.words(fileids=text))
>>> text = state_union.fileids()
>>> contexts = ['men', 'women', 'people']
>>> cfd.tabulate(conditions=text, samples=contexts)
                    men women people
    1945-Truman.txt   2     2     10
    1946-Truman.txt  12     7     49
    1947-Truman.txt   7     2     12
    1948-Truman.txt   4     1     22
    1949-Truman.txt   2     1     15
    1950-Truman.txt   6     2     15
    1951-Truman.txt   8     2      9
1953-Eisenhower.txt   3     0     17
1954-Eisenhower.txt   2     0     15
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 11:30:48 2018

@author: vpapg
"""

# Read in the texts of the State of the Union addresses, using the state_union
# corpus reader. Count occurrences of men, women, and people in each document.
# What has happened to the usage of these words over time?

from nltk import ConditionalFreqDist, Text
from nltk.corpus import state_union

text = state_union.words()
print("Men:", text.count("men"))
print("Women:", text.count("women"))
Text(text).dispersion_plot(["men", "women"])

cfd = ConditionalFreqDist((target, fileid)
                          for fileid in state_union.fileids()
                          for w in state_union.words(fileid)
                          for target in ['men', 'women']
                          if w.lower().startswith(target))
cfd.plot()

# The word 'women' appears more in recent documents, so its usage increases
# over time
# Importing NLTK and downloading: tokenizer, tagger, stopwords, corpus
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, state_union
from nltk.tokenize import word_tokenize
from nltk.collocations import TrigramCollocationFinder
from nltk import pos_tag

nltk.download('state_union')
nltk.download('stopwords')
nltk.download('tagsets')

# make corpusList ready
corpusList = []
for i in range(len(state_union.fileids())):
    corpusList.append(state_union.raw(state_union.fileids()[i]))

# concatenate all raw texts within corpusList
allTexts = " ".join(corpusList)

# get English stop words
stop_words = set(stopwords.words('english'))

# tokenize
tokens = word_tokenize(allTexts)

# tag tokens
tagged = pos_tag(tokens)

# convert tagged tuple into dataframe for the ease of manipulation
import sys
from nltk.corpus import gutenberg, state_union
from sklearn.feature_extraction.text import TfidfVectorizer

print 'Number of arguments:', len(sys.argv), 'arguments.'
print 'Argument List:', str(sys.argv)

phrase = sys.argv[1]
corpora = sys.argv[2]
corpus = []

# Check corpus
if corpora == "gutenberg":
    titles = gutenberg.fileids()
    for title in titles:
        corpus.append(gutenberg.raw(title))
elif corpora == "state_union":
    titles = state_union.fileids()
    for title in titles:
        corpus.append(state_union.raw(title))
else:
    print "Choose from gutenberg or state_union"
    exit(0)

vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(corpus)
XA = X.toarray()
# print vectorizer.vocabulary_
print 'The dimensions of the TF.IDF matrix: '
print XA.shape
print 'TF.IDF computation for the ' + corpora + ' corpus is completed\n'
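# A short follow-on sketch: list the highest-weighted terms of the first
# document from the fitted vectorizer. get_feature_names() is the older
# scikit-learn spelling; newer releases use get_feature_names_out().
import numpy as np
feature_names = vectorizer.get_feature_names()
top = np.argsort(XA[0])[-10:][::-1]
for idx in top:
    print feature_names[idx], XA[0][idx]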
from nltk.corpus import state_union


def get_text():
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    return text
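# Example use of get_text() (a sketch): the concatenated corpus is one long
# string, suitable for feeding a tokenizer in a single pass.
text = get_text()
print(len(text), "characters across", len(state_union.fileids()), "files")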
import nltk
from nltk.corpus import state_union

test = [fid for fid in state_union.fileids() if 'Johnson' in fid]
train = [fid for fid in state_union.fileids() if fid not in test]

print 'TEST:', ', '.join(test)
f = open('sou.test.txt', 'w')
for w in state_union.words(test):
    print>>f, w
f.close()

f = open('sou.norm.test.txt', 'w')
for s in state_union.sents(test):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
    print>>f, s
f.close()

print 'TRAIN:', ', '.join(train)
f = open('sou.train.txt', 'w')
for w in state_union.words(train):
    print>>f, w
f.close()

f = open('sou.norm.train.txt', 'w')
for s in state_union.sents(train):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
23. RP   Particle
24. SYM  Symbol
25. TO   to
26. UH   Interjection
27. VB   Verb, base form
28. VBD  Verb, past tense
29. VBG  Verb, gerund or present participle
30. VBN  Verb, past participle
31. VBP  Verb, non-3rd person singular present
32. VBZ  Verb, 3rd person singular present
33. WDT  Wh-determiner
34. WP   Wh-pronoun
35. WP$  Possessive wh-pronoun
36. WRB  Wh-adverb
'''

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize

state_union.fileids()
text = state_union.raw('2006-GWBush.txt')
train_text = state_union.raw('2005-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized1 = custom_sent_tokenizer.tokenize(text)
tagged1 = []
tokenized2 = sent_tokenize(text)
tagged2 = []

for sent in tokenized1:
    words = word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    tagged1.append(tagged)
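# A quick comparison sketch (not in the original): the Punkt model trained on
# the 2005 speech and the default sentence tokenizer can split the same text
# differently.
print(len(tokenized1), "sentences from the trained tokenizer")
print(len(tokenized2), "sentences from sent_tokenize")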
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk
'''
☼ Read in the texts of the State of the Union addresses, using the
state_union corpus reader. Count occurrences of men, women, and people in
each document. What has happened to the usage of these words over time?
'''
from nltk.corpus import state_union

# print state_union.fileids()
targets = ['men', 'women', 'people']
pair = [(target, fileid[:4])
        for fileid in state_union.fileids()
        for word in state_union.words(fileid)
        for target in targets
        if word.lower() == target]
print pair
cfd = nltk.ConditionalFreqDist(pair)
cfd.plot()
# Assignment: 03
# Due Date: January 31st, 2018

import nltk

# Number 1 (2.4) in HW3
print('################ Number 1 ################')

# Generating list for each of the words through time
from nltk.corpus import state_union as su

total = []
men = []
women = []
people = []
for s in su.fileids():
    length_women = 0
    length_men = 0
    length_people = 0
    length = 0
    for w in su.words(s):
        if w.lower() == 'women':
            length_women += 1
            length += 1
        elif w.lower() == 'men':
            length_men += 1
            length += 1
        elif w.lower() == 'people':
            length_people += 1
            length += 1
    total.append(length)
from nltk.corpus import inaugural, state_union
from nltk.lm import Vocabulary

president_vocabulary = {}
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key) for key, value in president_vocabulary.items()]
print(max(inverse_vocabulary)[1],
      max(inverse_vocabulary)[0])  # richest vocabulary: Harrison in 1841
print(min(inverse_vocabulary)[1],
      min(inverse_vocabulary)[0])  # poorest vocabulary: Washington in 1793

president_vocabulary_state_union = {}
for president in state_union.fileids():
    vocab = Vocabulary(state_union.words(president), unk_cutoff=2)
    president_vocabulary_state_union[president] = len(vocab)

inverse_vocabulary_state_union = [
    (value, key) for key, value in president_vocabulary_state_union.items()
]
print(max(inverse_vocabulary_state_union)[1],
      max(inverse_vocabulary_state_union)[0])  # richest vocabulary: Truman in 1946
print(min(inverse_vocabulary_state_union)[1],
      min(inverse_vocabulary_state_union)[0])  # poorest vocabulary: Johnson in 1963
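# A minimal sketch of what unk_cutoff=2 does above: tokens seen fewer than
# two times fall below the cutoff, so len() counts only the words at or above
# it plus the <UNK> label.
v = Vocabulary(["a", "a", "b"], unk_cutoff=2)
print(len(v))  # 2: "a" and "<UNK>" ("b" is below the cutoff)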
import nltk
from nltk.corpus import state_union


def state_union_ts(word_list):
    cfd = nltk.ConditionalFreqDist((word.lower(), fileid[:4])
                                   for fileid in state_union.fileids()
                                   for word in state_union.words(fileid)
                                   if word.lower() in word_list)
    return cfd
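# Example use (a sketch): the returned CFD has one condition per word, keyed
# by year, so it can be tabulated or plotted directly.
cfd = state_union_ts(['men', 'women', 'people'])
cfd.plot()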
from nltk.corpus import gutenberg
gutenberg.fileids()
gutenberg.words('austen-emma.txt')  # word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])  # words
len(list(set([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])))

# 3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')

# 4
from nltk.corpus import state_union
state_union.fileids()
words = ['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist([(word, fileid)
                           for fileid in state_union.fileids()
                           for word in state_union.words(fileid)])
cfd.plot(conditions=words)

# 5
word = 'life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word):
    for mer in syn.part_meronyms():
        print("Synset '{2}':\n\t{0}\n\npart meronym '{1}':\n\t{3} ".format(
            syn.definition(), mer.lemma_names()[0], syn.lemma_names()[0],
            mer.definition()))
    for mer in syn.member_meronyms():
        print("Synset '{2}':\n\t{0}\n\nmember meronym '{1}':\n\t{3} ".format(
            syn.definition(),
import nltk
from nltk.corpus import state_union


def question1():
    a = nltk.ConditionalFreqDist((x, id[:4])
                                 for id in state_union.fileids()
                                 for w in state_union.words(id)
                                 for x in ['men', 'women', 'people']
                                 if w.lower().startswith(x))
    a.plot()
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize

# pres_avg_length = {}


def getPresFromSpeech(speech_id):
    # e.g. 2001-GWBush-1.txt
    words = speech_id.split('.')
    if len(words) > 0:
        single_words = words[0].split('-')
        if len(single_words) > 0:
            for word in single_words:
                if word.isalpha():
                    return word
    return ""


all_words = {}
for speech_id in state_union.fileids():
    text = state_union.raw(speech_id)
    words = word_tokenize(text)
    for word in words:
        if word not in all_words:
            all_words[word] = 1
        else:
            all_words[word] += 1

sent_len = []
word_len = []
pres_list = []
pres_sent_total = {}
pres_word_total = {}
pres_char_total = {}
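# Example (a sketch): the helper keeps only the alphabetic dash-separated
# part of the fileid.
print(getPresFromSpeech("2001-GWBush-1.txt"))  # -> GWBush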
# 4
import nltk
from nltk.corpus import state_union

for speech in state_union.fileids():
    words = state_union.words(fileids=[speech])
    fdist = nltk.FreqDist(w.lower() for w in words)
    print(speech)
    print("she: ", fdist["she"])
    print("he: ", fdist["he"])
    print("people: ", fdist["people"])
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()


cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))


# In[47]:

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())


# In[55]:

# 5. Investigate the holonym-meronym relations for some nouns. Remember that
# there are three kinds of holonym-meronym relation, so you need to use:
# member_meronyms(), part_meronyms(), substance_meronyms(),
# member_holonyms(), part_holonyms(), and substance_holonyms().

wordnet.synset('book.n.01').part_holonyms()
wordnet.synset('book.n.01').substance_holonyms()
wordnet.synset('book.n.01').member_holonyms()
print(tempPhrase[-4:])
print(sorted(w.lower() for w in set(tempPhrase)))  # a plain sort puts capital letters first

# 2 Use the corpus module to explore austen-persuasion.txt. How many word
# tokens does this book have? How many word types?
austen_persuasion = gutenberg.words('austen-persuasion.txt')
print("Number of word tokens = ", len(austen_persuasion))
print("Number of word types = ", len(set(austen_persuasion)))

# 3 Use the Brown corpus reader nltk.corpus.brown.words() or the Web text
# corpus reader nltk.corpus.webtext.words() to access some sample text in
# two different genres.
print(brown.categories())
news_data = brown.words(categories='news')
religion_data = brown.words(categories='religion')

# 4 Read in the texts of the State of the Union addresses, using the
# state_union corpus reader. Count occurrences of men, women, and people in
# each document. What has happened to the usage of these words over time?
print(state_union.fileids())
# cfd for State of the Union speeches showing the count of the target words
# in each year's speech
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_union.fileids()
                               for w in state_union.words(fileid)
                               for target in ['men', 'women']
                               if w.lower().startswith(target))
# cfd.plot()

# 5 Investigate the holonym-meronym relations for some nouns. Remember that
# there are three kinds of holonym-meronym relation, so you need to use:
# member_meronyms(), part_meronyms(), substance_meronyms(),
# member_holonyms(), part_holonyms(), and substance_holonyms().
house = wn.synsets('house')
print(house)
house = wn.synset('house.n.01')
print(house.lemma_names())
print(house.definition())
print(house.examples())
for word in words:
    if word[:2] == "sh":
        print(word, end=" ")
print("\n")

# b
print("Words longer than 4 characters:")
for word in words:
    if len(word) > 4:
        print(word, end=" ")
print("\n")

# Exercise 2
# a
files = list(state_union.fileids())
terms = ["men", "women", "people"]
statistics = nltk.ConditionalFreqDist((file, word)
                                      for file in state_union.fileids()
                                      for word in state_union.words(file)
                                      for term in terms
                                      if word.lower() == term)
statistics.tabulate(conditions=files, samples=terms)

# b
years_raw = sorted(set(int(year[:4]) for year in state_union.fileids()))
years = [str(year) for year in years_raw]
year_statistics = nltk.ConditionalFreqDist(
    (word.lower(), fileid[:4])
    for fileid in state_union.fileids()
    for word in state_union.words(fileid)
    for term in terms
import nltk
from nltk.corpus import state_union

# Plot usage of words over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target))
cfd.plot()
# In[1]:

# NLTK imports
import nltk
from nltk.corpus import webtext
from nltk.corpus import state_union
import numpy as np

nltk.download('state_union')
nltk.download('stopwords')
nltk.download('punkt')

print("\n\n")
print('The files are: ')
print(state_union.fileids())


# # TF.IDF Representation
# Computing the TF.IDF value of each word of each text in the corpus

# In[ ]:


# Compute the TF value of each word from a bag of words (bow)
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
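# In[ ]:


# A companion sketch for the IDF side, assuming the same dict-of-counts bags
# of words; this computeIDF() is an assumption, not code from the original
# notebook:
import math


def computeIDF(docList):
    # document frequency: in how many bags each word appears
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                idfDict[word] += 1
    # inverse document frequency
    for word, df in idfDict.items():
        idfDict[word] = math.log(len(docList) / float(df))
    return idfDict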
# Author: Jack Keane
# Date: 3/25/20
# Description: Convert state of the union speeches into csv

# Libraries
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize
import string

# Code
speeches = state_union.fileids()
f = open("../acronym_data/state_union_data.csv", "w")
for s in speeches:
    speech = state_union.raw(s)
    sentences = sent_tokenize(speech.lower().replace("\n", " "))
    for sen in sentences:
        f.write(
            sen.translate(str.maketrans('', '', string.punctuation)) + "\n")
f.close()
# read texts from the State of the Union addresses using the state_union module
# determine the frequency of use of the words "men", "women", "people" in each document
import nltk
from nltk.corpus import state_union

state_files = state_union.fileids()
words = ['men', 'women', 'people']
cfd = nltk.ConditionalFreqDist(
    (text, word)
    for text in state_files
    for word in state_union.words(text))
cfd.tabulate(conditions=state_files, samples=words)

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_files
                               for word in state_union.words(fileid)
                               for target in words
                               if word.lower().startswith(target))
cfd.plot()

# analyze the frequency chart of modal verbs for different genres
# find other word use classes that also differ in different genres
import nltk
import nltk.corpus

corpus_name = nltk.corpus.brown
files = corpus_name.fileids()
modals = ['can', 'could', 'may', 'might', 'must', 'will']
commons = ['the', 'be', 'to', 'of', 'and', 'in', 'that']
adjectives = ['good', 'new', 'first', 'last', 'long']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
cfd = nltk.ConditionalFreqDist((genre, word)
import nltk
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from gensim import corpora, models

_files_all_speechs = state_union.fileids()
all_raw_speeches = []
for _file_ in _files_all_speechs:
    all_raw_speeches.append(state_union.raw(_file_))
# print('Number of Speeches:', len(all_raw_speeches))

all_categories = [x.split('-')[1].split('.')[0] for x in _files_all_speechs]
# print(all_categories)

stopwords = nltk.corpus.stopwords
eng_stopwords = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()


def basic_preprocessing(text):
    text = text.lower()  # lowering
    text = re.sub(r'\[.*?\]', '', text)  # remove citation-style brackets
    text = word_tokenize(text)
    text = [word for word in text if word not in eng_stopwords]  # remove stop words
    text = [word for word in text if len(word) > 1]  # remove single-character tokens
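# Example use (a sketch), assuming basic_preprocessing() ends by returning
# the token list: build the gensim dictionary and BoW corpus for the models
# imported above.
processed = [basic_preprocessing(raw) for raw in all_raw_speeches]
dictionary = corpora.Dictionary(processed)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed]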
len(persuasion)
len(set(persuasion))

# 3.
from nltk.corpus import brown
brown.fileids()
brown.categories()
brown.words(categories='adventure')

# 4.
from nltk.corpus import state_union
text = state_union.words()
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women']
    if w.lower().startswith(target))
cfd.plot()

# 5.
wn.synset('fish.n.01').part_meronyms()
wn.synset('fish.n.01').member_meronyms()
wn.synset('leaf.n.01').substance_meronyms()
wn.synset('fish.n.01').member_holonyms()
wn.synset('leaf.n.01').substance_holonyms()

# 6. cannot translate among 3 languages at a time; loop to solve
from nltk.corpus import swadesh
def __init__(self):
    self.number_id = 40
    self.source_id = "state_union"
    self.titles = [name for name in state_union.fileids()]
    self.data = [state_union.raw(name) for name in self.titles]
    # Gives an accuracy of 88% on test data
    return [clf_bern, clf_tree, vectorizer]


[clf_bern, clf_tree, vectorizer] = train_questions()


# this method classifies to which category a new question belongs
def classify_question(question, vectorizer=vectorizer, clf=clf_bern):
    b = vectorizer.transform([question])
    b = b.toarray()
    return clf.predict(b)[0]


# loading the data set of different minutes of the meets
lisa = state_union.fileids()
dataset = []
for ele in lisa:
    dataset.append(state_union.raw(ele))
for i in range(len(dataset)):
    dataset[i] = dataset[i].encode('utf-8')


# this function finds the most important words in the nth meet
def important_words(n, dataset=dataset):
    data = dataset
    # removing punctuation and \n from the data (Python 2 str.translate)
    for i in range(len(data)):
        data[i] = data[i].translate(None, string.punctuation)
        data[i] = data[i].translate(None, "\n")