Code example #1
def main():
	start = timer()

	tokensDF = spark.read.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Attack_Westminster_big_tokenized.json")
	freqTokensDF = spark.read.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit1-words.json")
	
	custom_stopwords = ["``", "''", "'s", "said", "could", "also", "news", "--", "..."]
	stop_words = set(stopwords.words('english') + list(string.punctuation) + custom_stopwords)

	# top_wordnet = set(freqTokensDF.rdd.flatMap(lambda x: get_synsets(x.word)).collect())
	# counter = Counter(tokensDF.rdd.flatMap(lambda x: x.tokens).flatMap(get_synsets).filter(lambda x: x in top_wordnet).collect())

	filtered_words = tokensDF.rdd.flatMap(lambda x: x.tokens_lower).collect() 

	# print("length of filtered_words: ") 
	# print(len(filtered_words))
	# print(filtered_words[0:10])
	# freq_synset_count = counter.most_common(100)
	# freq_synset = [word for word, _ in freq_synset_count]
	freq_words = freqTokensDF.rdd.map(lambda x: x.word).collect()
	
	# print("top 10 words from unit 1:")
	# print(freq_words[0:10])

	baseline_lower = [x.lower() for x in nltk.Text(brown.words() + words.words() + state_union.words()) if x.lower() not in stop_words]

	word_usage = get_percent_usage(freq_words, filtered_words, baseline_lower, key="event")

	# print("most_common type: " + str(type(freq_synset_count))). # list
	# print("word_usage type: " + str(type(word_usage)))  # list 
	# countDF = spark.createDataFrame(freq_synset_count, ['synset', 'count'])
	# countDF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_count_big.json", mode="overwrite")

	# count_usage_DF = spark.createDataFrame(word_usage, ['word', 'event', 'baseline', 'diff'])
	count_usage_DF = spark.createDataFrame(word_usage)

	count_usage_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_freq_big.json", mode="overwrite")
	# count_usage_DF.write.csv("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_word_freq_big.csv", mode="overwrite")

	# lemma_wordnet = set(freqTokensDF.rdd.flatMap(lambda x: get_lemmas(x.word)).collect())
	# lemma_counter = Counter(tokensDF.rdd.flatMap(lambda x: x.tokens).flatMap(get_lemmas).filter(lambda x: x in lemma_wordnet).collect())
	lemmas = get_lemma_set(freq_words)
	# print('lemmas: ' + str(type(lemmas)))  # list
	# print('lemmas[0]: ' + str(type(lemmas[0])))  # tuple

	lemma_usage = get_lemma_percent_usage(lemmas, filtered_words, baseline_lower, key='event')
	# print('lemma_usage: ' + str(type(lemma_usage)))  # list

	# lemma_countDF = spark.createDataFrame(lemma_counter.most_common(100), ['lemma', 'count'])
	# TODO: Error: TypeError: not supported type: <type 'set'>
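	# Note: Spark's createDataFrame cannot infer a schema from Python set values,
	# so converting any sets to lists or tuples beforehand avoids this TypeError.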
	lemma_usage_DF = spark.createDataFrame(lemma_usage)
	lemma_usage_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_lemma_freq_big.json", mode="overwrite")

	lemma_diff = get_lemma_diff(lemma_usage, key='diff')
	lemma_diff_DF = spark.createDataFrame(lemma_diff)
	lemma_diff_DF.write.json("/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Unit2_final_result.json", mode="overwrite")

	end = timer()

	print('time elapsed: ' + str(end - start))
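Note: get_percent_usage, get_lemma_set, get_lemma_percent_usage and get_lemma_diff are project helpers that are not shown here. Judging only from how their results are consumed (lists of dicts with 'word', 'event', 'baseline' and 'diff' keys, also written out in code example #3), a minimal sketch of get_percent_usage might look like the following; the actual implementation in the original repository may differ.

from collections import Counter

def get_percent_usage(freq_words, event_words, baseline_words, key="event"):
    # Hypothetical sketch: relative frequency (in percent) of each frequent word
    # in the event corpus and in the baseline corpus, plus their difference.
    event_counts = Counter(event_words)
    baseline_counts = Counter(baseline_words)
    event_total = float(len(event_words))
    baseline_total = float(len(baseline_words))
    usage = []
    for word in freq_words:
        event_pct = 100.0 * event_counts[word] / event_total
        baseline_pct = 100.0 * baseline_counts[word] / baseline_total
        usage.append({
            "word": word,
            key: event_pct,
            "baseline": baseline_pct,
            "diff": event_pct - baseline_pct,
        })
    return usage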
Code example #2
def chapter2_exercise4():
    # Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences of
    # men, women, and people in each document. What has happened to the usage of these words over time?
    files = state_union.fileids()
    men = dict()
    women = dict()
    people = dict()
    for index, file in enumerate(files):
        words = sorted(state_union.words(fileids=[file]))
        men[file] = words.count("men")
        women[file] = words.count("women")
        people[file] = words.count("people")
        print(file[:4], men[file], women[file], people[file], end="      ")
        if index % 6 == 5:
            print()
    print("\nMEN")
    for file, men_c in men.items():
        print(file[:4], men_c)
    print("\nWOMEN")
    for file, women_c in women.items():
        print(file[:4], women_c)
    print("\nPERSON")
    for file, person_c in people.items():
        print(file[:4], person_c)
    print("men:", sum(men.values()))
    print("women:", sum(women.values()))
    print("people:", sum(people.values()))
Code example #3
def main():
    print("Start: Initialize Variables")
    start = timeit.default_timer()
    sentences_str = fetch_sentences(INPUT_FILE)
    # TODO: After converting to lowercase, we may not know whether a word is a proper noun.
    word_tokens = word_tokenize(sentences_str.lower())
    stop_words = set(
        stopwords.words('english') + list(string.punctuation) +
        custom_stopwords)
    filtered_words = [w for w in word_tokens if w not in stop_words]
    freq_words_with_counts = get_most_frequent_words(filtered_words, 500)
    freq_words = [word for word, _ in freq_words_with_counts]
    baseline_lower = [
        x.lower()
        for x in nltk.Text(brown.words() + words.words() + state_union.words())
        if x.lower() not in stop_words
    ]
    end = timeit.default_timer()
    print("End: Initialize Variables (took: %0.2fs)" % (end - start))

    ### Used to determine the typical length of words for both baseline and our words.
    # print("Start: Plot Conditional Frequency Distributions")
    # start = timeit.default_timer()
    # plot_cond_freq_dist(word_tokens, baseline_lower)
    # end = timeit.default_timer()
    # print("End: Plot Conditional Frequency Distributions (took: %0.2fs)" % (end - start))

    ### Used to find the most common words in our dataset that help identify our dataset.
    print("Start: Word Frequency Usage Calculation")
    start = timeit.default_timer()
    word_usage = get_percent_usage(freq_words,
                                   filtered_words,
                                   baseline_lower,
                                   key="event")
    with open('result/unit2_word_freq.csv', 'w') as f:
        f.write("word, event, baseline, diff\n")
        for word in word_usage:
            f.write(
                '%s,%.4f,%.4f,%.4f\n' %
                (word['word'], word['event'], word['baseline'], word['diff']))
    end = timeit.default_timer()
    print("End: Word Frequency Usage Calculation (took: %0.2fs)" %
          (end - start))

    ###  Used to find the most frequent words when counting their synonyms as well.
    print("Start: Lemma Frequency Usage Calculation")
    start = timeit.default_timer()
    lemmas = get_lemma_set(freq_words)
    lemma_usage = get_lemma_percent_usage(lemmas,
                                          filtered_words,
                                          baseline_lower,
                                          key="event")
    with open('result/unit2_lemma_freq.csv', 'w') as f:
        f.write("word, event, baseline, diff\n")
        for lemma in lemma_usage:
            f.write('%s,%.4f,%.4f,%.4f\n' % (lemma['word'], lemma['event'],
                                             lemma['baseline'], lemma['diff']))
    end = timeit.default_timer()
    print("End: Lemma Frequency Usage Calculation (took: %0.2fs)" %
          (end - start))
Code example #4
File: NLPBook.py Project: lyozhou/lyo-lib
def state_union_men_stat():
    cfd = nltk.ConditionalFreqDist((target,year[:4])
        for year in state_union.fileids()
        for w in state_union.words(year)
        for target in ['men','women','people']
        if w.lower().startswith(target)
    )
    cfd.plot()
Code example #5
def all_documents():
    documents = []
    for document in state_union.fileids():
        text = ""
        for word in state_union.words(document):
            text = text + " " + word
        documents.append((text, extract_president(document)))
    return documents
Code example #6
def get_tf(filename):
    words = state_union.words(filename)
    freq = {}
    for word in words:
        word = word.lower()
        if word not in stop and word_regex.match(word):
            if word in freq:
                freq[word] += 1
            else:
                freq[word] = 1
    return freq
Code example #7
def get_all_words(filenames):
    all_words = set()
    for filename in filenames:
        file_word_list = state_union.words(filename)
        file_word_set = set()
        for word in file_word_list:
            word = word.lower()
            if word not in stop and word_regex.match(word):
                file_word_set.add(word)
        all_words |= file_word_set
    return all_words
Code example #8
def get_pos_tf(filename):
    words = state_union.words(filename)
    pos_words = pos_tag(words)
    freq = {}
    for (word, pos) in pos_words:
        word = word.lower()
        if word not in stop and word_regex.match(word):
            word_pos = word + '/' + pos
            if word_pos in freq:
                freq[word_pos] += 1
            else:
                freq[word_pos] = 1
    return freq
Code example #9
def get_idf(filenames, feature_words):
    idf = {}
    file_words = {}
    for filename in filenames:
        file_words[filename] = set(state_union.words(filename))            
    for feature_word in feature_words:
        idf[feature_word] = 0
        for filename in filenames:
            if feature_word in file_words[filename]:
                idf[feature_word] += 1
        if idf[feature_word] == 0:
            # smoothing
            idf[feature_word] = 1 
        idf[feature_word] = math.log( len(filenames) / idf[feature_word] )
    return idf
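Code examples #6, #7, and this one (along with #18 and #19 further down) read like pieces of a single TF-IDF style analysis of the state_union files; the stop list and word_regex they reference are defined elsewhere in that project. As a hedged illustration of how the pieces might be wired together, the sketch below supplies plausible stand-ins; the get_tfidf helper and the stop/word_regex definitions are assumptions, not code from that repository.

import math
import re
from nltk.corpus import state_union, stopwords

# Plausible stand-ins for the globals the helpers above rely on (assumptions).
stop = set(stopwords.words('english'))
word_regex = re.compile(r'^[a-z]+$')

def get_tfidf(filename, idf):
    # Hypothetical helper: weight each term frequency by its inverse document frequency.
    tf = get_tf(filename)  # get_tf from code example #6
    return {word: count * idf.get(word, 0.0) for word, count in tf.items()}

filenames = state_union.fileids()
feature_words = get_all_words(filenames)  # get_all_words from code example #7
idf = get_idf(filenames, feature_words)   # get_idf from this example
scores = get_tfidf(filenames[0], idf)
print(sorted(scores.items(), key=lambda item: item[1], reverse=True)[:10])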
Code example #10
def ex4():
    from nltk.corpus import state_union
    tags = ["men", "women", "people"]
    #  for fileid in state_union.fileids():
    #    words = state_union.words(fileid)
    #    fdist = nltk.FreqDist([w.lower() for w in words])
    #    print fileid + ": ",
    #    for tag in tags:
    #      print tag + "=" + str(fdist[tag]) + " ",
    #    print
    cfd = nltk.ConditionalFreqDist((target, fileid[0:4])
                                   for fileid in state_union.fileids()
                                   for w in state_union.words(fileid)
                                   for target in tags if w.lower() == target)
    cfd.plot()
Code example #11
File: ch02_ex.py Project: 447327642/nltk-examples
def ex4():
  from nltk.corpus import state_union
  tags = ["men", "women", "people"]
#  for fileid in state_union.fileids():
#    words = state_union.words(fileid)
#    fdist = nltk.FreqDist([w.lower() for w in words])
#    print fileid + ": ",
#    for tag in tags:
#      print tag + "=" + str(fdist[tag]) + " ",
#    print
  cfd = nltk.ConditionalFreqDist(
    (target, fileid[0:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
      for target in tags if w.lower() == target)
  cfd.plot()
Code example #12
    def __init__(self, corpus, n, maxword):
        if n < 1 or maxword < 1:
            raise Exception("Silly noodle, negatives aren't fun")

        #Special Word
        self.SPECIALWORD = "??"

        #corpus
        self.corpus = corpus

        #Number of Grams
        self.numberGrams = n

        #Number of words to consider
        self.maxNumberGrams = maxword

        #Lists of words from the corpora
        self.listOfTotalWords = self.corpus.words()
        self.listOfSecondCorpusWords = reuters.words()
        self.listofThirdCorpusWords = state_union.words()
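        # Note: the next assignment replaces the state_union words just stored,
        # so only the gutenberg words remain in listofThirdCorpusWords.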
        self.listofThirdCorpusWords = gutenberg.words()

        #Dictionary of Words
        self.dictionaryOfWords = Counter(self.listOfTotalWords)
        self.dictionaryOfWordsSecondCorpus = Counter(
            self.listOfSecondCorpusWords)

        #Dictionary of maxword most common
        self.listOfCommonWords = self.dictionaryOfWords.most_common(maxword)

        # Make sure our special word is special
        flag = True
        while flag:
            if self.SPECIALWORD in self.dictionaryOfWords:
                self.SPECIALWORD = self.SPECIALWORD + "?"
            else:
                flag = False

        #Membership lookup for the most common words. most_common() returns
        #(word, count) pairs, so we key the dictionary on the word itself and
        #store a dummy value (a plain set of the words would work equally well).
        self.dictOfCommonWords = dict()
        for word in self.listOfCommonWords:
            self.dictOfCommonWords[word[0]] = "dummy"

        #The new listing of words after we replace all the
        #undesired words
        self.newListingOfWords = [
            '.'
        ] + self.listOfTotalWords + self.listOfSecondCorpusWords + self.listofThirdCorpusWords
        """
        for word in self.listOfTotalWords:
            
            if word in self.dictOfCommonWords:
                self.newListingOfWords.append(word)
            else:
                self.newListingOfWords.append(word)
                #self.newListingOfWords.append(self.SPECIALWORD)
        """

        #Length of newListingOfWords
        self.lengthOfNewListingOfWords = len(self.newListingOfWords)

        #The ngrams of the the new listing of words
        self.ngramsOfNewList = ngrams(self.newListingOfWords, self.numberGrams)

        #Dictionary of a number of Occurances for a particular gram
        self.numberOccurancesOfGrams = Counter(self.ngramsOfNewList)

        #get all the grams of smaller size
        self.gramsOfSmaller = ngrams(self.newListingOfWords,
                                     self.numberGrams - 1)

        #get the counter of the gramsOfSmaller
        self.gramsOfSmallerCounter = Counter(self.gramsOfSmaller)
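The constructor keeps both the n-gram counter and the (n-1)-gram counter, which is exactly what a maximum-likelihood estimate of P(w_n | w_1 ... w_{n-1}) needs. The class's other methods are not shown; a minimal sketch of such an estimate (the method name and interface are assumptions, not part of the original class) could be:

    def gram_probability(self, gram):
        # Hypothetical helper: MLE estimate count(w_1..w_n) / count(w_1..w_{n-1}).
        gram = tuple(gram)
        history = gram[:-1]
        history_count = self.gramsOfSmallerCounter[history]
        if history_count == 0:
            return 0.0
        return self.numberOccurancesOfGrams[gram] / float(history_count)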
Code example #13
File: Lab1_1-5.py Project: serverlat/TDT4310
# b
print("Words longer than 4 characters:")
for word in words:
    if len(word) > 4:
        print(word, end=" ")
print("\n")

# Exercise 2

# a
files = list(state_union.fileids())
terms = ["men", "women", "people"]
statistics = nltk.ConditionalFreqDist((file, word.lower())
                                      for file in state_union.fileids()
                                      for word in state_union.words(file)
                                      for term in terms
                                      if word.lower() == term)
statistics.tabulate(conditions=files, samples=terms)

# b
years_raw = sorted(list(set([int(year[:4])
                             for year in state_union.fileids()])))
years = [str(year) for year in years_raw]
year_statistics = nltk.ConditionalFreqDist(
    (word.lower(), fileid[:4]) for fileid in state_union.fileids()
    for word in state_union.words(fileid) for term in terms
    if word.lower() == term)
year_statistics.plot()
# More women over time, a lot of people in 1995 and 1946, more or less stable amount of men.
Code example #14
            latinLanguages.append(language)
    languageContains = list()
    for latinlanguage in latinLanguages:
        if wordTested in udhr.words(latinlanguage):
            languageContains.append(latinlanguage)
    return languageContains


print("According to corpus.udhr the word 'war' is used in languages:")
print(find_language("war"))
print()
#EXERCISE 4
numMen = 0
numWomen = 0
numPeople = 0
for word in state_union.words():
    #print(word)
    if word == "PEOPLE".lower():
        numPeople += 1
    if word == "MEN".lower():
        numMen += 1
    if word == "WOMEN".lower():
        numWomen += 1
print("CHAPTER 2 EXERCISE 4")
print("NUM PEOPLE " + str(numPeople) + " NUM MEN " + str(numMen) +
      " NUM WOMEN " + str(numWomen))
print()

#EXERCISE 5
#Holonyms are words that are the larger group under which a word falls
#Meronyms are the smaller words included
Code example #15
>>> # problem 1
>>> from nltk.corpus import state_union
>>> cfd = nltk.ConditionalFreqDist((text, word)
			       for text in state_union.fileids()
			       for word in state_union.words( fileids = text ))

>>> text = state_union.fileids()
>>> contexts = ['men', 'women', 'people']
>>> cfd.tabulate(conditions = text, samples = contexts)
                       men  women people 
    1945-Truman.txt      2      2     10 
    1946-Truman.txt     12      7     49 
    1947-Truman.txt      7      2     12 
    1948-Truman.txt      4      1     22 
    1949-Truman.txt      2      1     15 
    1950-Truman.txt      6      2     15 
    1951-Truman.txt      8      2      9 
1953-Eisenhower.txt      3      0     17 
1954-Eisenhower.txt      2      0     15 
1955-Eisenhower.txt      4      0     26 
Code example #16
File: 2.py Project: MaciejWasilewski/NLP-with-Python
# word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])
#words
len(list(set([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])))

#3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')

#4
from nltk.corpus import state_union
state_union.fileids()
words=['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd=ConditionalFreqDist([(word.lower(), fileid) for fileid in state_union.fileids() for word in state_union.words(fileid)])
cfd.plot(conditions=words)

#5
word='life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word): 
    for mer in syn.part_meronyms():
        print("Synset '{2}':\n\t{0}\n\npart meronym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
        
    for mer in syn.member_meronyms():
        print("Synset '{2}':\n\t{0}\n\nmember meronym '{1}':\n\t{3} ".format(syn.definition(),
              mer.lemma_names()[0],syn.lemma_names()[0],mer.definition()))
    for mer in syn.substance_meronyms():
        print("Synset '{2}':\n\t{0}\n\nsubstance meronym '{1}':\n\t{3} ".format(syn.definition(),
Code example #17
import nltk
from nltk.corpus import state_union

#Word freq count
cfd = nltk.FreqDist(state_union.words())
cfd.plot(50)

j=1
for word in cfd.most_common(200):
    print(str(j)+" & "+str(word[0])+" & "+str(word[1])+"\\\\")
    j += 1
Code example #18
def get_vocab_size(filename):
    words = state_union.words(filename)
    vocab = set(words)
    return len(vocab)
Code example #19
def get_speech_length(filename):
    words = state_union.words(filename)
    return len(words)
Code example #20
reviews_text = brown.words(categories='reviews')
# OR
brown.words(categories=['news', 'reviews'])
from nltk.corpus import webtext
# webtext.fileids()
webtext.words('singles.txt')
webtext.words('overheard.txt')

# 4.☼ Read in the texts of the State of the Union addresses, using the state_union corpus reader. 
# Count occurrences of men, women, and people in each document. 
# What has happened to the usage of these words over time?
from nltk.corpus import state_union
search_terms = ['men', 'women', 'people']
for fileid in state_union.fileids():
#     fdist = nltk.FreqDist(word for target in search_terms for word in state_union.words(fileid) if word.lower().startswith(target))
    fdist = nltk.FreqDist(state_union.words(fileid))
    for term in search_terms:
        print(fileid, term, fdist[term])

#over time
# years = [fileid[:4] for fileid in state_union.fileids()]
'''
[
(genre, fileid[:4])
for genre in ['men', 'women', 'people']
for fileid in state_union.fileids()
for word in state_union.words(fileid) if word==genre
]
    
cfd = nltk.ConditionalFreqDist(
          (target, fileid[:4])
Code example #21
def question1():
    a = nltk.ConditionalFreqDist((x, id[:4]) for id in state_union.fileids()
                                 for w in state_union.words(id)
                                 for x in ['men', 'women', 'people']
                                 if w.lower().startswith(x))
    a.plot()
Code example #22
File: ch2.py Project: jyzhang/py-nlp
def state_union_ts(word_list):
	cfd = nltk.ConditionalFreqDist((word.lower(), fileid[:4]) 
		for fileid in state_union.fileids()
		for word in state_union.words(fileid) if word.lower() in word_list)
	return cfd
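As a usage illustration (the word list here is arbitrary), the returned ConditionalFreqDist can be plotted directly:

cfd = state_union_ts(['men', 'women', 'people'])
cfd.plot()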
Code example #23
File: sou.py Project: davidar/polya
import nltk
from nltk.corpus import state_union

test  = [fid for fid in state_union.fileids() if 'Johnson' in fid]
train = [fid for fid in state_union.fileids() if fid not in test]

print 'TEST:', ', '.join(test)

f = open('sou.test.txt','w')
for w in state_union.words(test): print>>f, w
f.close()

f = open('sou.norm.test.txt','w')
for s in state_union.sents(test):
	s = ' '.join(s).lower()
	s = s.replace("' s ","'s ").replace(' .','.')
	s = ' '.join(nltk.word_tokenize(s))
	print>>f, s
f.close()

print 'TRAIN:', ', '.join(train)

f = open('sou.train.txt','w')
for w in state_union.words(train): print>>f, w
f.close()

f = open('sou.norm.train.txt','w')
for s in state_union.sents(train):
	s = ' '.join(s).lower()
	s = s.replace("' s ","'s ").replace(' .','.')
	s = ' '.join(nltk.word_tokenize(s))
	print>>f, s
f.close()
Code example #24
print ('len(set([w.lower() for w in gutenberg.words(\'austen-persuasion.txt\')])) = ' +
       str(len(set([w.lower() for w in gutenberg.words('austen-persuasion.txt')]))) + '\n\n')

print ('3. brown.words(categories=\'news\') = ' + str(brown.words(categories='news')) + '\n')
print ('brown.words(categories=\'reviews\') = ' + str(brown.words(categories='reviews')) +
       '\n\n')

print ('4.\nfor fileid in state_union.fileids():\n    ' +
       'print str(state_union.words(fileid).count(\'men\')) + \' \' + ' +
       'str(state_union.words(fileid).count(\'women\')) + \' \' + ' +
       'str(state_union.words(fileid).count(\'people\'))')
print ('FileID\t\t\tMen\tWomen\tPeople')
print('===============================================')
for fileid in state_union.fileids():
    print (str(fileid) + ('\t' * (1 + int(2 - (1 / 15) * len(fileid)))) +
           str(state_union.words(fileid).count('men')) + '\t' +
           str(state_union.words(fileid).count('women')) + '\t' +
           str(state_union.words(fileid).count('people')))
print '\n\n'

print '5.'
for word in [wn.synset('aircraft.n.01'), wn.synset('zebra.n.01')]:
    print word
    print 'Member meronyms: ' + str(word.member_meronyms())
    print 'Part meronyms: ' + str(word.part_meronyms())
    print 'Substance meronyms: ' + str(word.substance_meronyms())
    print 'Member holonyms: ' + str(word.member_holonyms())
    print 'Part holonyms: ' + str(word.part_holonyms())
    print 'Substance holonyms: ' + str(word.substance_holonyms())
    print '\n'
print '\n\n'
Code example #25
linux_set = set(linux_words)

for cat in brown_cats:
	words = brown.words(categories=cat)
	tokens = [w.lower() for w in words]
	all_toks_brown = all_toks_brown + tokens
	complete_toks = complete_toks + tokens

for cat in reuters_cats:
	words = reuters.words(categories=cat)
	tokens = [w.lower() for w in words]
	all_toks_reuters = all_toks_reuters + tokens
	complete_toks = complete_toks + tokens

for cat in state_union_cats:
	words = state_union.words(cat)
	tokens = [w.lower() for w in words]
	all_toks_state_union = all_toks_state_union + tokens
	complete_toks = complete_toks + tokens

for word in linux_words:
	complete_toks.append(word)


#list_brown = list()
#for word in all_toks_brown:
#	word_length = len(word)
#	list_brown = list_brown + word_length

cnt_brown = Counter()
cnt_reuters = Counter()
Code example #26
# -*- coding: utf-8 -*-
import matplotlib

matplotlib.use('TkAgg')
import nltk
'''
☼ Read in the texts of the State of the Union addresses, using the
state_union corpus reader.  Count occurrences of men, women,
and people in each document.  What has happened to the usage of these
words over time?
'''

from nltk.corpus import state_union
#print state_union.fileids()
targets = ['men', 'women', 'people']
pair = [(target, fileid[:4]) for fileid in state_union.fileids()
        for word in state_union.words(fileid) for target in targets
        if word.lower() == target]
print pair
cfd = nltk.ConditionalFreqDist(pair)
cfd.plot()
Code example #27
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key)
                      for key, value in president_vocabulary.items()]
print(max(inverse_vocabulary)[1],
      max(inverse_vocabulary)[0])  #richest vocabulary for Harrison in 1841
print(min(inverse_vocabulary)[1],
      min(inverse_vocabulary)[0])  #poorest vocabulary for Washington in 1793

president_vocabulary_state_union = {}

for president in state_union.fileids():
    vocab = Vocabulary(state_union.words(president), unk_cutoff=2)
    president_vocabulary_state_union[president] = len(vocab)

inverse_vocabulary_state_union = [
    (value, key) for key, value in president_vocabulary_state_union.items()
]
print(
    max(inverse_vocabulary_state_union)[1],
    max(inverse_vocabulary_state_union)
    [0])  #richest vocabulary for Truman in 1946
print(
    min(inverse_vocabulary_state_union)[1],
    min(inverse_vocabulary_state_union)
    [0])  #poorest vocabulary for Johnson in 1963

# Exercise 2
Code example #28
#Number 1 (2.4) in HW3

print('################ Number 1 ################')

#Generating list for each of the words through time
from nltk.corpus import state_union as su
total = []
men = []
women = []
people = []
for s in su.fileids():
    length_women = 0
    length_men = 0
    length_people = 0
    length = 0
    for w in su.words(s):
        if w.lower() == 'women':
            length_women += 1
            length += 1
        elif w.lower() == 'men':
            length_men += 1
            length += 1
        elif w.lower() == 'people':
            length_people += 1
            length += 1
    total.append(length)
    women.append(length_women)
    men.append(length_men)
    people.append(length_people)
    length_women = 0
    length_men = 0
Code example #29
File: u2.py Project: ranitaofor/Team-A-NLP
		print(i,": ", len([w for w in words if len(w) == i]) / len(words))

alphalowerfreq = FreqDist(wordsloweralpha)
ourwords = ["shooting", "elementary", "school", "dead", "victim", "gunman", "connecticut", "sandy", "injured", "lanza", "tragedy", "grade", "children", "firearm", "weapon", "morning", "december", "teacher", "police", "motive"];

for i in range(0, len(ourwords)):
	print(str(ourwords[i])+ ": "+ str(alphalowerfreq[ourwords[i]] / len(wordsloweralpha)))

from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.corpus import state_union
from nltk.corpus import words

setnames = ["baseline", "Class Event", "Connecticut School Shooting"]

baselineNOSET = brown.words() + reuters.words() + state_union.words() + words.words();
baseline = set(baselineNOSET) #nltk.Text(corpuses)
sets = {"baseline":baseline , "Class Event":set(classwordsloweralpha), "Connecticut School Shooting":set(wordsloweralpha)}
setlens = {"baseline":sum(map(lambda x: len(x), baseline)) , "Class Event":sum(map(lambda x: len(x), set(classwordsloweralpha))), "Connecticut School Shooting":sum(map(lambda x: len(x), set(wordsloweralpha)))}


cfd = nltk.ConditionalFreqDist((s, len(word)) 
	for s in setnames 
	for word in sets[s])
cpd = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist)
cfd.plot(cumulative=True)

baselinefreq = FreqDist([w.lower() for w in baselineNOSET if w.isalpha()])
Code example #30
# read texts from the State of the Union addresses using the state_union module
# determine the frequency of use of the words "men", "women", "people" in each document
import nltk
from nltk.corpus import state_union

state_files = state_union.fileids()
words = ['men', 'women', 'people']

cfd = nltk.ConditionalFreqDist(
    (text, word) for text in state_files for word in state_union.words(text))
cfd.tabulate(conditions=state_files, samples=words)

cfd = nltk.ConditionalFreqDist((target, fileid[:4]) for fileid in state_files
                               for word in state_union.words(fileid)
                               for target in words
                               if word.lower().startswith(target))
cfd.plot()

# analyze the frequency chart of modal verbs for different genres
# find other word use classes that also differ in different genres
import nltk
import nltk.corpus

corpus_name = nltk.corpus.brown
files = corpus_name.fileids()
modals = ['can', 'could', 'may', 'might', 'must', 'will']
commons = ['the', 'be', 'to', 'of', 'and', 'in', 'that']
adjectives = ['good', 'new', 'first', 'last', 'long']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']

cfd = nltk.ConditionalFreqDist((genre, word)
Code example #31
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()
        
        
cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))


# In[47]:

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())


# In[55]:

#5. Investigate the holonym-meronym relations for some nouns. Remember that there are three kinds of holonym-meronym relation, so you need to use: member_meronyms(), part_meronyms(),  substance_meronyms(), member_holonyms(), part_holonyms(), and substance_holonyms().

wordnet.synset('book.n.01').part_holonyms()
wordnet.synset('book.n.01').substance_holonyms()
wordnet.synset('book.n.01').member_holonyms()
Code example #32
File: nltk.py Project: MikeXL/Machine-Learning
import pandas as pd

import sys
print sys.version


import nltk

help(nltk.download)
nltk.download('all-corpora')

from nltk.corpus import state_union

state_union.words()
len(state_union.words())

sentences = state_union.sents()
print sentences

state_union_text = nltk.Text(state_union.words())
print state_union_text.count("war")
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist

fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result
Code example #33
# Read in the texts of the State of the Union addresses, using the state_union corpus reader. Count occurrences of
# men, women and people in each document. What has happened to the usages of these words over time?
import nltk
from nltk.corpus import state_union

fileids = state_union.fileids()
print(fileids)
print(fileids[1][:4])
print(state_union.words(fileids[1])[:100])

cfd = nltk.ConditionalFreqDist((gender, fileid[:4])
                               for fileid in state_union.fileids()
                               for word in state_union.words(fileid)
                               for gender in ['men', 'women', 'people']
                               if word.lower().startswith(gender))
cfd.plot()
Code example #34
File: ch2.py Project: juri-220/Python-NLP
#2. 
persuasion=nltk.corpus.gutenberg.words('austen-persuasion.txt')
len(persuasion)
len(set(persuasion))

#3.
from nltk.corpus import brown
brown.fileids()
brown.categories()
brown.words(categories='adventure')

#4.
from nltk.corpus import state_union

text = state_union.words()
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men','women']
    if w.lower().startswith(target))
cfd.plot()

#5.
wn.synset('fish.n.01').part_meronyms()
wn.synset('fish.n.01').member_meronyms()
wn.synset('leaf.n.01').substance_meronyms()
wn.synset('fish.n.01').member_holonyms()
wn.synset('leaf.n.01').substance_holonyms()
Code example #35
from nltk.corpus import state_union
import nltk

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ["people", "men", "women"]
    if w.lower().startswith(target)) 

cfd.plot() # Can't really depict a noticeable evolution.
Code example #36
from nltk.corpus import state_union
import nltk

#remove punctuation and stopwords
all_words=(w.lower() for w in state_union.words() if (w.isalpha()) and (w.lower() not in nltk.corpus.stopwords.words('english')))

#Word freq count
cfd = nltk.FreqDist(all_words)

#Calculate 50 most common words
mostcommon = cfd.most_common(50)

#Plot 50 most common words and print them
cfd.plot(50)
print(mostcommon)
Code example #37
#4

import nltk
from nltk.corpus import state_union

for speech in state_union.fileids():
    words = state_union.words(fileids=[speech])
    fdist = nltk.FreqDist(w.lower() for w in words)
    print(speech)
    print("she: ", fdist["she"], end='\n')
    print("he: ", fdist["he"], end='\n')
    print("people: ", fdist["people"], end='\n')

Code example #38
File: ex16.py Project: vpapg/NLTK_book_py3
        'u': 6,
        'v': 6,
        'w': 800,
        'x': 60,
        'y': 10,
        'z': 7
    }

    return sum(letter_vals[l.lower()] for l in word
               if len(re.findall(r'[a-z]', l.lower())) > 0)


print('(a)\ngematria for \'forbidden\':', gematria('forbidden'), '\n\n')

# (b)
words = state_union.words()

#print('\n(b)\nNumber of words with gematria score 666: ' + str(len([w for w in words if (gematria(w)==666)])))

#w666 = [(fileid,w) for fileid in state_union.fileids() for w in state_union.words(fileid) if gematria(w)==666]

print('(b)')
for fileid in state_union.fileids():
    w666 = [w.lower() for w in state_union.words(fileid) if gematria(w) == 666]
    print(fileid, ': ', len(w666), w666)
    print()


# (c)
def decode(text):
    tokens = [t.lower() for t in text.split()]
Code example #39
def findNbest(finder, n):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    resultList = []
    for col in finder.nbest(trigram_measures.pmi, n * 3):
        if (len(resultList) >= n): break
        if maxOneName(col): resultList.append(col)
    return resultList


def performTrigram(wordSet, windowSize, printTime=False):
    start = time.time()
    finder = TrigramCollocationFinder.from_words(wordSet,
                                                 window_size=windowSize)
    end = time.time()
    if printTime:
        print(end - start)
    return finder


def printResults(results):
    for (a, b, c) in results:
        print("\\item " + a + " " + b + " " + c)


#ADD 1.3 code here!!!!

printResults(findNbest(performTrigram(state_union.words(), 3, True), 10))
printResults(findNbest(performTrigram(state_union.words(), 5, True), 10))
printResults(findNbest(performTrigram(state_union.words(), 10, True), 10))
Code example #40
File: small_analysis.py Project: atokop/compling
classevent_words = classevent_wordlists.words()
classevent_sents = classevent_wordlists.sents()
classevent_words = [w.lower() for w in classevent_words if w.isalnum()]
classevent_words = nltk.Text(classevent_words)
classevent_words_lem = [lemmer.lemmatize(w) for w in classevent_words]

small_words = wordlists.words()
small_words = [w.lower() for w in small_words if w.isalnum()]
small_words = nltk.Text(small_words)
small_words_lem = [lemmer.lemmatize(w) for w in small_words]

yourwords = ['earthquake', 'seismic', 'aftershocks', 'quake', 'damage', 'magnitude', 'tremor', 'richter', 'epicenter', 'depth', 'fault', 'hypocenter', 'focus', 'dead', 'casualties', 'structural', 'seismometer', 'temblor', 'hazard', 'impact']
yourwords_lem = [lemmer.lemmatize(w.lower()) for w in yourwords]


baseline_words = brown.words() + state_union.words() + reuters.words() + words.words()
baseline_words_lem = [lemmer.lemmatize(w) for w in baseline_words]
def non_cumulative_word_length_distribution(words):
    freq_dist_pairs = FreqDist([len(w) for w in words]).items()
    return_dist = {}
    for pair in freq_dist_pairs:
        return_dist[pair[0]] = pair[1]
    return return_dist

def cumulative_word_length_distribution(words):
    non_cumulative_dist = non_cumulative_word_length_distribution(words)
    print non_cumulative_dist
    cumulative_dist = {}
    for i in non_cumulative_dist:
        sum = 0
        for j in range(i):
Code example #41
import nltk
from nltk.corpus import state_union
from nltk.stem.porter import PorterStemmer

#stemmer declaration
stemmer=PorterStemmer()

#retrieve all non-stopwords and stem them
all_stems =(stemmer.stem(w.lower()) for w in state_union.words () if (w.isalpha ()) and 
           (w.lower() not in nltk.corpus.stopwords.words('english')))

#list of all unique stems in the collection
dist_stems=set(all_stems)
Code example #42
import nltk
from nltk.corpus import state_union
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

#declare lematizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

#get all non-stopwords and lemmatize them
all_lemmas = (lemmatizer.lemmatize(w.lower()) for w in state_union.words() if (
    w.isalpha()) and (w.lower() not in nltk.corpus.stopwords.words('english')))

#create empy list for stemmed lemmas
stemmed_lemmas = []

#populate list
for word in all_lemmas:
    stemmed_lemmas.append(stemmer.stem(word))
Code example #43
File: 2-04_cond-fdist.py Project: hmly/nlp-solutions
import nltk
from nltk.corpus import state_union

# Plot usage of words over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target))
cfd.plot()
Code example #44
        
    #def __new__(cls, *args, **kwargs):              # http://stackoverflow.com/questions/42558/python-and-the-singleton-pattern
    #    if not cls.__instance:                      
    #        cls.__instance = super(Decoder, cls).__new__(cls, *args, **kwargs)
    #    return cls.__instance
    def __init__(self):
        if not Decoder.__index:
            Decoder.__index = nltk.Index([])
            Decoder.__index
if __name__ == "__main__":
    from nltk.corpus import state_union
    print "Occurrences of the sign of the Devil in SotU addresses past:"
    for file in state_union.fileids():
        print file[:4], sum(1 for word in state_union.words(file) if gematria(word) == 666)

    
    decoded = Decoder().decode(state_union.words())
    print decoded[:200]