def answers():
    ### Question 1
    print "*** Question 1 ***"
    print "Top 50 tokens for the inaugural corpus:"
    answer1a = q1(inaugural, inaugural.fileids(), 50)
    print answer1a
    print "Top 50 tokens for the twitter corpus:"
    answer1b = q1(xtwc, twitter_file_ids, 50)
    print answer1b
    ### Question 2
    print "*** Question 2 ***"
    corpus_tokens = get_corpus_tokens(inaugural, inaugural.fileids())
    answer2a = clean_tokens(corpus_tokens)
    print "Inaugural Speeches:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer2a))
    print "First 100 tokens in cleaned corpus:"
    print answer2a[:100]
    print "-----"
    corpus_tokens = get_corpus_tokens(xtwc, twitter_file_ids)
    answer2b = clean_tokens(corpus_tokens)
    print "Twitter:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer2b))
    print "First 100 tokens in cleaned corpus:"
    print answer2b[:100]

    print "Top 50 tokens for the cleaned inaugural corpus:"
    answer2c = q2(answer2a, 50)
    print answer2c
    print "Top 50 tokens for the cleaned twitter corpus:"
    answer2d = q2(answer2b, 50)
    print answer2d
    ### Question 3
    print "*** Question 3 ***"
    answer3 = q3()
    print answer3[:280]
    ### Question 4
    print "*** Question 4: building brown bigram letter model ***"
    brown_bigram_model = q4(brown)
    ### Question 5
    print "*** Question 5 ***"
    answer5 = q5("20100128.txt", brown_bigram_model)
    print "Top 10 entropies:"
    print answer5[:10]
    print "Bottom 10 entropies:"
    print answer5[-10:]
    ### Question 6
    print "*** Question 6 ***"
    answer6 = q6(answer5)
    print "Mean: " + str(answer6[0])
    print "Standard Deviation: " + str(answer6[1])
    print "ASCII tweets: Top 10 entropies:"
    print answer6[2][:10]
    print "ASCII tweets: Bottom 10 entropies:"
    print answer6[2][-10:]
    print "Probably not English tweets: Top 10 entropies:"
    print answer6[3][:10]
    print "Probably not English tweets: Bottom 10 entropies:"
    print answer6[3][-10:]
Example #2
def answers():
    ### Question 1
    print "*** Question 1 ***"
    answer1a = q1(inaugural, inaugural.fileids())
    print "Average token length for inaugural corpus: " + str(answer1a)
    '''
    For some reason it doesn't print anything for 1b, so I've commented it out; everything else prints fine.
    
    answer1b = q1(xtwc,twitter_file_ids)
    print "Average token length for twitter corpus: " + str(answer1b)
    '''
    ### Question 2
    print "*** Question 2 ***"
    answer2 = q2()
    print answer2
    ### Question 3
    print "*** Question 3 ***"
    print "Top 50 tokens for the inaugural corpus:"
    answer3a = q3(inaugural, inaugural.fileids(), 50)
    print answer3a
    print "Top 50 tokens for the twitter corpus:"
    answer3b = q3(xtwc, twitter_file_ids, 50)
    print answer3b
    ### Question 4
    print "*** Question 4 ***"
    corpus_tokens = get_corpus_tokens(inaugural, inaugural.fileids())
    answer4a = q4(corpus_tokens)
    print "Inaugural Speeches:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer4a))
    print "First 100 tokens in cleaned corpus:"
    print answer4a[:100]
    print "-----"
    corpus_tokens = get_corpus_tokens(xtwc, twitter_file_ids)
    answer4b = q4(corpus_tokens)
    print "Twitter:"
    print "Number of tokens in original corpus: " + str(len(corpus_tokens))
    print "Number of tokens in cleaned corpus: " + str(len(answer4b))
    print "First 100 tokens in cleaned corpus:"
    print answer4b[:100]
    ### Question 5
    print "*** Question 5 ***"
    print "Top 50 tokens for the cleaned inaugural corpus:"
    answer5a = q5(answer4a, 50)
    print answer5a
    print "Top 50 tokens for the cleaned twitter corpus:"
    answer5b = q5(answer4b, 50)
    print answer5b
    ### Question 6
    print "*** Question 6 ***"
    answer6 = q6()
    print answer6
    ### Question 7
    print "*** Question 7: building brown bigram letter model ***"
    brown_bigram_model = q7(brown)
    '''
Example #3
def fun11():
    """inaugural address corpus"""
    print inaugural.fileids()
    print [fileid[:4] for fileid in inaugural.fileids()]

    cfd = nltk.ConditionalFreqDist((target, fileid[:4]) \
        for fileid in inaugural.fileids() \
        for w in inaugural.words(fileid) \
        for target in ['america', 'citizen'] \
        if w.lower().startswith(target))
    cfd.plot()
Example #4
def fun8():
    from nltk.corpus import inaugural
    print inaugural.fileids()
    print [w[:4] for w in inaugural.fileids()]

    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    cfd.plot()  # conditional frequency distribution plot
Example #5
def inaugural_cfd():
    # named to avoid shadowing the "inaugural" corpus module

    inaugural.fileids()
    [fileid[:4] for fileid in inaugural.fileids()]

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['america', 'citizen']
            if w.lower().startswith(target))
    cfd.plot()
def exercise_inaugural():
    print(inaugural.fileids())
    # extract the year from each speech's file name
    print([file_id[:4] for file_id in inaugural.fileids()])

    # see how often the words america and citizen appear in speeches from different years
    cfd = nltk.ConditionalFreqDist((target, file_id[:4])
                                   for file_id in inaugural.fileids()
                                   for w in inaugural.words(file_id)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))
    cfd.plot()
def compare(word, word2):
    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in [word, word2]
                                   if w.lower().startswith(target))
    cfd.plot()
Example #8
def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    for fid in words.fileids():
        for word in words.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in gutenberg.fileids():
        for word in gutenberg.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in brown.fileids():
        for word in brown.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in reuters.fileids():
        for word in reuters.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    for fid in inaugural.fileids():
        for word in inaugural.words(fid):
            word = word.lower()
            if only_words.match(word) is not None:
                wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
Example #9
def cfd(text, tgt_list):
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in tgt_list
        if w.lower().startswith(target))
    #cfd.plot()
    return cfd

    
Example #10
def sent_length():

    text_file = str(input("Enter the name of a text file : \n"))
    txt_fl = inaugural.sents(text_file)
    print(len(txt_fl))
    file_name = inaugural.fileids()
    print(len(inaugural.sents(file_name)))
Example #11
def main():

    #Part 1: load inaugural addresses, tokenize, and serialize to pickle file
    #=============================================

    #get files names from nltk library
    file_ids = inaugural.fileids()

    #list to hold all tokenized addresses
    tokenized_addresses = []

    #loop through all inaugural addresses
    for address in file_ids:

        #read the address into a string of newline separated sentences
        string = read_address(address)

        #tokenize each address into a list of lowercase words
        words = tokenize(string)

        #add address title to beginning of address
        words.insert(0, address)

        #append the tokenized address to the master list
        tokenized_addresses.append(words)

    #serialize list of addresses to pickle file
    with open('proj3.pkl', 'wb') as fout:
        pickle.dump(tokenized_addresses, fout)
Example #12
def main():
    s1=pre(inaugural.raw('2009-Obama.txt'))
    sx=inaugural.fileids()
    for file in sx:
        s2=pre(inaugural.raw(file))
        #inter=set(s1) & set(s2)
        similarity1=similarity(s1,s2)
        print(similarity1,file)
Example #13
def main():

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['democracy', 'republic']
            if w.lower().startswith(target))
    cfd.plot()
Example #14
def get_sentences():
    '''Collect the sentences from the corpus and return them as a list.
    This is the function that needs to be called.'''
    articles = inaugural.fileids()
    sentences = []
    for i in articles:
        article = inaugural.sents(i)
        sentences = sentences + list(article)
    return sentences
Example #15
def print_inaugural():
    from nltk.corpus import inaugural
    cfd=nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america','citizen']
        if w.lower().startswith(target)
    )
    cfd.plot()
Example #16
def build_inaugural_corpus():
    """
    Get a word token list for each doc in the inaugural address corpus
    :return: word_lists
    """
    word_lists = []
    for fileid in inaugural.fileids():
        words = [w for w in inaugural.words(fileid)]
        word_lists.append(words)
    return word_lists
def get_inaugural_docs(download=False) -> List[List[List[str]]]:
    """
    Get the inaugural documents as a list (documents) of list (sentences) of list (sentence) of strings (words)
    :param download: If True, the corpus will be downloaded. Default=False
    :return:
    """
    if download:
        nltk.download('inaugural')
    return [[[w.lower() for w in sent] for sent in inaugural.sents(fileid)]
            for fileid in inaugural.fileids()]
def getFileids():

    index = 0
    for id in inaugural.fileids():
        index += 1
        if index == 2:
            print id  # print the president name
            print len(inaugural.words(id))  #print the # of words
        if index == 14:
            print id  # print the president name
            print len(inaugural.words(id))  #print the # of words
def main():
    # @BEGIN normalize_list
    # @IN inaugural @URI file:data/inaugural/{year}-{president}.txt
    # @OUT normalized_addresses 
    file_ids = inaugural.fileids()
    print(file_ids)
    normalized_addresses = []
    for address in file_ids:
        normalized_words = [address.split("-")[0]]
        for sent in inaugural.sents(address):
            prev_word = ""
            for word in sent:
                if(prev_word == "'"):
                    continue
                
                normalized = re.sub("[^a-z0-9]", "", word.lower())
                if(normalized != ""):
                    normalized_words.append(normalized)
                prev_word = word
        normalized_addresses.append(normalized_words)
    # @END normalize_list

    # @BEGIN pickleize
    # @IN normalized_addresses
    # @OUT pkl @URI file:data/norm_addresses.pkl
    fout = open("norm_addresses.pkl", "wb")
    pickle.dump(normalized_addresses, fout)
    fout.close()
    # @END pickleize

    # deserialize pkl file
    # @BEGIN depickleize
    # @IN pkl @URI file:data/norm_addresses.pkl
    # @OUT address_word_list
    fin = open("norm_addresses.pkl", "rb")
    address_word_list = pickle.load(fin)
    fin.close()
    # @END depickleize

    # @BEGIN frequency
    # @IN address_word_list
    # @IN search_word
    # @OUT frequency_maps
    search_word = input("Input word to find frequency: ")


    frequency_maps = {}
    for word_list in address_word_list:
        
        frequency_maps[word_list[0]] = calculate_frequency_map(word_list[1:])
    # @END frequency
    

    generate_plot(search_word, frequency_maps)
Example #20
def main():
    list_of_addresses = []
    for fileid in inaugural.fileids():

        list_of_words = inaugural.words(fileid)
        string_of_words = ' '.join(list_of_words)
        alphabetic_words = re.findall(r"\w+", string_of_words)
        list_of_addresses.append(alphabetic_words)
    #print(list_of_addresses)
    fout = open('proj3.pkl', 'wb')
    pickle.dump(list_of_addresses, fout)
    fout.close()
Example #21
def inaug20():
	
	#Variables
	myinaug=inaugural.fileids()
	myaug20=[]
	
	#Function
	for x in range(len(myinaug)-4):					#Goes through all ids, -4 for Obama
		(myaug20.append(myinaug[x:(x+5)]))			#Create list from one president to five more

	#Return
	return myaug20
Example #22
    def tabulateWordsInPeriods(self, theWords):
        """
        Find the distribution of words across the years, based on the Inaugural corpus.
        @param theWords: the word or list of words to look up
        """
        cdf = ConditionalFreqDist((textid[:4], target)
                                  for textid in inaugural.fileids()
                                  for word in inaugural.words(textid)
                                  for target in theWords
                                  if word.lower().startswith(target)
                                  or word.lower().endswith(target))
        cdf.tabulate()
Example #23
def Get_Corpus(debug):
    print("Get_Corpus")

    # Inaugural is a list of lists.
    # 	Each row is one of 56 inaugural addresses.
    #	Each row is a sequential list of the words and punctuation marks
    #		 in the speech.
    Inaugural = []
    i = 0
    for fileid in inaugural.fileids():
        Inaugural.append(inaugural.words(fileid))
        #	for fileid in genesis.fileids():
        #		Inaugural.append(genesis.words(fileid))
        print(i, fileid)
        debug.write("%d %s\n" % (i, fileid))
        i += 1

    Words = []
    for speech in Inaugural:
        words = list(set(speech))
        Words = list(set(words + Words))

    Frequency = []
    for word in Words:
        Frequency.append([0, 0, word])
    for speech in Inaugural:
        for word in speech:
            i = Words.index(word)
            Frequency[i][1] += 1
        S = list(set(speech))
        for word in S:
            i = Words.index(word)
            Frequency[i][0] += 1

#	Frequency = sorted(Frequency,key=lambda x:x[2], reverse=True)
    Frequency = sorted(Frequency, key=lambda x: x[1], reverse=True)
    Frequency = sorted(Frequency, key=lambda x: x[0], reverse=True)

    debug.write("\n\nSpeeches\n\n")
    for speech in Inaugural:
        for word in speech:
            debug.write("%s " % (word))
        debug.write("\n\n")
    debug.write("\n\n")

    debug.write("\n\nFrequency\n\n")
    for row in Frequency:
        debug.write("%d %d %s\n" % (row[0], row[1], row[2]))
    debug.write("\n\n")

    return (Inaugural, Frequency)
def lexDiv():
    y4 = []
    x4 = []

    for fileid in inaugural.fileids():
        words = inaugural.words(fileid)
        div = len(set(words)) / len(words)
        print(fileid[:4], "-", div)
        y4.append(fileid[:4])
        x4.append(div)

    plt.title('Lexical diversity')
    plt.xticks(rotation=90)
    plt.plot(y4, x4)
    plt.show()
def avgWord():
    x1 = []
    y1 = []
    for fileid in inaugural.fileids():
        words = inaugural.raw(fileids=fileid)
        words = words.split()
        average = sum(len(word) for word in words) / len(words)
        print(fileid[:4], "-", average)
        y1.append(fileid[:4])
        x1.append(average)

    plt.title('Average word length:')
    plt.xticks(rotation=90)
    plt.plot(y1, x1)
    plt.show()
def avgSent():
    x2 = []
    y2 = []

    for fileid in inaugural.fileids():
        average = sum(len(sent)
                      for sent in inaugural.sents(fileids=[fileid])) / len(
                          inaugural.sents(fileids=[fileid]))
        print(fileid[:4], "-", average)
        y2.append(fileid[:4])
        x2.append(average)

    plt.title('Average sentence length:')
    plt.xticks(rotation=90)
    plt.plot(y2, x2)
    plt.show()
Example #27
def main():
    ##nltk.download('reuters')
    nltk.download('inaugural')
    nltk.download('punkt')
    docinaug=inaugural.fileids()
    documents = reuters.fileids()
    print(str(len(documents)))
    print(reuters.raw("test/15556"))
    forwardDict,backwardsDict,probMatrix,probUniMatrix,totalProb=tokenize(reuters.raw("test/15556"))

    ##print(documents[1])
    ##print(docinaug[1])
    #forwardDict,backwardDict,probMtrx=tokenize("the man. the man. the man")
    sent_token=word_tokenize("hello my friend how are you")
    print("a")
    print(sentence_perplex(inaugural.raw(docinaug[1]),probMatrix,forwardDict,probUniMatrix))
def getGraphs():
    index = 0
    for id in inaugural.fileids():  #prob(-14)
        index += 1
        ww = inaugural.raw(id).lower()
        num_war = ww.count('war')
        num_america = ww.count('america')
        num_economy = ww.count('economy')
        num_world = ww.count('world')
        plot(index, num_war, 'mo')  #war
        plot(index, num_america, 'go')  #america (increasing)
        plot(index, num_economy, 'ro')  #ecomony
        plot(index, num_world, 'bo')  #world (increasing)
        xlabel('index: purple-war, green-america, red-economy, blue-world')
        ylabel('the frequency of the words used')
    show()
def CountWords(words):
    x = []
    y = []
    for fileid in inaugural.fileids():
        count = 0
        for w in inaugural.words(fileid):
            if w.lower() in words:
                count += 1
        per = (count / len(inaugural.words(fileid))) * 100
        y.append(fileid[:4])
        x.append(per)

    plt.title('Number of occurrences:')
    plt.xticks(rotation=90)
    plt.plot(y, x)
    plt.show()
Example #30
def fun2():
    # Plot the distribution chart and table.
    # The conditions are the words america and citizen; the counts in the plot are
    # the number of times the word appears in a particular speech. This exploits the
    # fact that the first 4 characters of each file name, e.g. 1865-Lincoln.txt,
    # contain the year. For every word in 1865-Lincoln.txt whose lowercase form
    # starts with america (e.g. Americans), the code produces a pair ('america', '1865').
    from nltk.corpus import inaugural
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])  # row, column
        for fileid in inaugural.fileids() for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target)  # count the matching words in each column
    )
    cfd.tabulate()
    cfd.plot()
def tabulate():

    cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                                   for fileid in inaugural.fileids()
                                   for w in inaugural.words(fileid)
                                   for target in ['america', 'citizen']
                                   if w.lower().startswith(target))

    languages = [
        'Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut',
        'Hungarian_Magyar', 'Ibibio_Efik'
    ]

    cfd = nltk.ConditionalFreqDist((lang, len(word)) for lang in languages
                                   for word in udhr.words(lang + '-Latin1'))

    cfd.tabulate(conditions=['English', 'German_Deutsch'],
                 samples=range(10),
                 cumulative=True)
Example #32
def tabulate():

    cfd = nltk.ConditionalFreqDist(
            (target, fileid[:4])
            for fileid in inaugural.fileids()
            for w in inaugural.words(fileid)
            for target in ['america', 'citizen']
            if w.lower().startswith(target))

    languages = ['Chickasaw', 'English', 'German_Deutsch',
            'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

    cfd = nltk.ConditionalFreqDist(
            (lang, len(word))
            for lang in languages
            for word in udhr.words(lang + '-Latin1'))

    cfd.tabulate(conditions=['English', 'German_Deutsch'],
            samples=range(10), cumulative=True)
def senti():
    x3 = []
    y3 = []
    x31 = []

    for fileid in inaugural.fileids():
        text = inaugural.raw(fileids=fileid)
        senti = TextBlob(text)
        print(fileid[:4], "-", senti.sentiment)
        y3.append(fileid[:4])
        x3.append(senti.sentiment[0])
        x31.append(senti.sentiment[1])
    plt.title('Polarity')
    plt.xticks(rotation=90)
    plt.plot(y3, x3)
    plt.show()
    plt.title('Subjectivity')
    plt.xticks(rotation=90)
    plt.plot(y3, x31)
    plt.show()
def graphWords():
    index = 0
    for id in inaugural.fileids():
        index += 1
        nchar = len(inaugural.raw(id)) * 1.0
        nword = len(inaugural.words(id)) * 1.0
        nsent = len(inaugural.sents(id)) * 1.0
        nvoc = len(set(w.lower() for w in inaugural.words(id))) * 1.0
        a = nchar / nword
        b = nword / nsent
        c = nword / nvoc
        plot(index, a, 'mo')  #purple color
        plot(index, b, 'go')  #green color
        plot(index, c, 'ro')  #red color

        xlabel(
            'index, from Washington to Obama (purple - character/word), (red - word/vocab)'
        )
        ylabel('Average numbers (green - word/sentence)')
    show()
Example #35
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
    def __append_corpus_data(self):
        """
        Appends data to the questions and statements files from the inaugural address corpus
        """
        sentences = []

        # Use the Presidential inaugural addresses corpus
        for fileid in inaugural.fileids():
            raw_text = inaugural.raw(fileid)
            sentence_tokens = nltk.sent_tokenize(raw_text)
            sentences += sentence_tokens
        random.shuffle(sentences)
        random.shuffle(sentences)
        random.shuffle(sentences)

        # Write sentences to the sentences and questions files
        for sentence in sentences:
            if sentence and 10 < len(sentence) < 75:
                if sentence.endswith('?'):
                    self.q_out.write(self.__strip_sentence(sentence) + '\n')
                else:
                    self.s_out.write(self.__strip_sentence(sentence) + '\n')
Example #37
def inaug():
	myinaug=inaugural.fileids()
	myaug20=[]
	for x in range(len(myinaug)-4):
		(myaug20.append(myinaug[x:(x+5)]))
	return myaug20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 11:00:43 2017

@author: Mohnish_Devadiga
"""

import nltk
from nltk.corpus import inaugural
import pandas as pd
import matplotlib 

inaugural.fileids()

#print(inaugural.fileids())

for speech in inaugural.fileids():
    word_count_total = len(inaugural.words(speech))
    print(speech , word_count_total)
    
#Go through all speeches
speech_length = [(len(inaugural.words(speech)), speech) for speech in inaugural.fileids()]

print(speech_length)

#Get the max and min speech
print("Max is : ",max(speech_length))
print("Min is : ",min(speech_length))

#Avg no of words per sentence for each speech
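#One possible way to compute this (a sketch; the original snippet breaks off here):
avg_sentence_length = [(len(inaugural.words(speech)) / len(inaugural.sents(speech)), speech)
                       for speech in inaugural.fileids()]
print(avg_sentence_length)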
Example #39
def main():
  # store one letter-frequency distribution per corpus
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if(character in string.letters):
            gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if(character in string.letters):
            brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if(character in string.letters):
            web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if(character in string.letters):
            inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if(character in string.letters):
            genesis_letters[character.upper()] += 1

  with open("genesis-letter-freq.txt",'w') as f:
    sys.stdout = f
    f.write("GENESIS\n")
    for let in samples:
        print(str(genesis_letters[let]))
  
  with open("gutenberg-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("GUTENBERG\n")
    for let in samples:
        print(str(gutenberg_letters[let]))
  with open("webtext-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("WEBTEXT\n")
    for let in samples:
        print(str(web_letters[let]))
  with open("inaugural-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("INAUGURAL\n")
    for let in samples:
        print(str(inaugural_letters[let]))
  with open("brown-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("BROWN\n")
    for let in samples:
        print(str(brown_letters[let]))
  
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
        brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
Example #40
print(reuters.words('training/9865')[:14])
print(reuters.words(['training/9865', 'training/9880']))
print(reuters.words(categories='barley'))
print(reuters.words(categories=['barley', 'corn']))
print("-" * 40)


print("""
----------------------------------------------------------------------
1.5  Inaugural Address Corpus
(any key to continue)""")
raw_input()
print("-" * 40)

from nltk.corpus import inaugural
print(inaugural.fileids())
print([fileid[:4] for fileid in inaugural.fileids()])
print("-" * 40)

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
print("-" * 40)


print("""
----------------------------------------------------------------------
Example #41
def answers():
    _rvals = []

    #### Question 1 ####
    print '##### Question 1 #####'
    print '(see code - lines 64-65)'
    print '(NB: the two variables are returned by this function)'
    _bush01 = inaugural.words('2001-Bush.txt')
    bush01_word_lengths = _lengths(_vocabulary(_bush01))
    fd_bush01_words = FreqDist(_nopunct(_bush01))
    _rvals.append(bush01_word_lengths)
    _rvals.append(fd_bush01_words)

    #### Question 2 ####
    print '\n##### Question 2 #####'
    bush01_top10_words = _firsts(fd_bush01_words.items()[:10])
    bush01_average_word_lengths = _avg(bush01_word_lengths)
    _obama09 = inaugural.words('2009-Obama.txt')
    _fd_obama09_words = FreqDist(_nopunct(_obama09))
    _obama09_word_lengths = _lengths(_vocabulary(_obama09))
    obama09_top10_words = _firsts(_fd_obama09_words.items()[:10])
    obama09_average_word_lengths = _avg(_obama09_word_lengths)
    print 'top10 words Bush (2001): ', _str(bush01_top10_words)
    print 'top10 words Obama (2009):', _str(obama09_top10_words)
    print 'average word length Bush (2001): ', bush01_average_word_lengths
    print 'average word length Obama (2009):', obama09_average_word_lengths

    #### Question 3 ####
    print '\n##### Question 3 #####'
    bush01_token_lengths = _avg(_lengths(_nopunct(_bush01)))
    obama09_token_lengths = _avg(_lengths(_nopunct(_obama09)))
    print 'average token length Bush (2001): ', bush01_token_lengths
    print 'average token length Obama (2009):', obama09_token_lengths

    #### Question 4 ####
    print '\n##### Question 4 #####'
    for _fileid in inaugural.fileids():
        _year = int(_fileid.split('-')[0])
        _vocab_size = number_of_word_types(_fileid)
        print 'year %d: %d word types' % (_year, _vocab_size)

    #### Question 5 ####
    print '\n##### Question 5 #####'
    fd_bush01_nostop = FreqDist(_nostops(_nopunct(_bush01)))
    fd_obama09_nostop = FreqDist(_nostops(_nopunct(_obama09)))
    bush01_top10_nostop = _firsts(fd_bush01_nostop.items()[:10])
    obama09_top10_nostop = _firsts(fd_obama09_nostop.items()[:10])
    print 'top10 non-stop-words Bush (2001): ', _str(bush01_top10_nostop)
    print 'top10 non-stop-words Obama (2009):', _str(obama09_top10_nostop)

    #### Question 6 ####
    print '\n##### Question 6 #####'
    _wash89 = inaugural.words('1789-Washington.txt')
    fd_wash89_nostop = FreqDist(_nostops(_nopunct(_wash89)))
    wash89_top10_nostop = _firsts(fd_wash89_nostop.items()[:10])
    print 'top10 non-stop-words Washington (1789):', _str(wash89_top10_nostop)

    #### Question 7 ####
    print '\n##### Question 7 #####'
    wash89_rank_country = rank(fd_wash89_nostop, 'country')
    obama09_rank_country = rank(fd_obama09_nostop, 'country')
    bush01_rank_country = rank(fd_bush01_nostop, 'country')
    print 'rank of "country" in Washington (1789):', wash89_rank_country
    print 'rank of "country" in Obama (2009):', obama09_rank_country
    print 'rank of "country" in Bush (2001):', bush01_rank_country

    #### Question 8 ####
    print '\n##### Question 8 #####'
    print '(see comments in "rank" function on lines 20-45)'
    
    #### Question 9 ####
    print '\n##### Question 9 #####'
    print '(see plot)'
    ff = inaugural.fileids()
    fdd = {}
    _years = []
    for _fileid in ff:
        fdd[_fileid] = FreqDist(_nostops(inaugural.words(_fileid)))
        _years.append(_fileid[0:4])
    pylab.plot([(lambda d: len(d) / float(d.N()))(fdd[f]) for f in ff])
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('ratio of word types to tokens (without stop-words)')
    pylab.xlabel('time')
    pylab.title('f(time) = #(word types) / #(word tokens)')
    pylab.show()

    #### Question 10 ####
    print '\n##### Question 10 #####'
    print '(see plot)'
    obama09top10_butnot_wash89top10 = [word for word in obama09_top10_nostop
        if word in fd_wash89_nostop and word not in wash89_top10_nostop]
    wash89top10_butnot_obama09top10 = [word for word in wash89_top10_nostop
        if word in fd_obama09_nostop and word not in obama09_top10_nostop]
    obama09_word = 'world'
    wash89_word = 'government'
    assert(wash89_word in wash89top10_butnot_obama09top10)
    assert(obama09_word in obama09top10_butnot_wash89top10)
    normalisation_justification = (
    "We normalise for differences in vocabulary size by dividing the rank of "
    "a word by the size of the vocabulary in that speech. "
    "Since rank is relative to vocabulary size, this is similar to "
    "finding the maximum rank over all speeches and dividing each rank by that "
    "quantity.")
    print normalisation_justification
    _normalised_rank = lambda f, w: min(1, rank(fdd[f], w) / \
        float(len(_vocabulary(_nostops(fdd[f])))))
    pylab.plot([_normalised_rank(f, obama09_word) for f in ff],
        label=obama09_word, color='b')
    pylab.plot([_normalised_rank(f, wash89_word) for f in ff],
        label=wash89_word, color='r')
    pylab.xticks(range(len(ff)), _years, rotation=90)
    pylab.xlim(0, len(ff) - 1)
    pylab.ylabel('normalised word rank (lower is better)')
    pylab.xlabel('time')
    pylab.title('f(time) = word rank / vocabulary size')
    pylab.legend()
    pylab.show()

    #### Question 11 ####
    print '\n##### Question 11 #####'
    observations_on_plots = (\
    "We observe that the rank of 'world' is noisy when observed at the level "
    "of an individual year/inaugural speech. However, when looking at the "
    "larger picture, a trend emerges: 'world''s rank is consistently getting "
    "higher over time - an indicator of an ever-globalising and shrinking "
    "world?"
    "\n"
    "We observe that 'government' is a consistently highly ranked word across "
    "time - except for a few inaugural speeches where it has a very low "
    "rank. Those speeches are around the early 1800s (abolishment of slavery), "
    "1860s-70s (US civil war), the early 1900s (World War One), and 1937-1981 "
    "(World War Two + Cold War) - it would seem that presidents don't want to "
    "remind their subjects of the government during hard times. Outliers to "
    "this theory can be explained easily (e.g. the somewhat high rank of "
    "'government' in 1949 = a certain 'evil government' being defeated).")
    print observations_on_plots

    return _rvals
Example #42
reuters.words('training/9865')[:14]
# [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', u'BIDS',
# u'DETAILED', u'French', u'operators', u'have', u'requested', u'licences',
#u'to', u'export']
reuters.words(['training/9865', 'training/9880'])
# out: [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
reuters.words(categories='barley')
# [u'FRENCH', u'FREE', u'MARKET', u'CEREAL', u'EXPORT', ...]
reuters.words(categories=['barley', 'corn'])
# [u'THAI', u'TRADE', u'DEFICIT', u'WIDENS', u'IN', ...]

# INAUGURAL ADDRESS corpus

from nltk.corpus import inaugural
inaugural.fileids()
# out: [u'1789-Washington.txt', u'1793-Washington.txt', u'1797-Adams.txt', u'1801-Jefferson.txt', u'1805-Jefferson.txt', u'1809-Madison.txt'...
# grab the first 4 chars of the fileids to grab the years
[fileid[:4] for fileid in inaugural.fileids()]
# out: [u'1789', u'1793', u'1797', u'1801', u'1805',...
cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))  # lowercase each word, then check whether it starts with either target ('america' or 'citizen')
# requires matplotlib
cfd.plot()

# ANNOTATED TEXT CORPORA

# Loading your own corpus
# see Pathology project: need to add pathology report text to txt files
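# A minimal sketch of loading your own plaintext corpus (the directory path below
# is a placeholder, not a real project path):
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/path/to/pathology_reports'      # hypothetical folder holding the .txt files
my_corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')
my_corpus.fileids()                             # the .txt files found under corpus_root
my_corpus.words(my_corpus.fileids()[0])[:20]    # first 20 tokens of the first file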
Example #43
File: NLP.py Project: Toma-L/NLP
import nltk
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()

reuters.categories('training/9865')
reuters.categories(['training/9865', 'training/9880'])
reuters.fileids('barley')
reuters.fileids(['barley', 'corn'])

reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories = 'barley')
reuters.words(categories = ['barley', 'corn'])

from nltk.corpus import inaugural
inaugural.fileids()
[fileid[:4] for fileid in inaugural.fileids()]

cfd = nltk.ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))

cfd.plot()

nltk.corpus.cess_esp.words()
nltk.corpus.floresta.words() #Error
nltk.corpus.indian.words('hindi.pos')
nltk.corpus.udhr.fileids()
Example #44
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories() 
    for word in brown.words(categories = genre))
genres = ['news','religion','hobbies','science_fiction','romance','humor']    
modals = ['can','could','may','might','must','will']
cfd.tabulate(conditions = genres, samples = modals)

from nltk.corpus import reuters
reuters.fileids()
reuters.categories(['training/9865', 'training/8666'])
reuters.fileids(['barley','corn'])
reuters.words('training/9865')[:14]
reuters.words(categories = ['corn','barley'])

from nltk.corpus import inaugural
inaugural.fileids()
inaugYears = [fileid[:4] for fileid in inaugural.fileids()]

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america','citizen']
    if w.lower().startswith(target))
cfd.plot()

from nltk.corpus import udhr
languages = ['English','Finnish_Suomi','Italian_Italiano', 'Greenlandic_Inuktikut']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
from nltk.corpus import inaugural as inag
from nltk import ConditionalFreqDist as CondFreqDist
cfd = CondFreqDist([(target , fileid[:4])\
		for fileid in inag.fileids() \
			for word in inag.words(fileid) \
				for target in ["wealth" , "peace" , "harmony" , "prosperous"] if word.lower().startswith(target)
		])
cfd.plot()
sorted([w for w in set(text4) if len(w) > 7 and fdist[w] > 7])

# Collocations and Bigrams. 
# A collocation is a sequence of words that occur together unusually often. 
# Built in collocations function
text4.collocations()
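# The heading above also mentions bigrams; a minimal sketch, assuming text4 is the
# Inaugural Address Corpus text loaded from nltk.book as in the rest of this walkthrough:
from nltk import bigrams
list(bigrams(text4))[:10]   # the first ten adjacent word pairs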


#############
#Corpus data#
#############

# Inaugural Address Corpus

from nltk.corpus import inaugural
inaugural.fileids()[:2]
[fileid[:4] for fileid in inaugural.fileids()]

#How the words America and war are used over time.

cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'war']
    if w.lower().startswith(target))
cfd.plot()
#cfd.tabulate()

from nltk.corpus import brown
news_words=brown.words(categories="news")
genre_word = [(genre, word) for genre in ['news', 'romance'] for word in brown.words(categories=genre)]
print(genre_word[:4])  # [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]
print(genre_word[-4:])  # [('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]
cfd = ConditionalFreqDist(genre_word)
print(cfd)  # <ConditionalFreqDist with 2 conditions>
print(cfd.conditions())  # ['news', 'romance']
print(cfd['news'])  # <FreqDist with 14394 samples and 100554 outcomes>
print(cfd['romance'])  # <FreqDist with 8452 samples and 70022 outcomes>
print(cfd['romance'].most_common(2))  # [(',', 3899), ('.', 3736)]
print(cfd['romance']['could'])  # 193
print(cfd['romance'].max())  # the most frequent sample in 'romance'
print(cfd['romance'][','])  # 3899
##################################################################
## plot() how the words America and citizen are used over time in the US presidential inaugural addresses
cfd = ConditionalFreqDist((target, fileid[:4]) for fileid in inaugural.fileids() for word in inaugural.words(fileid) for target in ['america', 'citizen'] if word.lower().startswith(target))
cfd.plot()  # plot how many times America and citizen occur in each year's speech
##################################################################
## tabulate(); extracting word pairs
# Next, let's combine regular expressions with conditional frequency distributions.
# Here we will extract all consonant-vowel sequences from the words of Rotokas, such as ka and si. Since each of these is a pair,
# it can be used to initialize a conditional frequency distribution. We then tabulate the frequency of each pair:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
print(cvs[:10])  # ['ka', 'ka', 'ka', 'ka', 'ka', 'ro', 'ka', 'ka', 'vi', 'ko']
cfd = ConditionalFreqDist(cvs)
cfd.tabulate()
#     a    e    i    o    u
# k  418  148   94  420  173
# p   83   31  105   34   51
# r  187   63   84   89   79
Example #48
def fun11():
    """inaugural address corpus"""
    print inaugural.fileids()
    print [fileid[:4] for fileid in inaugural.fileids()]
Example #49
__author__ = 'auroua'
from nltk.corpus import inaugural
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from lda_1 import LDA
import seaborn as sns

stops = set(stopwords.words("english"))

vocab = dict()
for fileid in inaugural.fileids():
    for word in inaugural.words(fileid):
        word = word.lower()
        if word not in stops and word.isalpha():
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1

"""
Sort the vocab keep only words which occur more than 50 times
Then Create word to id and id to word dictionaries
"""
vocab_sorted = filter(lambda x: x[1] > 50, sorted(vocab.items(), key=lambda x: x[1], reverse=True))
wordids = {v[0]: i for i, v in enumerate(vocab_sorted)}
idwords = {i: v[0] for i, v in enumerate(vocab_sorted)}
vocab_size = len(wordids)
print vocab_size

# Generate corpus document vectors
data = []
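# One possible continuation (the original example is truncated here): build a
# bag-of-words count vector for each address, restricted to the filtered vocabulary.
for fileid in inaugural.fileids():
    doc = np.zeros(vocab_size, dtype=int)
    for word in inaugural.words(fileid):
        word = word.lower()
        if word in wordids:
            doc[wordids[word]] += 1
    data.append(doc)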
Example #50
import os
import numpy as np
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

os.chdir("C:/Users/Charles/Desktop/Inaugural/")

#LOAD DATASETS

from nltk.corpus import inaugural
titles = inaugural.fileids()

addresses = []
for title in titles:
    f = inaugural.open(title)
    text = f.read().encode('UTF-8')
    addresses.append(text)

Pstem = PorterStemmer()
WNL = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.lemmatize(item))
    return stemmed
Example #51
def exercise_inaugural():
    print inaugural.fileids()
    # extract the year from each speech's file name
    print [file_id[:4] for file_id in inaugural.fileids()]
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import inaugural
##################################################################
## A quick look
print(type(inaugural))  # <class 'nltk.corpus.reader.plaintext.PlaintextCorpusReader'>
print(len(inaugural.fileids()))  # 56; 56 addresses in total, up through Obama
print(inaugural.fileids()[:3])  # ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt']
##################################################################
## print the inauguration years of the US presidents
print([fileid[:4] for fileid in inaugural.fileids()])  # ['1789', '1793', '1797', '1801', '1805', '1809', '1813', '1817', '1821', ...]
Example #53
# Reuters corpus

from nltk.corpus import reuters
reuters.fileids()
reuters.categories()

reuters.categories(['training/9865', 'training/9880'])
reuters.fileids(['barley', 'corn'])
reuters.words('training/9865')[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories=['barley', 'corn'])

# Inaugural address corpus
from nltk.corpus import inaugural
inaugural.fileids()
# Universal Declaration of Human Rights in multiple languages
from nltk.corpus import udhr
languages = ['Chickasaw', 'English', 'German_Deutsch','Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))
        
cfd.plot(cumulative = True)
cfd.tabulate(conditions=['English', 'German_Deutsch'],samples=range(10), cumulative=True)
# Conditional frequency distributions
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]

cfd = nltk.ConditionalFreqDist(genre_word)
Example #54
# Note that there are no optional tasks in this section, because familiarizing yourself with the NLTK
# is important even if you are an experienced programmer. Please complete the tasks in the boxes below.


# ======================= 		Learning how to use NLTK - Task 1     ============================
# 	Write all the Python code below in a new file called corpuses.py and make sure you understand it!
# 	You must work in your Dropbox folder so we can see your progress.
# 	Run your file every time something new is added so you can see how it works.
# 	There is a compulsory exercise for Task 1 that needs to be completed at the bottom of your corpuses.py

# === Part 1: Importing Corpuses ===

import nltk
from nltk.corpus import inaugural

print inaugural.fileids()

# Run your file. You should see all the text files containing all the speeches of the US presidents that the
# NLTK has saved inside it.
# Now add the lines:

print "=============Words in Obama's Speech ======"
print inaugural.words("2009-Obama.txt")  # Returns a list of all the words in Obama's speech
print "=============Sentences in Bush's speech ======"
print inaugural.sents("2005-Bush.txt")  # Returns a list of all the sentences in Bush's speech

# As you can see, the words of Obama's speech are printed in a list, as are the sentences of Bush's speech.

# Try adding code to your program to find and print the first 25 words of Obama's 2009 speech
# (one possible solution is sketched below).
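# One possible solution to the exercise (a sketch; other approaches work too):
print inaugural.words("2009-Obama.txt")[:25]  # the first 25 words of Obama's 2009 speech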

# ===  Part 2: Analysing tokens (words) of a text ===
from nltk.probability import FreqDist
from nltk.corpus import inaugural, stopwords
import string
import json
from pprint import pprint
import math
import networkx as nx

filenames = inaugural.fileids()

def dump_content(filename, content):
	j = json.dumps(content, indent=4)
	f = open(filename+'.json', 'w')
	print >> f, j
	f.close()

def read_content(filename):
	json_data=open(filename+'.json')
	content = json.load(json_data)
	json_data.close()
	return content

def remove_punctuation(text):
	content = [w.strip(string.punctuation) for w in text]
	return content

def remove_stopwords(text):
	content = [w for w in text if w.lower() not in stopwords.words('english')]
	return content

def clean(text):
Example #56
def main():
  # store common-word frequencies
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common.sort()

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")