Example #1
def make_visualizations(chronology_date):

    num_per_bigram_per_date = chronology_date.groupby(["Bigram", "Date"]).agg(
        {"Num occurrences": "sum"})
    store_to_pickle(num_per_bigram_per_date, "num_per_bigram_per_date.pickle")

    # num_per_bigram_per_date = pickle.load(open("num_per_bigram_per_date.pickle", "rb"))
    grouped = chronology_date.groupby(["Bigram"])
    store_to_pickle(grouped, "grouped.pickle")
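The store_to_pickle and write_to_excel helpers are used throughout these examples but never defined in them. A minimal sketch of what they might look like, assuming they are thin wrappers around pickle.dump and pandas.DataFrame.to_excel (the names and two-argument signatures are inferred from the call sites, not taken from the original project):

import pickle
import pandas as pd

def store_to_pickle(obj, filename):
    # Serialize any Python object to disk so later stages can reload it.
    with open(filename, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def write_to_excel(df, filename):
    # Persist a DataFrame for manual inspection; assumes an Excel writer engine such as openpyxl is installed.
    df.to_excel(filename)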
Example #2
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # Store relevant information for each bigram
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([
                    str(bigram), speaker_name, identity,
                    indv_speech_bigram[bigram], party
                ])
                row_entry_date.append([
                    str(bigram), speaker_name, date,
                    indv_speech_bigram[bigram], party
                ])

    chronology_speechid = pd.DataFrame(row_entry_speechid,
                                       columns=[
                                           "Bigram", "Speaker Name",
                                           "Speechid", "Num occurrences",
                                           "Party"
                                       ])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")

    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
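compute_ngrams is defined elsewhere in the project; the loop above only relies on it returning a mapping from each bigram to the number of times it occurs in the speech. A minimal sketch under that assumption (whitespace tokenization and a collections.Counter of token tuples; the real implementation may normalize the text differently):

from collections import Counter

def compute_ngrams(text, n):
    # Tokenize on whitespace and count each run of n consecutive tokens as one n-gram.
    tokens = text.split()
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))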
Example #3
def parseFiles(raw_speeches, multiple_speakers):
    # Assumes all XML files are stored in an AP_ARTFL_vols folder in the same directory as the Python file
    files = os.listdir("AP_ARTFL_vols/")
    dates = set()
    num_sessions = 0
    num_morethan1_session = 0
    for filename in files:
        if filename.endswith(".xml"):
            print(filename)
            filename = open('AP_ARTFL_vols/' + filename, "r")
            # Extracts volume number to keep track of for names_not_caught and speakers_using_find
            volno = re.findall(vol_regex, str(filename))[0]
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')
            pages = re.findall(page_regex, contents)
            # Find all the sessions in the xml
            sessions = soup.find_all(['div3'], {"type": ["other"]})
            sessions_other = soup.find_all(['div2'], {"type": ["session"]})
            sessions = sessions + sessions_other
            # sessions = soup.find_all(['div2', 'div3'], {"type": ["session", "other"]})

            for session in sessions:
                date = extractDate(session)
                # Restricts to valid dates we want to look at
                if (date >= "1789-05-05") and (date <= "1795-01-04") and (
                        date != "error"):
                    # dates is a set keeping track of dates already processed
                    # Accounts for multiple sessions per day
                    num_sessions += 1
                    if date in dates:
                        num_morethan1_session += 1
                        date = date + "_soir"
                        if date in dates:
                            date = date + "2"
                            findSpeeches(raw_speeches, multiple_speakers,
                                         session, date, volno)
                        else:
                            findSpeeches(raw_speeches, multiple_speakers,
                                         session, date, volno)
                            dates.add(date)
                    else:
                        findSpeeches(raw_speeches, multiple_speakers, session,
                                     date, volno)
                        dates.add(date)
            filename.close()

    store_to_pickle(num_sessions, "num_sessions.pickle")
    store_to_pickle(num_morethan1_session, "num_morethan1_session.pickle")
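parseFiles and the other examples filter sessions by comparing dates as plain "YYYY-MM-DD" strings; that works because ISO-formatted dates sort lexicographically. A small illustration (date_regex itself is not shown in these excerpts, so the pattern below is an assumption):

import re

# Hypothetical pattern matching the ISO dates embedded in the session/speech ids used above.
date_regex = r"[0-9]{4}-[0-9]{2}-[0-9]{2}"
identity = "1793-01-15_2"  # hypothetical speech id
date = re.findall(date_regex, str(identity))[0]
print("1789-05-05" <= date <= "1795-01-04")  # True: ISO dates compare correctly as strings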
Example #4
def aggregate_by_speaker():

    byspeaker = {}
    speakerdict = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words

            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None

    write_to_csv(byspeaker, "byspeaker.csv")
    store_to_pickle(byspeaker, "byspeaker.pickle")

    write_to_csv(speaker_num_words, "speaker_num_words.csv")
    store_to_pickle(speaker_num_words, "speaker_num_words.pickle")
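The byspeaker[speaker] + speech_bigrams update above assumes compute_ngrams returns collections.Counter objects, since Counter defines + as element-wise addition of counts. A quick illustration:

from collections import Counter

a = Counter({("la", "nation"): 2, ("le", "roi"): 1})
b = Counter({("la", "nation"): 1})
print(a + b)  # Counter({('la', 'nation'): 3, ('le', 'roi'): 1})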
Example #5
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
	
	# Computes the tfidf scores within each group
	gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

	store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
	store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

	# Stores the tf_idf vectors in Excel
	df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
	write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')

	df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
	write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

	# Combines the tfidf vectors of both parties into one file
	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

	# Restricts bigrams based on v, the number of times the bigram appears, and optionally on gir_docs/mont_docs,
	# the number of speakers in each group who use that bigram
	# The names of these dataframes can be changed to reflect whatever restrictions are applied
	Girondins_restricted = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
	Montagnards_restricted = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

	store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
	store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

	gir_tfidf = compute_tfidf(Girondins_restricted, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards_restricted, num_speeches, bigram_doc_freq)

	# Stores the restricted Girondins and Montagnards frequency vectors and tfidf scores in the same documents
	df_combined = pd.DataFrame([Girondins_restricted, Montagnards_restricted])
	df_combined = df_combined.transpose()
	df_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
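compute_tfidf is not shown in these excerpts. A minimal sketch of one plausible implementation, assuming the standard tf * log(N / df) weighting over the party's bigram counts and the bigram document-frequency dictionary built in aggregate() (the project's actual weighting may differ):

import math

def compute_tfidf(counts, num_speeches, doc_freq):
    # counts: bigram -> raw frequency for one party; doc_freq: bigram -> number of speeches containing it.
    tfidf = {}
    for bigram, tf in counts.items():
        df = doc_freq.get(bigram, 1)
        tfidf[bigram] = tf * math.log(float(num_speeches) / df)
    return tfidf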
Example #6
        year = session[0:4]
        if year in num_speeches_per_year:
            num_speeches_per_year[year] = num_speeches_per_year[year] + speeches_per_session[session]
        else:
            num_speeches_per_year[year] = speeches_per_session[session]

        if year in count_sessions:
            count_sessions[year] = count_sessions[year] + 1
        else:
            count_sessions[year] = 1

    avg_num_speeches_per_session_per_year = {}
    for year in num_speeches_per_year:
        avg_num_speeches_per_session_per_year[year] = num_speeches_per_year[year] / (1.0 * count_sessions[year])

    store_to_pickle(num_speeches_per_year, "num_speeches_per_year.pickle")
    store_to_pickle(count_sessions, "count_sessions.pickle")

    write_to_csv(num_speeches_per_year, "num_speeches_per_year.csv")
    write_to_csv(avg_num_speeches_per_session_per_year, "avg_num_speeches_per_session_per_year.csv")
    write_to_csv(count_sessions, "count_sessions.csv")

    # num speakers per year
    speakers_per_year = {}
    for speechid in speechid_to_speaker:
        year = speechid[0:4]
        speaker = speechid_to_speaker[speechid]
        if year in speakers_per_year:
            speakers_per_year[year].add(speaker)
        else:
            speakers_per_year[year] = set()
            speakers_per_year[year].add(speaker)
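The count-or-initialize pattern in this snippet is also what the augment() helper used in the later examples presumably does. A minimal sketch, assuming it simply increments a per-key counter in a plain dictionary (the project's real helper is not shown here):

def augment(counts, key):
    # Increment the count for key, creating the entry on first sight.
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1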
Example #7
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
	speaker_num_speeches = {}
	speaker_char_count = {}
	
	# List of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Matches bigrams to the list of speakers and speeches that have that bigram
	bigrams_to_speeches = {}
	bigrams_to_speakers = {}

	# Maintains the number of documents a given bigram is spoken in for use with tf-idf
	bigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print(speaker_name)
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
				# To potentially establish a cutoff for analysis purposes
				augment(speaker_num_speeches, speaker_name)
				if speaker_name in speaker_char_count:
					speaker_char_count[speaker_name] += len(raw_speeches[identity])
				else:
					speaker_char_count[speaker_name] = len(raw_speeches[identity])

				indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

				for bigram in indv_speech_bigram:
					augment(bigram_doc_freq, bigram)

					# Maintains a list of the speeches in which each bigram appears
					if bigram in bigrams_to_speeches:
						bigrams_to_speeches[bigram].append(identity)
					else:
						bigrams_to_speeches[bigram] = []
						bigrams_to_speeches[bigram].append(identity)
					if bigram in bigrams_to_speakers:
						bigrams_to_speakers[bigram].add(speaker_name)
					else:
						bigrams_to_speakers[bigram] = set()
						bigrams_to_speakers[bigram].add(speaker_name)

				# Augments the relevant variables according to the party the speaker belongs to
				if party == "Girondins":
					gir_num_speeches += 1
					gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
					try:
						Girondins = Girondins + indv_speech_bigram
					except NameError:
						Girondins = indv_speech_bigram
				else:
					mont_num_speeches += 1
					mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
					try:
						Montagnards = Montagnards + indv_speech_bigram
					except NameError:
						Montagnards = indv_speech_bigram
			
				### Maintains a Counter of all the bigrams and their counts for a given speaker
				# speech = speech + indv_speech_bigram

	### Stores the bigram Counter object for each individual speaker
		# pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		# with open(pickle_filename, 'wb') as handle:
		# 	pickle.dump(speech, handle, protocol = 0)

	# Store raw counts
	store_to_pickle(Girondins,"Girondins.pickle")
	store_to_pickle(Montagnards, "Montagnards.pickle")

	# Stores aggregate information about each bigram
	bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)


	### If the data has already been pickled to disk, the lines below can be used instead
	# bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
	# bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

	# gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
	# mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

	# Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
	# Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

	# bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

	num_speeches = 4479

	# Computes counts and tfidf scores for each party and outputs for further analysis in R
	counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)



	""" EVERYTHING BELOW IS STORING DATA TO MEMORY """
	
	# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
	df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient = "index")
	write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
	df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient = "index")
	write_to_excel(df_doc_freq, 'doc_freq.xlsx')
	
	# Stores objects as pickle files
	store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
	store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
	store_to_pickle(gir_docs, "gir_docs.pickle")
	store_to_pickle(mont_docs, "mont_docs.pickle")
	store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
	store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

	with open('gir_speeches.txt', 'w') as f:
		f.write('%d' % gir_num_speeches)
	with open('mont_speeches.txt', 'w') as f:
		f.write('%d' % mont_num_speeches)

	write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
	write_to_csv(speaker_char_count, "speaker_char_count.csv")

	with open('num_speeches.txt', 'w') as f:
		f.write('%d' % num_speeches)
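check_num_speakers() is not defined in these excerpts. Judging from how gir_docs and mont_docs are used later (the len(gir_docs[k]) restriction in counts_and_tfidf), it presumably records which speakers have used each bigram; a minimal sketch under that assumption:

def check_num_speakers(speech_ngrams, speaker, docs):
    # Record that this speaker used each bigram appearing in the speech.
    for ngram in speech_ngrams:
        if ngram in docs:
            docs[ngram].add(speaker)
        else:
            docs[ngram] = set([speaker])
    return docs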
    # txtfile = open("names_not_caught.txt", 'w')
    # for name in sorted(names_not_caught):
    # 	txtfile.write(name)
    # txtfile.close()

    # file = open('speakers_using_find.txt', 'w')
    # for item in sorted(speakers_using_find):
    # 	file.write(item)
    # file.close()

    with open('speakers.txt', 'w') as f:
        for item in sorted(speakers):
            f.write(item + "\n")

    store_to_pickle(speechid_to_speaker, "speechid_to_speaker.pickle")
    store_to_pickle(raw_speeches, "raw_speeches.pickle")
    store_to_pickle(multiple_speakers, "multiple_speakers.pickle")

    store_to_pickle(speaker_num_total_speeches,
                    "speaker_num_total_speeches.pickle")
    store_to_pickle(speaker_num_total_chars, "speaker_num_total_chars.pickle")
    store_to_pickle(speakers, "speakers.pickle")
    store_to_pickle(speeches_per_day, "speeches_per_session.pickle")
    store_to_pickle(speakers_per_session, "speakers_per_session.pickle")

    write_to_csv(speechid_to_speaker, "speechid_to_speaker.csv")
    write_to_csv(raw_speeches, "raw_speeches.csv")
    write_to_csv(speaker_num_total_speeches, "speaker_num_total_speeches.csv")
    write_to_csv(speaker_num_total_chars, "speaker_num_total_chars.csv")
    write_to_csv(speeches_per_day, "speeches_per_day.csv")
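write_to_csv() is likewise project-specific. A minimal sketch that matches how it is called on plain dictionaries in these examples (one key/value pair per row), in the spirit of the commented-out csv.writer code in calculate_chronology:

import csv

def write_to_csv(dictionary, filename):
    # Dump a dictionary as two-column CSV rows for inspection in a spreadsheet.
    with open(filename, "w") as handle:
        writer = csv.writer(handle)
        for key, value in dictionary.items():
            writer.writerow([key, value])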
Example #9
def checkErrors(enc_words, french_stopwords):
    files = os.listdir("AP_ARTFL_vols/")
    errors_per_vol = {}
    errors_per_page = {}
    word_freq_wrong = {}

    for filename in files:
        if filename.endswith(".xml"):
            filename = open('AP_ARTFL_vols/' + filename, "r")
            volno = re.findall(vol_regex, str(filename))[0]
            print(volno)
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')

            num_errors = 0
            num_words_vol = 0
            word_freq = {}

            # Iterate through contents and find all page tags
            pb_tags = []
            last_index = 0
            while True:
                loc = contents.find("<pb n=", last_index)
                if loc == -1:
                    break
                pb_tags.append(loc)
                last_index = loc + 1

            # Iterates through all the page tags and looks through the contents of each page,
            # checking every word against the words contained in the Encyclopédie
            for i in range(0, len(pb_tags) - 1):
                contents_substr = contents[pb_tags[i]:pb_tags[i + 1]]
                page_num = BeautifulSoup(contents_substr,
                                         'lxml').find_all('pb')
                pb_soup = BeautifulSoup(contents_substr, 'lxml')

                pageno = volno + "_pg" + page_num[0].get("n")
                error_per_page = 0
                num_words_pg = 0

                text = unicode(contents_substr, "ascii", errors="ignore")
                text = remove_diacritic(text).decode('utf-8')
                paragraph = remove_stopwords(text, french_stopwords)
                paragraph = paragraph.replace("\n", " ").replace(
                    ")", "").replace("*", "").replace(":", "").replace(
                        "-", "").replace("_", "").replace("(", "").replace(
                            "& ",
                            "").replace("; ", "").replace(".", "").replace(
                                ",", "").replace("?", "").replace("!", "")
                paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
                words = paragraph.split(" ")
                num_words_vol += len(words)
                num_words_pg += len(words)
                for word in words:
                    if word not in enc_words:
                        if word in word_freq:
                            word_freq[word] += 1
                        else:
                            word_freq[word] = 1
                        error_per_page += 1
                        num_errors += 1

                errors_per_page[pageno] = [error_per_page, num_words_pg]

            word_freq_wrong[volno] = sorted(word_freq.items(),
                                            key=lambda kv: kv[1])
            errors_per_vol[volno] = [num_errors, num_words_vol]

    # Save and output errors per volume
    store_to_pickle(errors_per_vol, "errors_per_vol.pickle")
    w = csv.writer(open("errors_per_vol.csv", "w"))
    for key, val in errors_per_vol.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key, val[0], val[1]])

    # Save and output errors per page
    store_to_pickle(errors_per_page, "errors_per_page.pickle")
    w = csv.writer(open("errors_per_page.csv", "w"))
    for key, val in errors_per_page.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]])

    # Save and output frequency of errors per word per volume
    store_to_pickle(word_freq_wrong, "word_freq_errors.pickle")
    w = csv.writer(open("word_freq_errors.csv", "w"))
    for key, val in word_freq_wrong.items():
        w.writerow([key, val])
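remove_diacritic appears throughout these examples and is not defined here. A common way to implement it, assuming it strips accents via Unicode normalization and returns a plain byte string (which would explain why call sites follow it with .decode('utf-8')):

import unicodedata

def remove_diacritic(text):
    # Decompose accented characters and drop the combining marks, returning ASCII bytes.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')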
Example #10
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
	# List of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Initialize various data structures for export to the classification script
	train_total_freq_unigram = {}
	test_total_freq_unigram = {}
	train_total_freq_bigram = {}
	test_total_freq_bigram = {}
	train_number_speeches = 0
	test_number_speeches = 0
	
	# Keeps track of which speeches contain the given bigram
	train_speeches_bigram = collections.defaultdict(dict)
	test_speeches_bigram = collections.defaultdict(dict)
	train_speeches_unigram = collections.defaultdict(dict)
	test_speeches_unigram = collections.defaultdict(dict)

	bigrams_to_speeches = collections.defaultdict()
	bigram_doc_freq = collections.defaultdict()
	unigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print(speaker_name)
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		# Variable to keep track of a given speaker's number of speeches
		speech_num = 0
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			# Only look at speeches within the date frame and that are from the speaker of interest
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Only looks at speeches with substance, i.e. at least 100 characters
				if len(raw_speeches[identity]) >= 100:
					indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
					indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
					# Splitting the data into training and test data with 1/4 of each speaker's data in the test set
					if speech_num%4 != 0:
						train_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(bigram_doc_freq, bigram)
							augment(train_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(unigram_doc_freq, unigram)
							augment(train_total_freq_unigram, unigram)
						train_speeches_bigram[identity] = indv_speech_bigram
						train_speeches_unigram[identity] = indv_speech_unigram
					else:
						test_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(test_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(test_total_freq_unigram, unigram)
						test_speeches_bigram[identity] = indv_speech_bigram
						test_speeches_unigram[identity] = indv_speech_unigram

					speech_num += 1
		
	# Write all relevant data objects and values to disk to use when running classification
	store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")
	
	# Release these large objects to conserve memory
	speakers_to_analyze = None
	speechid_to_speaker = None
	raw_speeches = None

	store_to_pickle(train_speeches_bigram, "train_speeches_bigram.pickle")
	store_to_pickle(train_speeches_unigram, "train_speeches_unigram.pickle")
	store_to_pickle(train_total_freq_bigram, "train_total_freq_bigram.pickle")
	store_to_pickle(train_total_freq_unigram, "train_total_freq_unigram.pickle")

	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")
	store_to_pickle(unigram_doc_freq, "unigram_doc_freq.pickle")
	store_to_pickle(train_number_speeches, "train_number_speeches.pickle")

	store_to_pickle(test_speeches_bigram, "test_speeches_bigram.pickle")
	store_to_pickle(test_speeches_unigram, "test_speeches_unigram.pickle")
	store_to_pickle(test_total_freq_bigram, "test_total_freq_bigram.pickle")
	store_to_pickle(test_total_freq_unigram, "test_total_freq_unigram.pickle")
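The speech_num % 4 test above places roughly one quarter of each speaker's substantive speeches in the test set and the rest in the training set. A toy illustration of that split:

speeches = ["s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"]
train = [s for i, s in enumerate(speeches) if i % 4 != 0]
test = [s for i, s in enumerate(speeches) if i % 4 == 0]
print(train)  # ['s1', 's2', 's3', 's5', 's6', 's7']
print(test)   # ['s0', 's4']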