Ejemplo n.º 1
0
def distance_analysis():

    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")

    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)
    print "here2"

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))
    """by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_period = aggregate_by_period(by_date)
	by_date = create_tfidf_vectors(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist,
                   'by_speaker_noplein_distances_speaker_withsub.xlsx')
    """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():

    #When doing anything but speakers need to change this so accounts for introduction of plein
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")
    plein_tfidf = process_excel("plein_tfidf.xlsx")

    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    plein_dict = convert_keys_to_string(plein_tfidf)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))
    """by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_period = aggregate_by_period(by_date)
	by_date = create_tfidf_vectors(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, plein_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
    """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
Ejemplo n.º 3
0
def distance_analysis():
    """Compute per-speaker distances and write them to an Excel workbook.

    Steps, in order:
      1. Load the two group workbooks with ``process_excel`` and pass each
         through ``convert_keys_to_string``.
      2. Compute ``compute_difference`` of the two resulting dictionaries.
      3. Read ``By_Speaker_Convention.xlsx`` with pandas and run it through
         ``create_tfidf_vectors``.
      4. Call ``compute_distances`` in ``'speaker'`` mode and write the
         result to ``by_speaker_distances.xlsx`` via ``write_to_excel``.

    Takes no arguments and returns nothing; all inputs and outputs are
    files resolved relative to the current working directory.
    """
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")

    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    # The by-month, by-date and by-period analyses are currently disabled.
    # Their input workbooks used to be read here even though nothing
    # consumed them; the reads now live with the disabled code so that no
    # unnecessary file I/O happens. To re-enable an analysis, uncomment
    # the corresponding lines:
    #
    # by_month = pd.read_excel("By_Month.xlsx")
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'month', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    #
    # by_date = pd.read_excel("By_Date.xlsx")
    # by_date = create_tfidf_vectors(by_date)
    # by_period = aggregate_by_period(by_date)
    #
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    #
    # by_date_dist = compute_distances(by_date, 'date', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')

    by_speaker = pd.read_excel("By_Speaker_Convention.xlsx")
    by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
Ejemplo n.º 4
0
	for bigram in dict1:
		if bigram not in dict2:
			diff_counter[bigram] = dict1[bigram]

	sum_of_squares = 0
	for entry in diff_counter:
		sum_of_squares = sum_of_squares + math.pow(diff_counter[entry], 2)
	euclidean_distance = math.sqrt(sum_of_squares)
	return(euclidean_distance)
	#print(euclidean_distance)


if __name__ == '__main__':
    import sys

    # Script entry point: load the precomputed inputs from the current
    # working directory and pass them to build_vectors.
    #
    # Every file is opened in a ``with`` block so its handle is closed as
    # soon as the data has been read (the handles used to be left open).
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # these pickles are presumably produced by this project - confirm they
    # never come from an untrusted source.
    with open("raw_speeches.pickle", "rb") as handle:
        raw_speeches = pickle.load(handle)
    with open("speechid_to_speaker.pickle", "rb") as handle:
        speechid_to_speaker = pickle.load(handle)
    speaker_list = load_speakerlist('Copy of AP_Speaker_Authority_List_Edited_3.xlsx')

    # Disabled alternative: frequency bigrams when only bigrams occurring
    # more than 3 times are accounted for.
    # gir_frequency = process_excel('girondins_frequency.xlsx')
    # mont_frequency = process_excel("montagnards_frequency.xlsx")

    # Frequency vectors when all possible bigrams are accounted for.
    gir_tfidf = process_excel('girondins_tfidf.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf.xlsx")

    # doc_freq = process_excel("doc_freq.xlsx")
    with open("bigram_doc_freq.pickle", "rb") as handle:
        doc_freq = pickle.load(handle)

    # The text file is read whole and parsed as a single integer.
    with open('num_speeches.txt', 'r') as handle:
        num_speeches = int(handle.read())
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")

    build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq)