# distance_analysis -- speaker-only variant that does not use a "plein" vector.
#
# Statements, in the order they appear on the line below:
#   1. process_excel() reads 'girondins_tfidf_allbigrams.xlsx' and
#      "montagnards_tfidf_allbigrams.xlsx"; each result is passed through
#      convert_keys_to_string() to give gir_dict and mont_dict.
#   2. compute_difference(gir_dict, mont_dict) produces gir_mont_diff.
#   3. A debug marker is printed with a Python 2 `print` statement.
#   4. by_speaker is unpickled from "byspeaker.pickle"; the month, date and
#      all-speakers pickle loads are commented out.
#   5. compute_distances(by_speaker, 'speaker', gir_dict, mont_dict,
#      gir_mont_diff) is written with write_to_excel() to
#      'by_speaker_noplein_distances_speaker_withsub.xlsx'.
#
# The month/date/period analyses sit inside a triple-quoted string and are
# therefore inactive. They pass plein_dict, which this variant never defines,
# so they cannot simply be re-enabled as written.
#
# NOTE(review): this definition has lost its line breaks, and the final
# triple-quoted string is opened but not closed on this line, so the end of
# the function is not visible here.
def distance_analysis(): gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx') mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx") gir_dict = convert_keys_to_string(gir_tfidf) mont_dict = convert_keys_to_string(mont_tfidf) gir_mont_diff = compute_difference(gir_dict, mont_dict) print "here2" #by_month = pickle.load(open("byyearmonth.pickle", "rb")) #by_date = pickle.load(open("byfulldate.pickle", "rb")) by_speaker = pickle.load(open("byspeaker.pickle", "rb")) #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb")) """by_month = create_tfidf_vectors(by_month) by_month_dist = compute_distances(by_month, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_month_dist, 'by_month_distances.xlsx') by_period = aggregate_by_period(by_date) by_date = create_tfidf_vectors(by_date) by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_period_dist, "by_period_distances.xlsx") by_date_dist = compute_distances(by_date, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_date_dist, 'by_date_distances.xlsx')""" #by_speaker = create_tfidf_vectors(by_speaker) by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict, gir_mont_diff) write_to_excel(by_speaker_dist, 'by_speaker_noplein_distances_speaker_withsub.xlsx') """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
# distance_analysis -- speaker-only variant that also passes a "plein" vector.
#
# Statements, in the order they appear on the line below:
#   1. process_excel() reads 'girondins_tfidf_allbigrams.xlsx',
#      "montagnards_tfidf_allbigrams.xlsx" and "plein_tfidf.xlsx"; each result
#      is passed through convert_keys_to_string() to give gir_dict, mont_dict
#      and plein_dict.
#   2. compute_difference(gir_dict, mont_dict) produces gir_mont_diff.
#   3. by_speaker is unpickled from "byspeaker.pickle"; the month, date and
#      all-speakers pickle loads are commented out.
#   4. compute_distances(by_speaker, 'speaker', gir_dict, mont_dict,
#      plein_dict, gir_mont_diff) is written with write_to_excel() to
#      'by_speaker_distances.xlsx'.
#
# The month/date/period analyses sit inside a triple-quoted string and are
# therefore inactive. The author's inline comment says the function must be
# changed before running anything other than the speaker analysis, to account
# for the introduction of plein.
#
# NOTE(review): this definition has lost its line breaks, and the final
# triple-quoted string is opened but not closed on this line, so the end of
# the function is not visible here.
def distance_analysis(): #When doing anything but speakers need to change this so accounts for introduction of plein gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx') mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx") plein_tfidf = process_excel("plein_tfidf.xlsx") gir_dict = convert_keys_to_string(gir_tfidf) mont_dict = convert_keys_to_string(mont_tfidf) plein_dict = convert_keys_to_string(plein_tfidf) gir_mont_diff = compute_difference(gir_dict, mont_dict) #by_month = pickle.load(open("byyearmonth.pickle", "rb")) #by_date = pickle.load(open("byfulldate.pickle", "rb")) by_speaker = pickle.load(open("byspeaker.pickle", "rb")) #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb")) """by_month = create_tfidf_vectors(by_month) by_month_dist = compute_distances(by_month, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_month_dist, 'by_month_distances.xlsx') by_period = aggregate_by_period(by_date) by_date = create_tfidf_vectors(by_date) by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_period_dist, "by_period_distances.xlsx") by_date_dist = compute_distances(by_date, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_date_dist, 'by_date_distances.xlsx')""" #by_speaker = create_tfidf_vectors(by_speaker) by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict, plein_dict, gir_mont_diff) write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx') """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
    """Compute per-speaker distances and export them to a workbook.

    The function takes no arguments and returns nothing; its result is the
    file 'by_speaker_distances.xlsx'.

    Steps:
      1. Read 'girondins_tfidf_allbigrams.xlsx' and
         "montagnards_tfidf_allbigrams.xlsx" with ``process_excel`` and pass
         each result through ``convert_keys_to_string``.
      2. Build ``gir_mont_diff`` with ``compute_difference``.
      3. Read "By_Speaker_Convention.xlsx", vectorise it with
         ``create_tfidf_vectors``, run ``compute_distances`` in 'speaker'
         mode and write the outcome with ``write_to_excel``.

    Only the speaker analysis is active. The month, date and period analyses
    are preserved below as comments, together with the two workbook reads
    that only they need.
    """
    gir_dict = convert_keys_to_string(
        process_excel('girondins_tfidf_allbigrams.xlsx'))
    mont_dict = convert_keys_to_string(
        process_excel("montagnards_tfidf_allbigrams.xlsx"))
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    # --- Disabled analyses -------------------------------------------------
    # By_Month.xlsx and By_Date.xlsx used to be loaded unconditionally even
    # though nothing active consumed them; the reads now live with the code
    # that needs them. Uncomment a group as a whole to re-enable it.
    #
    # by_month = pd.read_excel("By_Month.xlsx")
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'month', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    #
    # by_date = pd.read_excel("By_Date.xlsx")
    # by_date = create_tfidf_vectors(by_date)
    # by_period = aggregate_by_period(by_date)
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    # by_date_dist = compute_distances(by_date, 'date', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')
    # -----------------------------------------------------------------------

    by_speaker = create_tfidf_vectors(
        pd.read_excel("By_Speaker_Convention.xlsx"))
    by_speaker_dist = compute_distances(
        by_speaker, 'speaker', gir_dict, mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
for bigram in dict1: if bigram not in dict2: diff_counter[bigram] = dict1[bigram] sum_of_squares = 0 for entry in diff_counter: sum_of_squares = sum_of_squares + math.pow(diff_counter[entry], 2) euclidean_distance = math.sqrt(sum_of_squares) return(euclidean_distance) #print(euclidean_distance) if __name__ == '__main__': import sys raw_speeches = pickle.load(open("raw_speeches.pickle", "rb")) speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb")) speaker_list = load_speakerlist('Copy of AP_Speaker_Authority_List_Edited_3.xlsx') """#frequency bigrams when only bigrams occuring more than 3 times accounted for gir_frequency = process_excel('girondins_frequency.xlsx') mont_frequency = process_excel("montagnards_frequency.xlsx")""" #frequency vectors when all possible bigrams accounted for gir_tfidf = process_excel('girondins_tfidf.xlsx') mont_tfidf = process_excel("montagnards_tfidf.xlsx") #doc_freq = process_excel("doc_freq.xlsx") doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb")) file = open('num_speeches.txt', 'r') num_speeches = int(file.read()) speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx") build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq)