def distance_analysis():
    """Compute distances between per-speaker bigram vectors and the party
    tf-idf vectors (Girondins vs. Montagnards) and write them to Excel.

    Reads the party tf-idf spreadsheets and the pickled by-speaker
    aggregation, then writes by_speaker_noplein_distances_speaker_withsub.xlsx.
    """
    # Party-level tf-idf vectors; keys normalized to strings so they match
    # the keys of the pickled per-speaker data.
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    # Difference vector between the two parties, used by compute_distances.
    gir_mont_diff = compute_difference(gir_dict, mont_dict)
    # FIX: removed stray debug statement `print "here2"`.

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))

    # Earlier aggregations, kept for reference:
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    # by_period = aggregate_by_period(by_date)
    # by_date = create_tfidf_vectors(by_date)
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    # by_date_dist = compute_distances(by_date, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')

    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_noplein_distances_speaker_withsub.xlsx')
    # FIX: the original ended with an *unterminated* triple-quoted fragment,
    # converted to a plain comment here:
    # by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
    """Variant of distance_analysis that also loads the Plein tf-idf vector
    (speakers belonging to neither party) and passes it to compute_distances.

    Writes by_speaker_distances.xlsx.
    """
    #When doing anything but speakers need to change this so accounts for introduction of plein
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")
    plein_tfidf = process_excel("plein_tfidf.xlsx")
    # Normalize keys to strings so they match the pickled per-speaker data.
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    plein_dict = convert_keys_to_string(plein_tfidf)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))

    # Earlier aggregations, kept for reference:
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    # by_period = aggregate_by_period(by_date)
    # by_date = create_tfidf_vectors(by_date)
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    # by_date_dist = compute_distances(by_date, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')

    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
    # FIX: the original ended with an *unterminated* triple-quoted fragment,
    # converted to a plain comment here:
    # by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
    """Variant of distance_analysis that calls compute_distances with only the
    by-speaker aggregation (a signature of compute_distances that presumably
    loads/derives the party vectors itself — TODO confirm against its def).

    Writes by_speaker_noplein_distances_withlimit_withsub.xlsx.
    """
    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))

    # Earlier aggregations, kept for reference:
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    # by_period = aggregate_by_period(by_date)
    # by_date = create_tfidf_vectors(by_date)
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    # by_date_dist = compute_distances(by_date, 'aggregation', gir_dict, mont_dict, plein_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')

    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker)
    write_to_excel(by_speaker_dist, 'by_speaker_noplein_distances_withlimit_withsub.xlsx')
    # FIX: the original ended with an *unterminated* triple-quoted fragment,
    # converted to a plain comment here:
    # by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
    """Compare the by-speaker aggregation (read from Excel) against the
    Girondins and Montagnards tf-idf vectors and write the distances out.

    Produces by_speaker_distances.xlsx; the by-month/by-date paths are
    retained as comments.
    """
    # Party-level tf-idf vectors, keyed by bigram string.
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)

    # Axis separating the two parties.
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    # Aggregated inputs (only the speaker-level one is analyzed below).
    by_month = pd.read_excel("By_Month.xlsx")
    by_date = pd.read_excel("By_Date.xlsx")
    by_speaker = pd.read_excel("By_Speaker_Convention.xlsx")

    # Earlier aggregations, kept for reference:
    # by_month = create_tfidf_vectors(by_month)
    # by_month_dist = compute_distances(by_month, 'month', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_month_dist, 'by_month_distances.xlsx')
    # by_date = create_tfidf_vectors(by_date)
    # by_period = aggregate_by_period(by_date)
    # by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_period_dist, "by_period_distances.xlsx")
    # by_date_dist = compute_distances(by_date, 'date', gir_dict, mont_dict, gir_mont_diff)
    # write_to_excel(by_date_dist, 'by_date_distances.xlsx')

    by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, Girondins, Montagnards):
    """Build per-bigram chronology tables for Convention-era speeches.

    For each speech in `raw_speeches` delivered between 1792-09-20 and
    1793-06-02 by a speaker listed in `speakers_to_analyze`, emits one row
    per bigram — (bigram, speaker, speech id or date, occurrence count,
    party) — then visualizes and persists the two resulting DataFrames.

    `speaker_list`, `Girondins` and `Montagnards` are kept in the signature
    for caller compatibility; the counters were only used by the
    commented-out CSV export below.
    """
    # FIX: removed unused locals speaker_ngrams, speaker_distances and
    # chronology (the last was referenced only by the commented-out export).
    speakers_to_consider = []
    # Strip diacritics so names match raw_speeches / speechid_to_speaker keys.
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        # Speech ids embed the date; restrict to the Convention period.
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # Store relevant information for each bigram
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([str(bigram), speaker_name, identity, indv_speech_bigram[bigram], party])
                row_entry_date.append([str(bigram), speaker_name, date, indv_speech_bigram[bigram], party])

    chronology_speechid = pd.DataFrame(row_entry_speechid, columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences", "Party"])
    chronology_date = pd.DataFrame(row_entry_date, columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    #     if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    #         w.writerow([key,val])

    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")
    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
def groupby_date(df, ngrams):
    """Sum per-speech bigram counters by full date and persist the result.

    Args:
        df: DataFrame with 'Speechid' and 'Full Date' columns (see firststep).
        ngrams: mapping of speech id -> bigram counter for that speech.

    Writes bydate.xlsx and bydate.pickle; returns nothing.
    """
    bydate_dict = {}
    for i, speechid in enumerate(df['Speechid']):
        date = df['Full Date'].iloc[i]
        dict_ngrams = ngrams[speechid]
        if date in bydate_dict:
            # BUG FIX: original read `byydate_dict[date]` (typo), which raised
            # NameError the first time a date repeated.
            bydate_dict[date] = bydate_dict[date] + dict_ngrams
        else:
            bydate_dict[date] = dict_ngrams
    bydate = pd.DataFrame.from_dict(bydate_dict, orient="index")
    write_to_excel(bydate, "bydate.xlsx")
    with open("bydate.pickle", "wb") as handle:
        pickle.dump(bydate, handle, protocol=0)
def groupby_speaker(df, ngrams):
    """Sum per-speech bigram counters by speaker and persist the result.

    Combines the counters in `ngrams` (speech id -> counter) for all speeches
    of each speaker in `df`, then writes byspeaker.xlsx and byspeaker.pickle.
    """
    combined = {}
    for idx, sid in enumerate(df['Speechid']):
        who = df['Speaker'].iloc[idx]
        counts = ngrams[sid]
        if who not in combined:
            combined[who] = counts
        else:
            combined[who] = combined[who] + counts
    byspeaker = pd.DataFrame.from_dict(combined, orient="index")
    write_to_excel(byspeaker, "byspeaker.xlsx")
    with open("byspeaker.pickle", "wb") as handle:
        pickle.dump(byspeaker, handle, protocol=0)
def groupby_yearmonth(df, ngrams):
    """Sum per-speech bigram counters by year-month and persist the result.

    Combines the counters in `ngrams` (speech id -> counter) for all speeches
    sharing a 'Year-Month' value in `df`, then writes byyearmonth.xlsx and
    byyearmonth.pickle.
    """
    combined = {}
    for idx, sid in enumerate(df['Speechid']):
        ym = df['Year-Month'].iloc[idx]
        counts = ngrams[sid]
        if ym not in combined:
            combined[ym] = counts
        else:
            combined[ym] = combined[ym] + counts
    byyearmonth = pd.DataFrame.from_dict(combined, orient="index")
    write_to_excel(byyearmonth, "byyearmonth.xlsx")
    with open("byyearmonth.pickle", "wb") as handle:
        pickle.dump(byyearmonth, handle, protocol=0)
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
    """Compute raw-count and tf-idf statistics for each party and write them out.

    Produces unrestricted tf-idf outputs (gir_tfidf/mont_tfidf pickles and
    spreadsheets, combined_tfidf.xlsx), then a second set restricted to
    bigrams occurring at least 10 times (combined_frequency_restricted.xlsx,
    combined_tfidf_restricted.xlsx).

    `gir_docs`/`mont_docs` (bigram -> speakers using it) are only consulted by
    the optional, commented-out restriction clauses.
    """
    # Computes the tfidf scores within each group
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
    store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

    # Stores the tf_idf vectors in Excel
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

    # Combines the tfidf vectors of both parties into one file
    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

    # Limits based on v, the number of times that bigram appears, and
    # (optionally) gir_docs/mont_docs, the number of speakers in each group
    # that use that bigram.
    Girondins_restricted = {k: v for k, v in Girondins.items() if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
    Montagnards_restricted = {k: v for k, v in Montagnards.items() if (v >= 10)}  #and (len(mont_docs[k]) > 1)}

    store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
    store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

    # BUG FIX: the "restricted" tf-idf was previously recomputed from the
    # UNRESTRICTED counters, so the *_restricted outputs ignored the cutoff.
    gir_tfidf = compute_tfidf(Girondins_restricted, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards_restricted, num_speeches, bigram_doc_freq)

    # Stores the restricted frequency vectors and tfidfs in combined documents
    df_combined = pd.DataFrame([Girondins_restricted, Montagnards_restricted])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    """Aggregate bigram statistics for Girondins and Montagnards speakers
    (the "noplein"/"withlimit" variant).

    For every speech in the Convention window (1792-09-20 to 1793-06-02) by a
    speaker in `speakers_to_analyze`, accumulates per-party bigram counts,
    document frequencies and bigram->speech / bigram->speaker maps, then
    writes counts, tf-idf vectors, summary spreadsheets, CSVs and pickles.

    `Plein` is accepted for interface compatibility with the with-plein
    variant but is not used here.
    """
    speaker_names = set()
    speaker_num_speeches = {}   # speaker -> number of speeches in the window
    speaker_char_count = {}     # speaker -> total characters spoken
    speakers_to_consider = []
    bigrams_to_speeches = {}    # bigram -> list of speech ids containing it
    bigrams_to_speakers = {}    # bigram -> set of speakers using it
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    # Strip diacritics so names match raw_speeches / speechid_to_speaker keys.
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches and characters per
                # speaker, to potentially establish a cutoff for analysis.
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)
                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the
                # speaker belongs to. NameError fires on the first speech when
                # the Counter argument starts out unset by the caller.
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram
        #speech = speech + indv_speech_bigram
        # Stores the bigram Counter object for each individual speaker:
        # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        # with open(pickle_filename, 'wb') as handle:
        #     pickle.dump(speech, handle, protocol = 0)

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    with open("bigrams_to_speakers.pickle", 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)
    # FIX: the original dumped bigrams_to_speeches.pickle twice; once suffices.
    with open("bigrams_to_speeches.pickle", 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)
    with open("gir_docs.pickle", 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)
    with open("mont_docs.pickle", 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # If data has already been stored to memory, the loads below can be used:
    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))
    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))
    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    # One summary row per bigram used at least 10 times by either party.
    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers, columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches, columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])
    bigram_info = pd.concat([bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq], axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    # Bigrams ranked by how many distinct speakers used them.
    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(), key=lambda x: len(x[1]), reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # BUG FIX: num_speeches was never assigned in the original (only a
    # commented-out constant `num_speeches = 4479`), raising NameError below.
    # Derive it from the per-party tallies, mirroring the with-plein variant.
    num_speeches = gir_num_speeches + mont_num_speeches

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    print(num_speeches)

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)
    # BUG FIX: the original pickled speaker_num_speeches into this file too.
    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])
    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in
    # further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # Constrains the analysis to bigrams with frequency >= 10 (optionally also
    # by number of distinct speakers — clause kept commented as in original).
    Girondins = {k: v for k, v in Girondins.items() if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")
    Montagnards = {k: v for k, v in Montagnards.items() if (v >= 10)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # Stores the Girondins and Montagnards frequency vectors in one document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
def firststep():
    """Compute per-speech bigrams and fan the data out to the group-by steps.

    Loads the pickled raw speeches and speaker map, computes bigram counters
    for every speech, annotates the speeches DataFrame with Year-Month,
    Full Date, Speaker and Speechid columns (speech ids start with
    YYYY-MM-DD), writes raw_data.xlsx, then runs the year-month, date and
    speaker aggregations.
    """
    year_month = []
    full_date = []
    speaker = []
    ngrams = {}   # speech id -> bigram counter
    byyearmonth = pd.DataFrame()
    bydate = pd.DataFrame()
    byspeaker = pd.DataFrame()

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    dataframe = pd.DataFrame.from_dict(raw_speeches, orient="index")
    dataframe.columns = ['Speeches']
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # FIX: original used `file = open(...)` — shadowing the builtin and never
    # closing the handle; a context manager does both correctly.
    with open('num_speeches.txt', 'r') as f:
        num_speeches = int(f.read())
    doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))

    for speechid in raw_speeches:
        speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
        ngrams[speechid] = speech_bigrams
        yearmonth = speechid[0:7]
        year_month.append(yearmonth)
        fulldate = speechid[0:10]
        full_date.append(fulldate)
        speaker.append(speechid_to_speaker[speechid])

    dataframe['Year-Month'] = pd.Series(year_month).values
    dataframe['Full Date'] = pd.Series(full_date).values
    dataframe['Speaker'] = pd.Series(speaker).values
    dataframe['Speechid'] = dataframe.index
    write_to_excel(dataframe, "raw_data.xlsx")

    # with open("ngrams.pickle", "wb") as handle:
    #     pickle.dump(ngrams, handle, protocol = 0)

    # Earlier dict-based groupings, kept for reference:
    # byyearmonth['YearMonth'] = pd.Series(year_month).values
    # byyearmonth['ngrams'] = pd.Series(ngrams).values
    # byyearmonth_dict = pd.Series(byyearmonth.ngrams.values, index = byyearmonth.YearMonth).to_dict()
    # bydate['FullDate'] = pd.Series(full_date).values
    # bydate['ngrams'] = pd.Series(ngrams).values
    # bydate_dict = pd.Series(bydate.ngrams.values, index = bydate.FullDate).to_dict()
    # byspeaker['Speaker'] = pd.Series(speaker).values
    # byspeaker['ngrams'] = pd.Series(ngrams).values
    # byspeaker_dict = pd.Series(byspeaker.ngrams.values, index = byspeaker.Speaker).to_dict()

    # Each grouping sums the per-speech frequency vectors (not tf-idf) and
    # writes its own Excel/pickle outputs.
    groupby_yearmonth(dataframe, ngrams)
    groupby_date(dataframe, ngrams)
    groupby_speaker(dataframe, ngrams)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein):
    """Aggregate bigram statistics including a "Plein" category (the
    with-plein variant).

    Every speech in the Convention window (1792-09-20 to 1793-06-02) is
    attributed to Girondins, Montagnards, or — for speakers absent from
    `speakers_to_analyze` — Plein, to better understand speakers that are not
    distinctly one party or the other. Accumulates per-party bigram counts
    and document frequencies, then writes counts, tf-idf vectors,
    spreadsheets, CSVs and pickles.
    """
    speaker_names = set()
    speaker_num_speeches = {}   # speaker -> number of speeches in the window
    speaker_char_count = {}     # speaker -> total characters spoken
    speakers_to_consider = []
    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    # Strip diacritics so names match raw_speeches / speechid_to_speaker keys.
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            speaker_name = speechid_to_speaker[identity]
            # Speakers outside the analysis set fall into the Plein category.
            party = ""
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"

            # Per-speaker speech and character tallies, to potentially
            # establish a cutoff for analysis purposes.
            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])

            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)
                # Maintains a list of speeches in which given bigrams are spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the
            # speaker belongs to.
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            else:
                # Plein: neither Girondins nor Montagnards.
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram, speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

    # Stores the bigram Counter object for each individual speaker:
    # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    # with open(pickle_filename, 'wb') as handle:
    #     pickle.dump(speech, handle, protocol = 0)

    # Stores the bigrams_to_speeches document in Excel:
    # df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
    # write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    # Total number of speeches attributed across all three categories.
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print(num_speeches)

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)
    # BUG FIX: the original pickled speaker_num_speeches into this file too.
    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])
    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in
    # further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)

    # Computes the tf_idf scores for all three categories.
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)

    # Stores the tf_idf vectors.
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Constrains the analysis to bigrams with frequency >= 3 (optionally also
    # by number of distinct speakers — clause kept commented as in original).
    Girondins = {k: v for k, v in Girondins.items() if (v >= 3)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")
    Montagnards = {k: v for k, v in Montagnards.items() if (v >= 3)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them:
    # normalized = normalize_dicts(Girondins, Montagnards)
    # compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in one document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
""" import pickle import pandas as pd from pandas import * from processing_functions import write_to_excel if __name__ == '__main__': import sys raw_speeches = pickle.load(open("raw_speeches.pickle", "rb")) speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb")) # Write just the raw speeches to Excel df = pd.DataFrame.from_dict(raw_speeches, orient = "index") filename = "raw_speeches.xlsx" write_to_excel(df, filename) """writer = pd.ExcelWriter(filename) df.to_excel(writer, 'Sheet1') writer.save()""" # Write just the speaker names to Excel df2 = pd.DataFrame.from_dict(speechid_to_speaker, orient = "index") filename2 = "speechid_to_speaker.xlsx" write_to_excel(df2, filename2) """writer2 = pd.ExcelWriter(filename2) df2.to_excel(writer2, 'Sheet1') writer2.save()""" # Concatenante the speeches with the speaker names to have all data in one Excel file joined = pd.concat([df,df2], axis = 1) filename3 = "speeches_and_speakers.xlsx"
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    """Aggregate bigram usage by party over the window 1792-09-20 .. 1793-06-02.

    For every speaker of interest, counts bigrams in each of their speeches
    inside the date window, accumulates those counts into the party-level
    Counters (Girondins / Montagnards), and tracks per-bigram speech ids,
    speaker sets, and document frequency (for tf-idf).

    Side effects: pickles the raw party counts and bookkeeping maps, writes
    Excel/CSV summaries, writes speech-count text files, and calls
    counts_and_tfidf for further analysis.

    NOTE(review): Python 2 module (print statement below); helpers such as
    remove_diacritic, augment, compute_ngrams, check_num_speakers,
    store_to_pickle, write_to_excel, write_to_csv and the date_regex global
    are defined elsewhere in the project.
    """
    speaker_num_speeches = {}   # speaker name -> number of speeches in the window
    speaker_char_count = {}     # speaker name -> total characters spoken
    # Dataframe to keep track of the speakers we care about
    speakers_to_consider = []
    # Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
    # and speechid_to_speaker
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    # Matches bigrams to the list of speakers and speeches that have that bigram
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    # Maintains the number of documents a given bigram is spoken in for use with tf-idf
    # NOTE(review): defaultdict() without a factory behaves like a plain dict;
    # presumably augment() handles the missing-key case -- confirm in processing_functions.
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()  # only used by the commented-out per-speaker accumulation below
        for identity in raw_speeches:
            # Speech ids embed their date; date_regex extracts it for the window test.
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of
                # characters spoken by each speaker, to potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(raw_speeches[identity])
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)
                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)
                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                    # NOTE(review): Girondins is a parameter, so the NameError fallback looks
                    # unreachable unless the caller passes it unbound -- confirm intent.
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram
        ### Maintains a Counter of all the bigrams and their counts for a given speaker
        # speech = speech + indv_speech_bigram
        ### Stores the bigram Counter object for each individual speaker
        # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        # with open(pickle_filename, 'wb') as handle:
        #     pickle.dump(speech, handle, protocol = 0)

    # Store raw counts
    store_to_pickle(Girondins,"Girondins.pickle")
    store_to_pickle(Montagnards, "Montagnards.pickle")

    # Store in memory aggregate information about each bigram
    bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)

    ### If data has already been stored to memory, the lines below can be used
    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))
    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))
    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

    # NOTE(review): hard-coded corpus size used for tf-idf -- confirm it matches the dataset.
    num_speeches = 4479

    # Computes counts and tfidf scores for each party and outputs for further analysis in R
    counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)

    """ EVERYTHING BELOW IS STORING DATA TO MEMORY """

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient = "index")
    write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient = "index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    # Stores files in memory
    store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
    store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
    store_to_pickle(gir_docs, "gir_docs.pickle")
    store_to_pickle(mont_docs, "mont_docs.pickle")
    store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
    store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
    store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

    with open('gir_speeches.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
    write_to_csv(speaker_char_count, "speaker_char_count.csv")
    with open('num_speeches.txt', 'w') as f:
        f.write('%d' % num_speeches)
if date.attrs:  # NOTE(review): tail of an extractDate(...) definition cut off before this excerpt
    relevant_dates.append(date)
# Return the 'value' attribute of the first dated tag, or the sentinel "error".
if (len(relevant_dates) > 0):
    return (relevant_dates[0]['value'])
else:
    return ("error")

# Script entry point: parse the volumes, then export the Levenshtein-distance
# speaker-matching tables for merging with the AN dataset.
if __name__ == '__main__':
    import sys

    raw_speeches = {}
    multiple_speakers = {}
    parseFiles(raw_speeches, multiple_speakers)

    # Stores data in files to then be merged with AN dataset
    # NOTE(review): speaker_dists / speaker_dists_split are module-level globals
    # populated during parseFiles -- defined outside this excerpt.
    speaker_distances = pd.DataFrame(speaker_dists, columns=[
        "Speaker Name", "Levenshtein Dists", "Volno", "Date", "Departments/Notes"
    ])
    write_to_excel(speaker_distances, "speaker_distances.xlsx")
    speaker_distances_split = pd.DataFrame(speaker_dists_split, columns=[
        "Speaker Name", "Full Name", "Distance", "Volno", "Date", "Department/Notes"
    ])
    write_to_excel(speaker_distances_split, "speaker_distances_split.xlsx")
# NOTE(review): the three statements below are the cut-off tail of a parsing
# routine that begins before this excerpt (ftnotes, speaker, speech_id, volno,
# footnotes, number_of_speeches are bound there).
ftnotes = ftnotes.replace("\n","").replace("\r","").replace("\t","").replace("  "," ")
footnotes.append([ftnotes, speaker, speech_id, volno])
number_of_speeches += 1

# Parses dates from file being analyzed
def extractDate(soup_file):
    """Return the 'value' attribute of the first <date> tag that carries
    attributes in the parsed file, or the string "error" if none exists."""
    dates = soup_file.find_all('date')
    relevant_dates = []
    for date in dates:
        # Tags without attributes carry no machine-readable date, so skip them.
        if date.attrs:
            relevant_dates.append(date)
    if (len(relevant_dates) > 0):
        return(relevant_dates[0]['value'])
    else:
        return("error")

# Script entry point: load the speaker authority list, parse all volumes,
# then export the collected footnotes to Excel.
if __name__ == '__main__':
    import sys

    speaker_list = load_speakerlist('Copy of AP_Speaker_Authority_List_Edited_3.xlsx')
    raw_speeches = {}
    multiple_speakers = {}
    parseFiles(raw_speeches, multiple_speakers)
    # footnotes is a module-level list populated during parseFiles; rebind it as a DataFrame for export.
    footnotes = pd.DataFrame(footnotes, columns = ["Footnote", "Speaker", "Speechid", "Volno"])
    write_to_excel(footnotes, "footnotes.xlsx")